From 035328c202d26a824b8632fd3b00635db5aee5a2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 Apr 2014 12:25:40 +1000 Subject: md/bitmap: don't abuse i_writecount for bitmap files. md bitmap code currently tries to use i_writecount to stop any other process from writing to out bitmap file. But that is really an abuse and has bit-rotted so locking is all wrong. So discard that - root should be allowed to shoot self in foot. Still use it in a much less intrusive way to stop the same file being used as bitmap on two different array, and apply other checks to ensure the file is at least vaguely usable for bitmap storage (is regular, is open for write. Support for ->bmap is already checked elsewhere). Reported-by: Al Viro Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 1 - drivers/md/md.c | 49 ++++++++++++++++--------------------------------- drivers/md/md.h | 1 - 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4195a01b153..9a8e66ae04f 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1988,7 +1988,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; mddev->bitmap_info.file = NULL; - restore_bitmap_write_access(f); fput(f); } } else { diff --git a/drivers/md/md.c b/drivers/md/md.c index 4ad5cc4e63e..3fa2fc0a5dd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5181,32 +5181,6 @@ static int restart_array(struct mddev *mddev) return 0; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - -void restore_bitmap_write_access(struct file *file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); -} - static void md_clean(struct mddev *mddev) { mddev->array_sectors = 0; @@ -5427,7 +5401,6 @@ static int do_md_stop(struct mddev * mddev, int mode, bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); fput(mddev->bitmap_info.file); mddev->bitmap_info.file = NULL; } @@ -5979,7 +5952,7 @@ abort_export: static int set_bitmap_file(struct mddev *mddev, int fd) { - int err; + int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce) @@ -5991,6 +5964,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { + struct inode *inode; if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ mddev->bitmap_info.file = fget(fd); @@ -6001,10 +5975,21 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_info.file); - if (err) { + inode = mddev->bitmap_info.file->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + printk(KERN_ERR "%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); + err = -EBUSY; + } + if (err) { fput(mddev->bitmap_info.file); mddev->bitmap_info.file = NULL; return err; @@ -6027,10 +6012,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) { - restore_bitmap_write_access(mddev->bitmap_info.file); + if (mddev->bitmap_info.file) fput(mddev->bitmap_info.file); - } mddev->bitmap_info.file = NULL; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 07bba96de26..a49d991f3fe 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -605,7 +605,6 @@ extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); -extern void restore_bitmap_write_access(struct file *file); extern void mddev_init(struct mddev *mddev); extern int md_run(struct mddev *mddev); -- cgit v1.2.3-18-g5258 From da1aab3dca9aa88ae34ca392470b8943159e25fe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 Apr 2014 12:25:43 +1000 Subject: md/raid1: r1buf_pool_alloc: free allocate pages when subsequent allocation fails. When performing a user-request check/repair (MD_RECOVERY_REQUEST is set) on a raid1, we allocate multiple bios each with their own set of pages. If the page allocations for one bio fails, we currently do *not* free the pages allocated for the previous bios, nor do we free the bio itself. This patch frees all the already-allocate pages, and makes sure that all the bios are freed as well. This bug can cause a memory leak which can ultimately OOM a machine. It was introduced in 3.10-rc1. Fixes: a07876064a0b73ab5ef1ebcf14b1cf0231c07858 Cc: Kent Overstreet Cc: stable@vger.kernel.org (3.10+) Reported-by: Russell King - ARM Linux Signed-off-by: NeilBrown --- drivers/md/raid1.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4a6ca1cb2e7..56e24c072b6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -97,6 +97,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) struct pool_info *pi = data; struct r1bio *r1_bio; struct bio *bio; + int need_pages; int i, j; r1_bio = r1bio_pool_alloc(gfp_flags, pi); @@ -119,15 +120,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * RESYNC_PAGES for each bio. */ if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) - j = pi->raid_disks; + need_pages = pi->raid_disks; else - j = 1; - while(j--) { + need_pages = 1; + for (j = 0; j < need_pages; j++) { bio = r1_bio->bios[j]; bio->bi_vcnt = RESYNC_PAGES; if (bio_alloc_pages(bio, gfp_flags)) - goto out_free_bio; + goto out_free_pages; } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { @@ -141,6 +142,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; +out_free_pages: + while (--j >= 0) { + struct bio_vec *bv; + + bio_for_each_segment_all(bv, r1_bio->bios[j], i) + __free_page(bv->bv_page); + } + out_free_bio: while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); -- cgit v1.2.3-18-g5258 From e2f23b606b94f28a8febd5aa715df697d80b018e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 Apr 2014 14:33:51 +1000 Subject: md: avoid oops on unload if some process is in poll or select. If md-mod is unloaded while some process is in poll() or select(), then that process maintains a pointer to md_event_waiters, and when the try to unlink from that list, they will oops. The procfs infrastructure ensures that ->poll won't be called after remove_proc_entry, but doesn't provide a wait_queue_head for us to use, and the waitqueue code doesn't provide a way to remove all listeners from a waitqueue. So we need to: 1/ make sure no further references to md_event_waiters are taken (by setting md_unloading) 2/ wake up all processes currently waiting, and 3/ wait until all those processes have disconnected from our wait_queue_head. Reported-by: "majianpeng" Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3fa2fc0a5dd..8fda38d23e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7165,11 +7165,14 @@ static int md_seq_open(struct inode *inode, struct file *file) return error; } +static int md_unloading; static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *seq = filp->private_data; int mask; + if (md_unloading) + return POLLIN|POLLRDNORM|POLLERR|POLLPRI;; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ @@ -8655,6 +8658,7 @@ static __exit void md_exit(void) { struct mddev *mddev; struct list_head *tmp; + int delay = 1; blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); @@ -8663,7 +8667,19 @@ static __exit void md_exit(void) unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } remove_proc_entry("mdstat", NULL); + for_each_mddev(mddev, tmp) { export_array(mddev); mddev->hold_active = 0; -- cgit v1.2.3-18-g5258 From 27c0f68f0745218cec70f19ba7560c8c5fc3f817 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 9 Apr 2014 11:25:47 +0800 Subject: raid5: make_request does less prepare wait In NUMA machine, prepare_to_wait/finish_wait in make_request exposes a lot of contention for sequential workload (or big request size workload). For such workload, each bio includes several stripes. So we can just do prepare_to_wait/finish_wait once for the whold bio instead of every stripe. This reduces the lock contention completely for such workload. Random workload might have the similar lock contention too, but I didn't see it yet, maybe because my stroage is still not fast enough. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 16f5c21963d..a904a2c80fc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4552,6 +4552,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; + DEFINE_WAIT(w); + bool do_prepare; if (unlikely(bi->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bi); @@ -4575,15 +4577,18 @@ static void make_request(struct mddev *mddev, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { - DEFINE_WAIT(w); int previous; int seq; + do_prepare = false; retry: seq = read_seqcount_begin(&conf->gen_lock); previous = 0; - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + if (do_prepare) + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); if (unlikely(conf->reshape_progress != MaxSector)) { /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be @@ -4604,6 +4609,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); schedule(); + do_prepare = true; goto retry; } } @@ -4640,6 +4646,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if (must_retry) { release_stripe(sh); schedule(); + do_prepare = true; goto retry; } } @@ -4663,8 +4670,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) prepare_to_wait(&conf->wait_for_overlap, &w, TASK_INTERRUPTIBLE); if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) + logical_sector < mddev->suspend_hi) { schedule(); + do_prepare = true; + } goto retry; } @@ -4677,9 +4686,9 @@ static void make_request(struct mddev *mddev, struct bio * bi) md_wakeup_thread(mddev->thread); release_stripe(sh); schedule(); + do_prepare = true; goto retry; } - finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((bi->bi_rw & REQ_SYNC) && @@ -4689,10 +4698,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); - finish_wait(&conf->wait_for_overlap, &w); break; } } + finish_wait(&conf->wait_for_overlap, &w); remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { -- cgit v1.2.3-18-g5258 From e240c1839d11152b0355442f8ac6d2d2d921be36 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 9 Apr 2014 11:27:42 +0800 Subject: raid5: get_active_stripe avoids device_lock For sequential workload (or request size big workload), get_active_stripe can find cached stripe. In this case, we always hold device_lock, which exposes a lot of lock contention for such workload. If stripe count isn't 0, we don't need hold the lock actually, since we just increase its count. And this is the hot code path for such workload. Unfortunately we must delete the BUG_ON. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a904a2c80fc..25247a85291 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -679,14 +679,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, init_stripe(sh, sector, previous); atomic_inc(&sh->count); } - } else { + } else if (!atomic_inc_not_zero(&sh->count)) { spin_lock(&conf->device_lock); - if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru) - && !test_bit(STRIPE_EXPANDING, &sh->state) - && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) - ); - } else { + if (!atomic_read(&sh->count)) { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); BUG_ON(list_empty(&sh->lru) && -- cgit v1.2.3-18-g5258