From 97a851ed71cd9cc2542955e92a001c6ea3d21d35 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 11:58:58 -0400 Subject: ext4: use io_end for multiple bios Change writeback path to create just one io_end structure for the extent to which we submit IO and share it among bios writing that extent. This prevents needless splitting and joining of unwritten extents when they cannot be submitted as a single bio. Bugs in ENOMEM handling found by Linux File System Verification project (linuxtesting.org) and fixed by Alexey Khoroshilov . CC: Alexey Khoroshilov Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 121 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 45 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 4acf1f78881..19599bded62 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -62,15 +62,28 @@ void ext4_ioend_shutdown(struct inode *inode) cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_release_io_end(ext4_io_end_t *io_end) { - BUG_ON(!io); - BUG_ON(!list_empty(&io->list)); - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + if (io_end->flag & EXT4_IO_END_DIRECT) + inode_dio_done(io_end->inode); + if (io_end->iocb) + aio_complete(io_end->iocb, io_end->result, 0); + kmem_cache_free(io_end_cachep, io_end); +} + +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if 
(atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); } /* check a range of space and convert unwritten extents to written. */ @@ -93,13 +106,8 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - if (io->iocb) - aio_complete(io->iocb, io->result, 0); + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); return ret; } @@ -130,7 +138,7 @@ static void dump_completed_IO(struct inode *inode) } /* Add the io_end to per-inode completed end_io list. */ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct workqueue_struct *wq; @@ -167,8 +175,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - io->flag &= ~EXT4_IO_END_UNWRITTEN; - ext4_free_io_end(io); } return ret; } @@ -200,10 +206,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->inode, + io_end->offset, io_end->size); + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return 
err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + /* * Print an buffer I/O error compatible with the fs/buffer.c. This * provides compatibility with dmesg scrapers that look for a specific @@ -286,12 +325,7 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; - } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -305,40 +339,37 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); + if (!io->io_end->size) + io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct writeback_control *wbc, struct buffer_head *bh) { ext4_io_end_t *io_end; @@ -349,18 +380,18 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; io_end = io->io_end; if (test_clear_buffer_uninit(bh)) ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; + io_end->size += bh->b_size; io->io_next_block++; - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) - goto submit_and_retry; return 0; } @@ -432,7 +463,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, wbc, bh); + ret = io_submit_add_bh(io, inode, bh); if (ret) { /* * We only get here on ENOMEM. Not much else -- cgit v1.2.3-18-g5258 From 4e7ea81db53465ddd753678bc4cebf95369d0984 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 13:17:40 -0400 Subject: ext4: restructure writeback path There are two issues with current writeback path in ext4. For one we don't necessarily map complete pages when blocksize < pagesize and thus needn't do any writeback in one iteration. We always map some blocks though so we will eventually finish mapping the page. Just if writeback races with other operations on the file, forward progress is not really guaranteed. The second problem is that current code structure makes it hard to associate all the bios to some range of pages with one io_end structure so that unwritten extents can be converted after all the bios are finished. This will be especially difficult later when io_end will be associated with reserved transaction handle. 
We restructure the writeback path to a relatively simple loop which first prepares extent of pages, then maps one or more extents so that no page is partially mapped, and once page is fully mapped it is submitted for IO. We keep all the mapping and IO submission information in mpage_da_data structure to somewhat reduce stack usage. Resulting code is somewhat shorter than the old one and hopefully also easier to read. Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 19599bded62..3e585462512 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -360,9 +360,6 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); - if (!io->io_end->size) - io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) - + bh_offset(bh); io->io_bio = bio; io->io_next_block = bh->b_blocknr; return 0; @@ -390,7 +387,6 @@ submit_and_retry: io_end = io->io_end; if (test_clear_buffer_uninit(bh)) ext4_set_io_unwritten_flag(inode, io_end); - io_end->size += bh->b_size; io->io_next_block++; return 0; } -- cgit v1.2.3-18-g5258 From 3613d22807a2616e9346800bacd88aa8bbbefcd7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 13:19:34 -0400 Subject: ext4: remove buffer_uninit handling There isn't any need for setting BH_Uninit on buffers anymore. It was only used to signal we need to mark io_end as needing extent conversion in add_bh_to_extent() but now we can mark the io_end directly when mapping extent. 
Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3e585462512..de6860c7836 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -369,7 +369,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, struct buffer_head *bh) { - ext4_io_end_t *io_end; int ret; if (io->io_bio && bh->b_blocknr != io->io_next_block) { @@ -384,9 +383,6 @@ submit_and_retry: ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - io_end = io->io_end; - if (test_clear_buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); io->io_next_block++; return 0; } -- cgit v1.2.3-18-g5258 From 6b523df4fb5ae281ddbc817f40504b33e6226554 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 13:21:11 -0400 Subject: ext4: use transaction reservation for extent conversion in ext4_end_io Later we would like to clear PageWriteback bit only after extent conversion from unwritten to written extents is performed. However it is not possible to start a transaction after PageWriteback is set because that violates lock ordering (and is easy to deadlock). So we have to reserve a transaction before locking pages and sending them for IO and later we use the transaction for extent conversion from ext4_end_io(). 
Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index de6860c7836..5f20bc48104 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -66,6 +66,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) { BUG_ON(!list_empty(&io_end->list)); BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) wake_up_all(ext4_ioend_wq(io_end->inode)); @@ -92,13 +93,15 @@ static int ext4_end_io(ext4_io_end_t *io) struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + handle_t *handle = io->handle; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - ret = ext4_convert_unwritten_extents(inode, offset, size); + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); if (ret < 0) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " @@ -228,8 +231,10 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->inode, - io_end->offset, io_end->size); + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } ext4_release_io_end(io_end); -- cgit v1.2.3-18-g5258 From 2e8fa54e3b48e4ce8c4e9ca4674ffbc973f58be5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 14:21:02 -0400 Subject: ext4: split extent conversion lists to reserved & unreserved parts Now that we have extent conversions with reserved transaction, we have to prevent 
extent conversions without reserved transaction (from DIO code) to block these (as that would effectively void any transaction reservation we did). So split lists, work items, and work queues to reserved and unreserved parts. Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 65 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5f20bc48104..bcdfd6bdde0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -58,8 +58,10 @@ void ext4_ioend_shutdown(struct inode *inode) * We need to make sure the work structure is finished being * used before we let the inode get destroyed. */ - if (work_pending(&EXT4_I(inode)->i_unwritten_work)) - cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); + if (work_pending(&EXT4_I(inode)->i_rsv_conversion_work)) + cancel_work_sync(&EXT4_I(inode)->i_rsv_conversion_work); + if (work_pending(&EXT4_I(inode)->i_unrsv_conversion_work)) + cancel_work_sync(&EXT4_I(inode)->i_unrsv_conversion_work); } static void ext4_release_io_end(ext4_io_end_t *io_end) @@ -114,20 +116,17 @@ static int ext4_end_io(ext4_io_end_t *io) return ret; } -static void dump_completed_IO(struct inode *inode) +static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { - ext4_debug("inode %lu completed_io list is empty\n", - inode->i_ino); + if (list_empty(head)) return; - } - ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -148,16 +147,23 
@@ static void ext4_add_complete_io(ext4_io_end_t *io_end) unsigned long flags; BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&ei->i_completed_io_list)) - queue_work(wq, &ei->i_unwritten_work); - list_add_tail(&io_end->list, &ei->i_completed_io_list); + if (io_end->handle) { + wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + } else { + wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; + if (list_empty(&ei->i_unrsv_conversion_list)) + queue_work(wq, &ei->i_unrsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); + } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } -static int ext4_do_flush_completed_IO(struct inode *inode) +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) { ext4_io_end_t *io; struct list_head unwritten; @@ -166,8 +172,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode) int err, ret = 0; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - dump_completed_IO(inode); - list_replace_init(&ei->i_completed_io_list, &unwritten); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { @@ -183,21 +189,34 @@ static int ext4_do_flush_completed_IO(struct inode *inode) } /* - * work on completed aio dio IO, to convert unwritten extents to extents + * work on completed IO, to convert unwritten extents to extents */ -void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_rsv_work(struct work_struct *work) { struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, - i_unwritten_work); - ext4_do_flush_completed_IO(&ei->vfs_inode); + i_rsv_conversion_work); + 
ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); +} + +void ext4_end_io_unrsv_work(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_unrsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); } int ext4_flush_unwritten_io(struct inode *inode) { - int ret; + int ret, err; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode); + ret = ext4_do_flush_completed_IO(inode, + &EXT4_I(inode)->i_rsv_conversion_list); + err = ext4_do_flush_completed_IO(inode, + &EXT4_I(inode)->i_unrsv_conversion_list); + if (!ret) + ret = err; ext4_unwritten_wait(inode); return ret; } -- cgit v1.2.3-18-g5258 From b0857d309faefaf5443752458e8af1a4b22b3e92 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 14:23:41 -0400 Subject: ext4: defer clearing of PageWriteback after extent conversion Currently PageWriteback bit gets cleared from put_io_page() called from ext4_end_bio(). This is somewhat inconvenient as extent tree is not fully updated at that time (unwritten extents are not marked as written) so we cannot read the data back yet. This design was dictated by lock ordering as we cannot start a transaction while PageWriteback bit is set (we could easily deadlock with ext4_da_writepages()). But now that we use transaction reservation for extent conversion, locking issues are solved and we can move PageWriteback bit clearing after extent conversion is done. As a result we can remove wait for unwritten extent conversion from ext4_sync_file() because it already implicitly happens through wait_on_page_writeback(). We implement deferring of PageWriteback clearing by queueing completed bios to appropriate io_end and processing all the pages when io_end is going to be freed instead of at the moment ext4_io_end() is called.
Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 138 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 59 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index bcdfd6bdde0..755741c211a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -64,14 +64,83 @@ void ext4_ioend_shutdown(struct inode *inode) cancel_work_sync(&EXT4_I(inode)->i_unrsv_conversion_work); } +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... + */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio) +{ + int i; + int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + + for (i = 0; i < bio->bi_vcnt; i++) { + struct bio_vec *bvec = &bio->bi_io_vec[i]; + struct page *page = bvec->bv_page; + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + 
clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); + } +} + static void ext4_release_io_end(ext4_io_end_t *io_end) { + struct bio *bio, *next_bio; + BUG_ON(!list_empty(&io_end->list)); BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); WARN_ON(io_end->handle); if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) wake_up_all(ext4_ioend_wq(io_end->inode)); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); + } if (io_end->flag & EXT4_IO_END_DIRECT) inode_dio_done(io_end->inode); if (io_end->iocb) @@ -267,79 +336,31 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) return io_end; } -/* - * Print an buffer I/O error compatible with the fs/buffer.c. This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message. We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... 
- */ -static void buffer_io_error(struct buffer_head *bh) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); -} - static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - struct inode *inode; - int i; - int blocksize; sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); - inode = io_end->inode; - blocksize = 1 << inode->i_blkbits; - bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; - struct page *page = bvec->bv_page; - struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; - unsigned under_io = 0; - unsigned long flags; - if (!page) - continue; - - if (error) { - SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - } - bh = head = page_buffers(page); + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { /* - * We check all buffers in the page under BH_Uptodate_Lock - * to avoid races with other end io clearing async_write flags + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. 
*/ - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); - do { - if (bh_offset(bh) < bio_start || - bh_offset(bh) + blocksize > bio_end) { - if (buffer_async_write(bh)) - under_io++; - continue; - } - clear_buffer_async_write(bh); - if (error) - buffer_io_error(bh); - } while ((bh = bh->b_this_page) != head); - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); - local_irq_restore(flags); - if (!under_io) - end_page_writeback(page); + bio->bi_private = xchg(&io_end->bio, bio); + } else { + ext4_finish_bio(bio); + bio_put(bio); } - bio_put(bio); if (error) { - io_end->flag |= EXT4_IO_END_ERROR; + struct inode *inode = io_end->inode; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " "(offset %llu size %ld starting block %llu)", inode->i_ino, @@ -348,7 +369,6 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); } - ext4_put_io_end_defer(io_end); } -- cgit v1.2.3-18-g5258 From a115f749c14ee94e8b7bdbd203a5afdb1659156b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 14:30:00 -0400 Subject: ext4: remove wait for unwritten extent conversion from ext4_truncate() Since PageWriteback bit is now cleared after extents are converted from unwritten to written ones, we have full exclusion of writeback path from truncate (truncate_inode_pages() waits for PageWriteback bits to get cleared on all invalidated pages). Exclusion from DIO path is achieved by inode_dio_wait() call in ext4_setattr(). So there's no need to wait for extent conversion in ext4_truncate() anymore.
Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 755741c211a..0f65561ab5a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -158,7 +158,14 @@ static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) wake_up_all(ext4_ioend_wq(inode)); } -/* check a range of space and convert unwritten extents to written. */ +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */ static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; -- cgit v1.2.3-18-g5258 From c724585b62411f7abdea5b1054b9f1e1e7c964be Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 14:44:36 -0400 Subject: ext4: don't wait for extent conversion in ext4_punch_hole() We don't have to wait for extent conversion in ext4_punch_hole() as buffered IO for the punched range has been flushed and waited upon (thus all extent conversions for that range have completed). Also we wait for all DIO to finish using inode_dio_wait() so there cannot be any extent conversions pending due to direct IO. Also remove ext4_flush_unwritten_io() since it's unused now. 
Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0f65561ab5a..6ee5bd38940 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -281,22 +281,6 @@ void ext4_end_io_unrsv_work(struct work_struct *work) ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); } -int ext4_flush_unwritten_io(struct inode *inode) -{ - int ret, err; - - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && - !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode, - &EXT4_I(inode)->i_rsv_conversion_list); - err = ext4_do_flush_completed_IO(inode, - &EXT4_I(inode)->i_unrsv_conversion_list); - if (!ret) - ret = err; - ext4_unwritten_wait(inode); - return ret; -} - ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); -- cgit v1.2.3-18-g5258 From 5dc23bdd5f846ef868e82f789dfd9b13093f9ba6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Jun 2013 14:46:12 -0400 Subject: ext4: remove ext4_ioend_wait() Now that we clear PageWriteback after extent conversion, there's no need to wait for io_end processing in ext4_evict_inode(). Running AIO/DIO keeps file reference until aio_complete() is called so ext4_evict_inode() cannot be called. For io_end structures resulting from buffered IO waiting is happening because we wait for PageWriteback in truncate_inode_pages(). 
Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6ee5bd38940..ce8c15a7eab 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -45,25 +45,6 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_end_cachep); } -/* - * This function is called by ext4_evict_inode() to make sure there is - * no more pending I/O completion work left to do. - */ -void ext4_ioend_shutdown(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); - /* - * We need to make sure the work structure is finished being - * used before we let the inode get destroyed. - */ - if (work_pending(&EXT4_I(inode)->i_rsv_conversion_work)) - cancel_work_sync(&EXT4_I(inode)->i_rsv_conversion_work); - if (work_pending(&EXT4_I(inode)->i_unrsv_conversion_work)) - cancel_work_sync(&EXT4_I(inode)->i_unrsv_conversion_work); -} - /* * Print an buffer I/O error compatible with the fs/buffer.c. This * provides compatibility with dmesg scrapers that look for a specific -- cgit v1.2.3-18-g5258 From a1d8d9a757cd46e044a3f6061c315eda14bf697e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Jun 2013 10:18:22 -0400 Subject: ext4: add check to io_submit_init_bio The bio_alloc() function can return NULL if the memory allocation fails. So we need to check for this. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ce8c15a7eab..48786cdb5e6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -372,6 +372,8 @@ static int io_submit_init_bio(struct ext4_io_submit *io, struct bio *bio; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + if (!bio) + return -ENOMEM; bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; -- cgit v1.2.3-18-g5258 From 822dbba33458cd6ad0e715f3f4a57ebc99d54d1b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Jul 2013 21:31:04 -0400 Subject: ext4: fix warning in ext4_evict_inode() The following race can lead to ext4_evict_inode() seeing i_ioend_count > 0 and thus triggering a sanity check warning: CPU1 CPU2 ext4_end_bio() ext4_evict_inode() ext4_finish_bio() end_page_writeback(); truncate_inode_pages() evict page WARN_ON(i_ioend_count > 0); ext4_put_io_end_defer() ext4_release_io_end() dec i_ioend_count This is a possible use-after-free bug since we decrement i_ioend_count in a possibly released inode. Since i_ioend_count is used only for sanity checks one possible solution would be to just remove it but for now I'd like to keep those sanity checks to help debugging the new ext4 writeback code. This patch changes ext4_end_bio() to call ext4_put_io_end_defer() before ext4_finish_bio() in the shortcut case when unwritten extent conversion isn't needed. In that case we don't need the io_end so we are safe to drop it early.
Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 48786cdb5e6..d63cc5e9d3b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,6 +308,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) return io_end; } +/* BIO completion function for page writeback */ static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; @@ -318,18 +319,6 @@ static void ext4_end_bio(struct bio *bio, int error) if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - /* - * Link bio into list hanging from io_end. We have to do it - * atomically as bio completions can be racing against each - * other. - */ - bio->bi_private = xchg(&io_end->bio, bio); - } else { - ext4_finish_bio(bio); - bio_put(bio); - } - if (error) { struct inode *inode = io_end->inode; @@ -341,7 +330,24 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); } - ext4_put_io_end_defer(io_end); + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. 
+ */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); + } } void ext4_io_submit(struct ext4_io_submit *io) -- cgit v1.2.3-18-g5258 From e8974c3930ae9692bb4f77380961421e9a2f76ab Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Thu, 11 Jul 2013 22:42:42 -0400 Subject: ext4: rate limit printk in buffer_io_error() If there are a lot of outstanding buffered IOs when a device is taken offline (due to hardware errors etc.), ext4_end_bio prints out a message for each failed logical block. While this is desirable, we see thousands of such lines being printed out before the serial console gets overwhelmed, causing ext4_end_bio() to wait for the printk to complete. This in itself isn't a disaster, except for the detail that this function is being called with the queue lock held. This causes any other function in the block layer to spin on its spin_lock_irqsave while the serial console is draining. If NMI watchdog is enabled on this machine then it eventually comes along and shoots the machine in the head. The end result is that losing any one disk causes the machine to go down. This patch rate limits the printk to bandaid around the problem.
Tested: xfstests Change-Id: I8ab5690dcf4f3a67e78be147d45e489fdf4a88d8 Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4/page-io.c') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index d63cc5e9d3b..6625d210fb4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -55,7 +56,7 @@ void ext4_exit_pageio(void) static void buffer_io_error(struct buffer_head *bh) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } -- cgit v1.2.3-18-g5258