From ae55e1aaa7e2e57e538cb98cf617f511c5dc4f73 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Jan 2012 17:20:33 -0800 Subject: fs/direct-io.c: calculate fs_count correctly in get_more_blocks() In get_more_blocks(), we use dio_count to calcuate fs_count and do some tricky things to increase fs_count if dio_count isn't aligned. But actually it still has some corner cases that can't be coverd. See the following example: dio_write foo -s 1024 -w 4096 (direct write 4096 bytes at offset 1024). The same goes if the offset isn't aligned to fs_blocksize. In this case, the old calculation counts fs_count to be 1, but actually we will write into 2 different blocks (if fs_blocksize=4096). The old code just works, since it will call get_block twice (and may have to allocate and create extents twice for filesystems like ext4). So we'd better call get_block just once with the proper fs_count. Signed-off-by: Tao Ma Cc: "Theodore Ts'o" Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/direct-io.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab67ff6..389863f59cd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -580,9 +580,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +592,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; -- cgit v1.2.3-18-g5258 From 65dd2aa90aa17a26703c28652408192856aa0396 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:35 -0800 Subject: dio: optimize cache misses in the submission path Some investigation of a transaction processing workload showed that a major consumer of cycles in __blockdev_direct_IO is the cache miss while accessing the block size. This is because it has to walk the chain from block_dev to gendisk to queue. The block size is needed early on to check alignment and sizes. It's only done if the check for the inode block size fails. But the costly block device state is unconditionally fetched. - Reorganize the code to only fetch block dev state when actually needed. Then do a prefetch on the block dev early on in the direct IO path. This is worth it, because there is substantial code run before we actually touch the block dev now. - I also added some unlikelies to make it clear the compiler that block device fetch code is not normally executed. This gave a small, but measurable improvement on a large database benchmark (about 0.3%) [akpm@linux-foundation.org: coding-style fixes] [sfr@canb.auug.org.au: using prefetch requires including prefetch.h] Signed-off-by: Andi Kleen Cc: Jeff Moyer Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) (limited to 'fs/direct-io.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 389863f59cd..4a588dbd11b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * How many user pages to map in one call to get_user_pages(). This determines @@ -1087,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1097,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1110,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1126,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1313,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) -- cgit v1.2.3-18-g5258 From 37fbf4bfb826372c3ca6c09d8a015d1fe9f5e186 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 23 Feb 2012 23:40:05 +0000 Subject: Restore direct_io / truncate locking API With kernel 3.1, Christoph removed i_alloc_sem and replaced it with calls (namely inode_dio_wait() and inode_dio_done()) which are EXPORT_SYMBOL_GPL() thus they cannot be used by non-GPL file systems and further inode_dio_wait() was pushed from notify_change() into the file system ->setattr() method but no non-GPL file system can make this call. That means non-GPL file systems cannot exist any more unless they do not use any VFS functionality related to reading/writing as far as I can tell or at least as long as they want to implement direct i/o. Both Linus and Al (and others) have said on LKML that this breakage of the VFS API should not have happened and that the change was simply missed as it was not documented in the change logs of the patches that did those changes. This patch changes the two function exports in question to be EXPORT_SYMBOL() thus restoring the VFS API as it used to be - accessible for all modules. Christoph, who introduced the two functions and exported them GPL-only is CC-ed on this patch to give him the opportunity to object to the symbols being changed in this manner if he did indeed intend them to be GPL-only and does not want them to become available to all modules. Signed-off-by: Anton Altaparmakov CC: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/direct-io.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 4a588dbd11b..f4aadd15b61 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -173,7 +173,7 @@ void inode_dio_wait(struct inode *inode) if (atomic_read(&inode->i_dio_count)) __inode_dio_wait(inode); } -EXPORT_SYMBOL_GPL(inode_dio_wait); +EXPORT_SYMBOL(inode_dio_wait); /* * inode_dio_done - signal finish of a direct I/O requests @@ -187,7 +187,7 @@ void inode_dio_done(struct inode *inode) if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } -EXPORT_SYMBOL_GPL(inode_dio_done); +EXPORT_SYMBOL(inode_dio_done); /* * How many pages are in the queue? -- cgit v1.2.3-18-g5258