diff options
Diffstat (limited to 'fs/block_dev.c')
| -rw-r--r-- | fs/block_dev.c | 484 |
1 files changed, 308 insertions, 176 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 88928701959..6d7274619bf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -16,7 +16,9 @@ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/blkpg.h> +#include <linux/magic.h> #include <linux/buffer_head.h> +#include <linux/swap.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> @@ -24,7 +26,8 @@ #include <linux/uio.h> #include <linux/namei.h> #include <linux/log2.h> -#include <linux/kmemleak.h> +#include <linux/cleancache.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" @@ -44,45 +47,67 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + bool wakeup_bdi = false; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); + spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; - if (inode->i_state & I_DIRTY) + if (inode->i_state & I_DIRTY) { + if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) + wakeup_bdi = true; list_move(&inode->i_wb_list, &dst->wb.b_dirty); - spin_unlock(&inode_lock); -} - -static sector_t max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); } - return retval; + spin_unlock(&inode->i_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(dst); } /* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) +void kill_bdev(struct block_device *bdev) { - if (bdev->bd_inode->i_mapping->nrpages == 0) + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; + invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); + truncate_inode_pages(mapping, 0); } +EXPORT_SYMBOL(kill_bdev); + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_invalidate_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { @@ -133,61 +158,22 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. - * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_blocks, NULL, NULL, 0); + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter, + offset, blkdev_get_block, + NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -341,59 +327,106 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, /* * private llseek: - * for a block special file file->f_path.dentry->d_inode->i_size is zero + * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ -static loff_t block_llseek(struct file *file, loff_t offset, int origin) +static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = file->f_mapping->host; - loff_t size; loff_t retval; mutex_lock(&bd_inode->i_mutex); - size = i_size_read(bd_inode); - - switch (origin) { - case 2: - offset += size; - break; - case 1: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset >= 0 && offset <= size) { - if (offset != file->f_pos) { - file->f_pos = offset; - } - retval = offset; - } + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); mutex_unlock(&bd_inode->i_mutex); return retval; } -int blkdev_fsync(struct file *filp, int datasync) +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); int error; + + error = filemap_write_and_wait_range(filp->f_mapping, start, end); + if (error) + return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); if (error == -EOPNOTSUPP) error = 0; - mutex_lock(&bd_inode->i_mutex); - return error; } EXPORT_SYMBOL(blkdev_fsync); +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked. It will be unlocked when the page + * has been read. If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_read_page(struct block_device *bdev, sector_t sector, + struct page *page) +{ + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); +} +EXPORT_SYMBOL_GPL(bdev_read_page); + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback. If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked. If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, + struct page *page, struct writeback_control *wbc) +{ + int result; + int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + set_page_writeback(page); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + if (result) + end_page_writeback(page); + else + unlock_page(page); + return result; +} +EXPORT_SYMBOL_GPL(bdev_write_page); + /* * pseudo-fs */ @@ -414,7 +447,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } @@ -451,9 +483,9 @@ static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; struct list_head *p; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ - end_writeback(inode); + clear_inode(inode); spin_lock(&bdev_lock); while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { __bd_forget(list_entry(p, struct inode, i_devices)); @@ -473,7 +505,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); } static struct file_system_type bd_type = { @@ -482,12 +514,12 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +static struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; - struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -499,12 +531,7 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); - /* - * This vfsmount structure is only used to obtain the - * blockdev_superblock, so tell kmemleak not to report it. - */ - kmemleak_not_leak(bd_mnt); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* @@ -545,6 +572,7 @@ struct block_device *bdget(dev_t dev) if (inode->i_state & I_NEW) { bdev->bd_contains = NULL; + bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; @@ -574,6 +602,7 @@ struct block_device *bdgrab(struct block_device *bdev) ihold(bdev->bd_inode); return bdev; } +EXPORT_SYMBOL(bdgrab); long nr_blockdev_pages(void) { @@ -627,6 +656,11 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } +int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + /* Call when you free inode */ void bd_forget(struct inode *inode) @@ -634,11 +668,9 @@ void bd_forget(struct inode *inode) struct block_device *bdev = NULL; spin_lock(&bdev_lock); - if (inode->i_bdev) { - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - __bd_forget(inode); - } + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); spin_unlock(&bdev_lock); if (bdev) @@ -651,7 +683,7 @@ void bd_forget(struct inode *inode) * @whole: whole block device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * - * Test whther @bdev can be claimed by @holder. + * Test whether @bdev can be claimed by @holder. * * CONTEXT: * spin_lock(&bdev_lock). @@ -760,7 +792,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!disk) return ERR_PTR(-ENXIO); - whole = bdget_disk(disk, 0); + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + module_put(disk->fops->owner); put_disk(disk); if (!whole) @@ -947,7 +991,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } @@ -1000,6 +1044,7 @@ int revalidate_disk(struct gendisk *disk) mutex_lock(&bdev->bd_mutex); check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); return ret; @@ -1038,7 +1083,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - bdev->bd_inode->i_size = size; + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, size); + mutex_unlock(&bdev->bd_inode->i_mutex); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1049,7 +1096,7 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* * bd_mutex locking: @@ -1061,6 +1108,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1086,10 +1134,13 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; + disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1099,6 +1150,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_part) goto out_clear; + ret = 0; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); if (ret == -ERESTARTSYS) { @@ -1108,24 +1160,38 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) */ disk_put_part(bdev->bd_part); bdev->bd_part = NULL; - module_put(disk->fops->owner); - put_disk(disk); bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); + put_disk(disk); + module_put(owner); goto restart; } - if (ret) - goto out_clear; } - if (!bdev->bd_openers) { + + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); if (bdi == NULL) bdi = &default_backing_dev_info; bdev_inode_switch_bdi(bdev->bd_inode, bdi); } - if (bdev->bd_invalidated) - rescan_partitions(disk, bdev); + + /* + * If the device is invalidated, rescan partition + * if open succeeded or failed with -ENOMEDIUM. + * The latter is necessary to prevent ghost + * partitions on a removed medium. + */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } + if (ret) + goto out_clear; } else { struct block_device *whole; whole = bdget_disk(disk, 0); @@ -1148,39 +1214,46 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } } else { - module_put(disk->fops->owner); - put_disk(disk); - disk = NULL; if (bdev->bd_contains == bdev) { - if (bdev->bd_disk->fops->open) { + ret = 0; + if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); - if (ret) - goto out_unlock_bdev; + /* the same as first opener case, read comment there */ + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); } - if (bdev->bd_invalidated) - rescan_partitions(bdev->bd_disk, bdev); + if (ret) + goto out_unlock_bdev; } + /* only one opener holds refs to the module and disk */ + put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); return 0; out_clear: disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); - out: - if (disk) - module_put(disk->fops->owner); + disk_unblock_events(disk); put_disk(disk); + module_put(owner); + out: bdput(bdev); return ret; @@ -1223,6 +1296,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) res = __blkdev_get(bdev, mode, 0); if (whole) { + struct gendisk *disk = whole->bd_disk; + /* finish claiming */ mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); @@ -1249,15 +1324,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) spin_unlock(&bdev_lock); /* - * Block event polling for write claims. Any write - * holder makes the write_holder state stick until all - * are released. This is good enough and tracking - * individual writeable reference is too fragile given - * the way @mode is used in blkdev_get/put(). + * Block event polling for write claims if requested. Any + * write holder makes the write_holder state stick until + * all are released. This is good enough and tracking + * individual writeable reference is too fragile given the + * way @mode is used in blkdev_get/put(). */ - if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; - disk_block_events(bdev->bd_disk); + disk_block_events(disk); } mutex_unlock(&bdev->bd_mutex); @@ -1375,9 +1451,8 @@ static int blkdev_open(struct inode * inode, struct file * filp) return blkdev_get(bdev, filp->f_mode, filp); } -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) { - int ret = 0; struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; @@ -1389,34 +1464,39 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) - ret = disk->fops->release(disk, mode); + disk->fops->release(disk, mode); } if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) __blkdev_put(victim, mode, 1); - return ret; } -int blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, fmode_t mode) { + mutex_lock(&bdev->bd_mutex); + if (mode & FMODE_EXCL) { bool bdev_free; @@ -1425,7 +1505,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ - mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); @@ -1443,27 +1522,30 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ - if (bdev_free) { - if (bdev->bd_write_holder) { - disk_unblock_events(bdev->bd_disk); - bdev->bd_write_holder = false; - } else - disk_check_events(bdev->bd_disk); + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; } + } - mutex_unlock(&bdev->bd_mutex); - } else - disk_check_events(bdev->bd_disk); + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); - return __blkdev_put(bdev, mode, 0); + mutex_unlock(&bdev->bd_mutex); + + __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - - return blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode); + return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) @@ -1490,25 +1572,39 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) * Does not take i_mutex for the write and thus is not for general purpose * use. */ -ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct blk_plug plug; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - if (ret > 0 || ret == -EIOCBQUEUED) { + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + if (ret > 0) { ssize_t err; - - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } + blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL_GPL(blkdev_aio_write); +EXPORT_SYMBOL_GPL(blkdev_write_iter); + +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; + + if (pos >= size) + return 0; + + size -= pos; + iov_iter_truncate(to, size); + return generic_file_read_iter(iocb, to); +} /* * Try to release a page associated with block device when the system @@ -1527,22 +1623,22 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .writepage = blkdev_writepage, - .sync_page = block_sync_page, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = generic_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .is_dirty_writeback = buffer_check_dirty_writeback, }; const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = blkdev_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, @@ -1550,7 +1646,7 @@ const struct file_operations def_blk_fops = { .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) @@ -1627,3 +1723,39 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) return res; } EXPORT_SYMBOL(__invalidate_device); + +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +{ + struct inode *inode, *old_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; + + func(I_BDEV(inode), arg); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); +} |
