aboutsummaryrefslogtreecommitdiff
path: root/fs/block_dev.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r--fs/block_dev.c484
1 files changed, 308 insertions, 176 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88928701959..6d7274619bf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -16,7 +16,9 @@
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
+#include <linux/magic.h>
#include <linux/buffer_head.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
@@ -24,7 +26,8 @@
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
-#include <linux/kmemleak.h>
+#include <linux/cleancache.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -44,45 +47,67 @@ inline struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
-
EXPORT_SYMBOL(I_BDEV);
/*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- spin_lock(&inode_lock);
+ struct backing_dev_info *old = inode->i_data.backing_dev_info;
+ bool wakeup_bdi = false;
+
+ if (unlikely(dst == old)) /* deadlock avoidance */
+ return;
+ bdi_lock_two(&old->wb, &dst->wb);
+ spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
- if (inode->i_state & I_DIRTY)
+ if (inode->i_state & I_DIRTY) {
+ if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+ wakeup_bdi = true;
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
- spin_unlock(&inode_lock);
-}
-
-static sector_t max_block(struct block_device *bdev)
-{
- sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
-
- if (sz) {
- unsigned int size = block_size(bdev);
- unsigned int sizebits = blksize_bits(size);
- retval = (sz >> sizebits);
}
- return retval;
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&old->wb.list_lock);
+ spin_unlock(&dst->wb.list_lock);
+
+ if (wakeup_bdi)
+ bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
-static void kill_bdev(struct block_device *bdev)
+void kill_bdev(struct block_device *bdev)
{
- if (bdev->bd_inode->i_mapping->nrpages == 0)
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0 && mapping->nrshadows == 0)
return;
+
invalidate_bh_lrus();
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+ truncate_inode_pages(mapping, 0);
}
+EXPORT_SYMBOL(kill_bdev);
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0)
+ return;
+
+ invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ /* 99% of the time, we don't need to flush the cleancache on the bdev.
+ * But, for the strange corners, lets be cautious
+ */
+ cleancache_invalidate_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size)
{
@@ -133,61 +158,22 @@ static int
blkdev_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
- if (iblock >= max_block(I_BDEV(inode))) {
- if (create)
- return -EIO;
-
- /*
- * for reads, we're just trying to fill a partial page.
- * return a hole, they will have to call get_block again
- * before they can fill it, and they will get -EIO at that
- * time
- */
- return 0;
- }
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
set_buffer_mapped(bh);
return 0;
}
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- sector_t end_block = max_block(I_BDEV(inode));
- unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
- if ((iblock + max_blocks) > end_block) {
- max_blocks = end_block - iblock;
- if ((long)max_blocks <= 0) {
- if (create)
- return -EIO; /* write fully beyond EOF */
- /*
- * It is a read which is fully beyond EOF. We return
- * a !buffer_mapped buffer
- */
- max_blocks = 0;
- }
- }
-
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- bh->b_size = max_blocks << inode->i_blkbits;
- if (max_blocks)
- set_buffer_mapped(bh);
- return 0;
-}
-
static ssize_t
-blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+ return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
+ offset, blkdev_get_block,
+ NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -341,59 +327,106 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
/*
* private llseek:
- * for a block special file file->f_path.dentry->d_inode->i_size is zero
+ * for a block special file file_inode(file)->i_size is zero
* so we compute the size by hand (just as in block_read/write above)
*/
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *bd_inode = file->f_mapping->host;
- loff_t size;
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
- size = i_size_read(bd_inode);
-
- switch (origin) {
- case 2:
- offset += size;
- break;
- case 1:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset >= 0 && offset <= size) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- }
- retval = offset;
- }
+ retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
mutex_unlock(&bd_inode->i_mutex);
return retval;
}
-int blkdev_fsync(struct file *filp, int datasync)
+int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *bd_inode = filp->f_mapping->host;
struct block_device *bdev = I_BDEV(bd_inode);
int error;
+
+ error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+ if (error)
+ return error;
/*
* There is no need to serialise calls to blkdev_issue_flush with
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- mutex_unlock(&bd_inode->i_mutex);
-
error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
if (error == -EOPNOTSUPP)
error = 0;
- mutex_lock(&bd_inode->i_mutex);
-
return error;
}
EXPORT_SYMBOL(blkdev_fsync);
+/**
+ * bdev_read_page() - Start reading a page from a block device
+ * @bdev: The device to read the page from
+ * @sector: The offset on the device to read the page to (need not be aligned)
+ * @page: The page to read
+ *
+ * On entry, the page should be locked. It will be unlocked when the page
+ * has been read. If the block driver implements rw_page synchronously,
+ * that will be true on exit from this function, but it need not be.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to read this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_read_page(struct block_device *bdev, sector_t sector,
+ struct page *page)
+{
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+ if (!ops->rw_page)
+ return -EOPNOTSUPP;
+ return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+}
+EXPORT_SYMBOL_GPL(bdev_read_page);
+
+/**
+ * bdev_write_page() - Start writing a page to a block device
+ * @bdev: The device to write the page to
+ * @sector: The offset on the device to write the page to (need not be aligned)
+ * @page: The page to write
+ * @wbc: The writeback_control for the write
+ *
+ * On entry, the page should be locked and not currently under writeback.
+ * On exit, if the write started successfully, the page will be unlocked and
+ * under writeback. If the write failed already (eg the driver failed to
+ * queue the page to the device), the page will still be locked. If the
+ * caller is a ->writepage implementation, it will need to unlock the page.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to write this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_write_page(struct block_device *bdev, sector_t sector,
+ struct page *page, struct writeback_control *wbc)
+{
+ int result;
+ int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+ if (!ops->rw_page)
+ return -EOPNOTSUPP;
+ set_page_writeback(page);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+ if (result)
+ end_page_writeback(page);
+ else
+ unlock_page(page);
+ return result;
+}
+EXPORT_SYMBOL_GPL(bdev_write_page);
+
/*
* pseudo-fs
*/
@@ -414,7 +447,6 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(bdev_cachep, bdi);
}
@@ -451,9 +483,9 @@ static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
struct list_head *p;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
- end_writeback(inode);
+ clear_inode(inode);
spin_lock(&bdev_lock);
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
__bd_forget(list_entry(p, struct inode, i_devices));
@@ -473,7 +505,7 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
+ return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
}
static struct file_system_type bd_type = {
@@ -482,12 +514,12 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-struct super_block *blockdev_superblock __read_mostly;
+static struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
- struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -499,12 +531,7 @@ void __init bdev_cache_init(void)
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
- /*
- * This vfsmount structure is only used to obtain the
- * blockdev_superblock, so tell kmemleak not to report it.
- */
- kmemleak_not_leak(bd_mnt);
- blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
+ blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
/*
@@ -545,6 +572,7 @@ struct block_device *bdget(dev_t dev)
if (inode->i_state & I_NEW) {
bdev->bd_contains = NULL;
+ bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
@@ -574,6 +602,7 @@ struct block_device *bdgrab(struct block_device *bdev)
ihold(bdev->bd_inode);
return bdev;
}
+EXPORT_SYMBOL(bdgrab);
long nr_blockdev_pages(void)
{
@@ -627,6 +656,11 @@ static struct block_device *bd_acquire(struct inode *inode)
return bdev;
}
+int sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
+
/* Call when you free inode */
void bd_forget(struct inode *inode)
@@ -634,11 +668,9 @@ void bd_forget(struct inode *inode)
struct block_device *bdev = NULL;
spin_lock(&bdev_lock);
- if (inode->i_bdev) {
- if (!sb_is_blkdev_sb(inode->i_sb))
- bdev = inode->i_bdev;
- __bd_forget(inode);
- }
+ if (!sb_is_blkdev_sb(inode->i_sb))
+ bdev = inode->i_bdev;
+ __bd_forget(inode);
spin_unlock(&bdev_lock);
if (bdev)
@@ -651,7 +683,7 @@ void bd_forget(struct inode *inode)
* @whole: whole block device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
- * Test whther @bdev can be claimed by @holder.
+ * Test whether @bdev can be claimed by @holder.
*
* CONTEXT:
* spin_lock(&bdev_lock).
@@ -760,7 +792,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
if (!disk)
return ERR_PTR(-ENXIO);
- whole = bdget_disk(disk, 0);
+ /*
+ * Normally, @bdev should equal what's returned from bdget_disk()
+ * if partno is 0; however, some drivers (floppy) use multiple
+ * bdev's for the same physical device and @bdev may be one of the
+ * aliases. Keep @bdev if partno is 0. This means claimer
+ * tracking is broken for those devices but it has always been that
+ * way.
+ */
+ if (partno)
+ whole = bdget_disk(disk, 0);
+ else
+ whole = bdgrab(bdev);
+
module_put(disk->fops->owner);
put_disk(disk);
if (!whole)
@@ -947,7 +991,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
if (!bdev->bd_disk)
return;
- if (disk_partitionable(bdev->bd_disk))
+ if (disk_part_scan_enabled(bdev->bd_disk))
bdev->bd_invalidated = 1;
}
@@ -1000,6 +1044,7 @@ int revalidate_disk(struct gendisk *disk)
mutex_lock(&bdev->bd_mutex);
check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
@@ -1038,7 +1083,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_logical_block_size(bdev);
- bdev->bd_inode->i_size = size;
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, size);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
@@ -1049,7 +1096,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
}
EXPORT_SYMBOL(bd_set_size);
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
/*
* bd_mutex locking:
@@ -1061,6 +1108,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
+ struct module *owner;
int ret;
int partno;
int perm = 0;
@@ -1086,10 +1134,13 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk = get_gendisk(bdev->bd_dev, &partno);
if (!disk)
goto out;
+ owner = disk->fops->owner;
+ disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
struct backing_dev_info *bdi;
@@ -1099,6 +1150,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (!bdev->bd_part)
goto out_clear;
+ ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
if (ret == -ERESTARTSYS) {
@@ -1108,24 +1160,38 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
- module_put(disk->fops->owner);
- put_disk(disk);
bdev->bd_disk = NULL;
+ bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
+ disk_unblock_events(disk);
+ put_disk(disk);
+ module_put(owner);
goto restart;
}
- if (ret)
- goto out_clear;
}
- if (!bdev->bd_openers) {
+
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
}
- if (bdev->bd_invalidated)
- rescan_partitions(disk, bdev);
+
+ /*
+ * If the device is invalidated, rescan partition
+ * if open succeeded or failed with -ENOMEDIUM.
+ * The latter is necessary to prevent ghost
+ * partitions on a removed medium.
+ */
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(disk, bdev);
+ }
+ if (ret)
+ goto out_clear;
} else {
struct block_device *whole;
whole = bdget_disk(disk, 0);
@@ -1148,39 +1214,46 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
} else {
- module_put(disk->fops->owner);
- put_disk(disk);
- disk = NULL;
if (bdev->bd_contains == bdev) {
- if (bdev->bd_disk->fops->open) {
+ ret = 0;
+ if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
- if (ret)
- goto out_unlock_bdev;
+ /* the same as first opener case, read comment there */
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(bdev->bd_disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(bdev->bd_disk, bdev);
}
- if (bdev->bd_invalidated)
- rescan_partitions(bdev->bd_disk, bdev);
+ if (ret)
+ goto out_unlock_bdev;
}
+ /* only one opener holds refs to the module and disk */
+ put_disk(disk);
+ module_put(owner);
}
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
mutex_unlock(&bdev->bd_mutex);
+ disk_unblock_events(disk);
return 0;
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
+ bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
- out:
- if (disk)
- module_put(disk->fops->owner);
+ disk_unblock_events(disk);
put_disk(disk);
+ module_put(owner);
+ out:
bdput(bdev);
return ret;
@@ -1223,6 +1296,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
res = __blkdev_get(bdev, mode, 0);
if (whole) {
+ struct gendisk *disk = whole->bd_disk;
+
/* finish claiming */
mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
@@ -1249,15 +1324,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
spin_unlock(&bdev_lock);
/*
- * Block event polling for write claims. Any write
- * holder makes the write_holder state stick until all
- * are released. This is good enough and tracking
- * individual writeable reference is too fragile given
- * the way @mode is used in blkdev_get/put().
+ * Block event polling for write claims if requested. Any
+ * write holder makes the write_holder state stick until
+ * all are released. This is good enough and tracking
+ * individual writeable reference is too fragile given the
+ * way @mode is used in blkdev_get/put().
*/
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+ if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true;
- disk_block_events(bdev->bd_disk);
+ disk_block_events(disk);
}
mutex_unlock(&bdev->bd_mutex);
@@ -1375,9 +1451,8 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return blkdev_get(bdev, filp->f_mode, filp);
}
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
{
- int ret = 0;
struct gendisk *disk = bdev->bd_disk;
struct block_device *victim = NULL;
@@ -1389,34 +1464,39 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
+ /* ->release can cause the old bdi to disappear,
+ * so must switch it out first
+ */
+ bdev_inode_switch_bdi(bdev->bd_inode,
+ &default_backing_dev_info);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
- ret = disk->fops->release(disk, mode);
+ disk->fops->release(disk, mode);
}
if (!bdev->bd_openers) {
struct module *owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
- bdev_inode_switch_bdi(bdev->bd_inode,
- &default_backing_dev_info);
if (bdev != bdev->bd_contains)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
+
+ put_disk(disk);
+ module_put(owner);
}
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
if (victim)
__blkdev_put(victim, mode, 1);
- return ret;
}
-int blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, fmode_t mode)
{
+ mutex_lock(&bdev->bd_mutex);
+
if (mode & FMODE_EXCL) {
bool bdev_free;
@@ -1425,7 +1505,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
* are protected with bdev_lock. bd_mutex is to
* synchronize disk_holder unlinking.
*/
- mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
WARN_ON_ONCE(--bdev->bd_holders < 0);
@@ -1443,27 +1522,30 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
- if (bdev_free) {
- if (bdev->bd_write_holder) {
- disk_unblock_events(bdev->bd_disk);
- bdev->bd_write_holder = false;
- } else
- disk_check_events(bdev->bd_disk);
+ if (bdev_free && bdev->bd_write_holder) {
+ disk_unblock_events(bdev->bd_disk);
+ bdev->bd_write_holder = false;
}
+ }
- mutex_unlock(&bdev->bd_mutex);
- } else
- disk_check_events(bdev->bd_disk);
+ /*
+ * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+ * event. This is to ensure detection of media removal commanded
+ * from userland - e.g. eject(1).
+ */
+ disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
- return __blkdev_put(bdev, mode, 0);
+ mutex_unlock(&bdev->bd_mutex);
+
+ __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-
- return blkdev_put(bdev, filp->f_mode);
+ blkdev_put(bdev, filp->f_mode);
+ return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1490,25 +1572,39 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
* Does not take i_mutex for the write and thus is not for general purpose
* use.
*/
-ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
+ struct blk_plug plug;
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ blk_start_plug(&plug);
+ ret = __generic_file_write_iter(iocb, from);
+ if (ret > 0) {
ssize_t err;
-
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_aio_write);
+EXPORT_SYMBOL_GPL(blkdev_write_iter);
+
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *bd_inode = file->f_mapping->host;
+ loff_t size = i_size_read(bd_inode);
+ loff_t pos = iocb->ki_pos;
+
+ if (pos >= size)
+ return 0;
+
+ size -= pos;
+ iov_iter_truncate(to, size);
+ return generic_file_read_iter(iocb, to);
+}
/*
* Try to release a page associated with block device when the system
@@ -1527,22 +1623,22 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.writepage = blkdev_writepage,
- .sync_page = block_sync_page,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = generic_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
};
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = blkdev_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = blkdev_read_iter,
+ .write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
@@ -1550,7 +1646,7 @@ const struct file_operations def_blk_fops = {
.compat_ioctl = compat_blkdev_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
@@ -1627,3 +1723,39 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
return res;
}
EXPORT_SYMBOL(__invalidate_device);
+
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+ struct inode *inode, *old_inode = NULL;
+
+ spin_lock(&inode_sb_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+ struct address_space *mapping = inode->i_mapping;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+ mapping->nrpages == 0) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_sb_list_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+ * inode_sb_list_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it under
+ * inode_sb_list_lock. So we keep the reference and iput it
+ * later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ func(I_BDEV(inode), arg);
+
+ spin_lock(&inode_sb_list_lock);
+ }
+ spin_unlock(&inode_sb_list_lock);
+ iput(old_inode);
+}