diff options
Diffstat (limited to 'fs/block_dev.c')
| -rw-r--r-- | fs/block_dev.c | 1240 | 
1 files changed, 601 insertions, 639 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 06e8ff12b97..6d7274619bf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -11,13 +11,14 @@  #include <linux/slab.h>  #include <linux/kmod.h>  #include <linux/major.h> -#include <linux/smp_lock.h>  #include <linux/device_cgroup.h>  #include <linux/highmem.h>  #include <linux/blkdev.h>  #include <linux/module.h>  #include <linux/blkpg.h> +#include <linux/magic.h>  #include <linux/buffer_head.h> +#include <linux/swap.h>  #include <linux/pagevec.h>  #include <linux/writeback.h>  #include <linux/mpage.h> @@ -25,7 +26,8 @@  #include <linux/uio.h>  #include <linux/namei.h>  #include <linux/log2.h> -#include <linux/kmemleak.h> +#include <linux/cleancache.h> +#include <linux/aio.h>  #include <asm/uaccess.h>  #include "internal.h" @@ -45,45 +47,67 @@ inline struct block_device *I_BDEV(struct inode *inode)  {  	return &BDEV_I(inode)->bdev;  } -  EXPORT_SYMBOL(I_BDEV);  /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list.   */  static void bdev_inode_switch_bdi(struct inode *inode,  			struct backing_dev_info *dst)  { -	spin_lock(&inode_lock); +	struct backing_dev_info *old = inode->i_data.backing_dev_info; +	bool wakeup_bdi = false; + +	if (unlikely(dst == old))		/* deadlock avoidance */ +		return; +	bdi_lock_two(&old->wb, &dst->wb); +	spin_lock(&inode->i_lock);  	inode->i_data.backing_dev_info = dst; -	if (inode->i_state & I_DIRTY) +	if (inode->i_state & I_DIRTY) { +		if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) +			wakeup_bdi = true;  		list_move(&inode->i_wb_list, &dst->wb.b_dirty); -	spin_unlock(&inode_lock); -} - -static sector_t max_block(struct block_device *bdev) -{ -	sector_t retval = ~((sector_t)0); -	loff_t sz = i_size_read(bdev->bd_inode); - -	if (sz) { -		unsigned int size = block_size(bdev); -		unsigned int sizebits = blksize_bits(size); -		retval = (sz >> sizebits);  	} -	return retval; +	spin_unlock(&inode->i_lock); +	spin_unlock(&old->wb.list_lock); +	spin_unlock(&dst->wb.list_lock); + +	if (wakeup_bdi) +		bdi_wakeup_thread_delayed(dst);  }  /* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) +void kill_bdev(struct block_device *bdev)  { -	if (bdev->bd_inode->i_mapping->nrpages == 0) +	struct address_space *mapping = bdev->bd_inode->i_mapping; + +	if (mapping->nrpages == 0 && mapping->nrshadows == 0)  		return; +  	invalidate_bh_lrus(); -	truncate_inode_pages(bdev->bd_inode->i_mapping, 0); +	truncate_inode_pages(mapping, 0);  }	 +EXPORT_SYMBOL(kill_bdev); + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ +	struct address_space *mapping = bdev->bd_inode->i_mapping; + +	if (mapping->nrpages == 0) +		return; + +	invalidate_bh_lrus(); +	lru_add_drain_all();	/* make sure all lru add caches are flushed */ +	invalidate_mapping_pages(mapping, 0, -1); +	/* 99% of the time, we don't need to flush the cleancache on the bdev. +	 * But, for the strange corners, lets be cautious +	 */ +	cleancache_invalidate_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev);  int set_blocksize(struct block_device *bdev, int size)  { @@ -134,61 +158,22 @@ static int  blkdev_get_block(struct inode *inode, sector_t iblock,  		struct buffer_head *bh, int create)  { -	if (iblock >= max_block(I_BDEV(inode))) { -		if (create) -			return -EIO; - -		/* -		 * for reads, we're just trying to fill a partial page. -		 * return a hole, they will have to call get_block again -		 * before they can fill it, and they will get -EIO at that -		 * time -		 */ -		return 0; -	}  	bh->b_bdev = I_BDEV(inode);  	bh->b_blocknr = iblock;  	set_buffer_mapped(bh);  	return 0;  } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, -		struct buffer_head *bh, int create) -{ -	sector_t end_block = max_block(I_BDEV(inode)); -	unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - -	if ((iblock + max_blocks) > end_block) { -		max_blocks = end_block - iblock; -		if ((long)max_blocks <= 0) { -			if (create) -				return -EIO;	/* write fully beyond EOF */ -			/* -			 * It is a read which is fully beyond EOF.  We return -			 * a !buffer_mapped buffer -			 */ -			max_blocks = 0; -		} -	} - -	bh->b_bdev = I_BDEV(inode); -	bh->b_blocknr = iblock; -	bh->b_size = max_blocks << inode->i_blkbits; -	if (max_blocks) -		set_buffer_mapped(bh); -	return 0; -} -  static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, -			loff_t offset, unsigned long nr_segs) +blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, +			loff_t offset)  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host; -	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, -				    nr_segs, blkdev_get_blocks, NULL, NULL, 0); +	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter, +				    offset, blkdev_get_block, +				    NULL, NULL, 0);  }  int __sync_blockdev(struct block_device *bdev, int wait) @@ -342,59 +327,106 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,  /*   * private llseek: - * for a block special file file->f_path.dentry->d_inode->i_size is zero + * for a block special file file_inode(file)->i_size is zero   * so we compute the size by hand (just as in block_read/write above)   */ -static loff_t block_llseek(struct file *file, loff_t offset, int origin) +static loff_t block_llseek(struct file *file, loff_t offset, int whence)  {  	struct inode *bd_inode = file->f_mapping->host; -	loff_t size;  	loff_t retval;  	mutex_lock(&bd_inode->i_mutex); -	size = i_size_read(bd_inode); - -	switch (origin) { -		case 2: -			offset += size; -			break; -		case 1: -			offset += file->f_pos; -	} -	retval = -EINVAL; -	if (offset >= 0 && offset <= size) { -		if (offset != file->f_pos) { -			file->f_pos = offset; -		} -		retval = offset; -	} +	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));  	mutex_unlock(&bd_inode->i_mutex);  	return retval;  } -int blkdev_fsync(struct file *filp, int datasync) +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)  {  	struct inode *bd_inode = filp->f_mapping->host;  	struct block_device *bdev = I_BDEV(bd_inode);  	int error; +	 +	error = filemap_write_and_wait_range(filp->f_mapping, start, end); +	if (error) +		return error;  	/*  	 * There is no need to serialise calls to blkdev_issue_flush with  	 * i_mutex and doing so causes performance issues with concurrent  	 * O_SYNC writers to a block device.  	 */ -	mutex_unlock(&bd_inode->i_mutex); -  	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);  	if (error == -EOPNOTSUPP)  		error = 0; -	mutex_lock(&bd_inode->i_mutex); -  	return error;  }  EXPORT_SYMBOL(blkdev_fsync); +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked.  It will be unlocked when the page + * has been read.  If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_read_page(struct block_device *bdev, sector_t sector, +			struct page *page) +{ +	const struct block_device_operations *ops = bdev->bd_disk->fops; +	if (!ops->rw_page) +		return -EOPNOTSUPP; +	return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); +} +EXPORT_SYMBOL_GPL(bdev_read_page); + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback.  If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked.  If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, +			struct page *page, struct writeback_control *wbc) +{ +	int result; +	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; +	const struct block_device_operations *ops = bdev->bd_disk->fops; +	if (!ops->rw_page) +		return -EOPNOTSUPP; +	set_page_writeback(page); +	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); +	if (result) +		end_page_writeback(page); +	else +		unlock_page(page); +	return result; +} +EXPORT_SYMBOL_GPL(bdev_write_page); +  /*   * pseudo-fs   */ @@ -410,13 +442,19 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)  	return &ei->vfs_inode;  } -static void bdev_destroy_inode(struct inode *inode) +static void bdev_i_callback(struct rcu_head *head)  { +	struct inode *inode = container_of(head, struct inode, i_rcu);  	struct bdev_inode *bdi = BDEV_I(inode);  	kmem_cache_free(bdev_cachep, bdi);  } +static void bdev_destroy_inode(struct inode *inode) +{ +	call_rcu(&inode->i_rcu, bdev_i_callback); +} +  static void init_once(void *foo)  {  	struct bdev_inode *ei = (struct bdev_inode *) foo; @@ -427,7 +465,7 @@ static void init_once(void *foo)  	INIT_LIST_HEAD(&bdev->bd_inodes);  	INIT_LIST_HEAD(&bdev->bd_list);  #ifdef CONFIG_SYSFS -	INIT_LIST_HEAD(&bdev->bd_holder_list); +	INIT_LIST_HEAD(&bdev->bd_holder_disks);  #endif  	inode_init_once(&ei->vfs_inode);  	/* Initialize mutex for freeze. */ @@ -445,9 +483,9 @@ static void bdev_evict_inode(struct inode *inode)  {  	struct block_device *bdev = &BDEV_I(inode)->bdev;  	struct list_head *p; -	truncate_inode_pages(&inode->i_data, 0); +	truncate_inode_pages_final(&inode->i_data);  	invalidate_inode_buffers(inode); /* is it needed here? */ -	end_writeback(inode); +	clear_inode(inode);  	spin_lock(&bdev_lock);  	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {  		__bd_forget(list_entry(p, struct inode, i_devices)); @@ -467,7 +505,7 @@ static const struct super_operations bdev_sops = {  static struct dentry *bd_mount(struct file_system_type *fs_type,  	int flags, const char *dev_name, void *data)  { -	return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); +	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);  }  static struct file_system_type bd_type = { @@ -476,12 +514,12 @@ static struct file_system_type bd_type = {  	.kill_sb	= kill_anon_super,  }; -struct super_block *blockdev_superblock __read_mostly; +static struct super_block *blockdev_superblock __read_mostly;  void __init bdev_cache_init(void)  {  	int err; -	struct vfsmount *bd_mnt; +	static struct vfsmount *bd_mnt;  	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),  			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -493,12 +531,7 @@ void __init bdev_cache_init(void)  	bd_mnt = kern_mount(&bd_type);  	if (IS_ERR(bd_mnt))  		panic("Cannot create bdev pseudo-fs"); -	/* -	 * This vfsmount structure is only used to obtain the -	 * blockdev_superblock, so tell kmemleak not to report it. -	 */ -	kmemleak_not_leak(bd_mnt); -	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */ +	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */  }  /* @@ -539,6 +572,7 @@ struct block_device *bdget(dev_t dev)  	if (inode->i_state & I_NEW) {  		bdev->bd_contains = NULL; +		bdev->bd_super = NULL;  		bdev->bd_inode = inode;  		bdev->bd_block_size = (1 << inode->i_blkbits);  		bdev->bd_part_count = 0; @@ -568,6 +602,7 @@ struct block_device *bdgrab(struct block_device *bdev)  	ihold(bdev->bd_inode);  	return bdev;  } +EXPORT_SYMBOL(bdgrab);  long nr_blockdev_pages(void)  { @@ -621,6 +656,11 @@ static struct block_device *bd_acquire(struct inode *inode)  	return bdev;  } +int sb_is_blkdev_sb(struct super_block *sb) +{ +	return sb == blockdev_superblock; +} +  /* Call when you free inode */  void bd_forget(struct inode *inode) @@ -628,11 +668,9 @@ void bd_forget(struct inode *inode)  	struct block_device *bdev = NULL;  	spin_lock(&bdev_lock); -	if (inode->i_bdev) { -		if (!sb_is_blkdev_sb(inode->i_sb)) -			bdev = inode->i_bdev; -		__bd_forget(inode); -	} +	if (!sb_is_blkdev_sb(inode->i_sb)) +		bdev = inode->i_bdev; +	__bd_forget(inode);  	spin_unlock(&bdev_lock);  	if (bdev) @@ -645,7 +683,7 @@ void bd_forget(struct inode *inode)   * @whole: whole block device containing @bdev, may equal @bdev   * @holder: holder trying to claim @bdev   * - * Test whther @bdev can be claimed by @holder. + * Test whether @bdev can be claimed by @holder.   *   * CONTEXT:   * spin_lock(&bdev_lock). @@ -663,7 +701,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,  	else if (bdev->bd_contains == bdev)  		return true;  	 /* is a whole device which isn't held */ -	else if (whole->bd_holder == bd_claim) +	else if (whole->bd_holder == bd_may_claim)  		return true; 	 /* is a partition of a device that is being partitioned */  	else if (whole->bd_holder != NULL)  		return false;	 /* is a partition of a held device */ @@ -754,7 +792,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,  	if (!disk)  		return ERR_PTR(-ENXIO); -	whole = bdget_disk(disk, 0); +	/* +	 * Normally, @bdev should equal what's returned from bdget_disk() +	 * if partno is 0; however, some drivers (floppy) use multiple +	 * bdev's for the same physical device and @bdev may be one of the +	 * aliases.  Keep @bdev if partno is 0.  This means claimer +	 * tracking is broken for those devices but it has always been that +	 * way. +	 */ +	if (partno) +		whole = bdget_disk(disk, 0); +	else +		whole = bdgrab(bdev); +  	module_put(disk->fops->owner);  	put_disk(disk);  	if (!whole) @@ -775,452 +825,162 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,  	}  } -/* releases bdev_lock */ -static void __bd_abort_claiming(struct block_device *whole, void *holder) -{ -	BUG_ON(whole->bd_claiming != holder); -	whole->bd_claiming = NULL; -	wake_up_bit(&whole->bd_claiming, 0); - -	spin_unlock(&bdev_lock); -	bdput(whole); -} - -/** - * bd_abort_claiming - abort claiming a block device - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Abort a claiming block started by bd_start_claiming().  Note that - * @whole is not the block device to be claimed but the whole device - * returned by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_abort_claiming(struct block_device *whole, void *holder) -{ -	spin_lock(&bdev_lock); -	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */ -} - -/* increment holders when we have a legitimate claim. requires bdev_lock */ -static void __bd_claim(struct block_device *bdev, struct block_device *whole, -					void *holder) -{ -	/* note that for a whole device bd_holders -	 * will be incremented twice, and bd_holder will -	 * be set to bd_claim before being set to holder -	 */ -	whole->bd_holders++; -	whole->bd_holder = bd_claim; -	bdev->bd_holders++; -	bdev->bd_holder = holder; -} +#ifdef CONFIG_SYSFS +struct bd_holder_disk { +	struct list_head	list; +	struct gendisk		*disk; +	int			refcnt; +}; -/** - * bd_finish_claiming - finish claiming a block device - * @bdev: block device of interest (passed to bd_start_claiming()) - * @whole: whole block device returned by bd_start_claiming() - * @holder: holder trying to claim @bdev - * - * Finish a claiming block started by bd_start_claiming(). - * - * CONTEXT: - * Grabs and releases bdev_lock. - */ -static void bd_finish_claiming(struct block_device *bdev, -				struct block_device *whole, void *holder) +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, +						  struct gendisk *disk)  { -	spin_lock(&bdev_lock); -	BUG_ON(!bd_may_claim(bdev, whole, holder)); -	__bd_claim(bdev, whole, holder); -	__bd_abort_claiming(whole, holder); /* not actually an abort */ -} +	struct bd_holder_disk *holder; -/** - * bd_claim - claim a block device - * @bdev: block device to claim - * @holder: holder trying to claim @bdev - * - * Try to claim @bdev which must have been opened successfully. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 if successful, -EBUSY if @bdev is already claimed. - */ -int bd_claim(struct block_device *bdev, void *holder) -{ -	struct block_device *whole = bdev->bd_contains; -	int res; - -	might_sleep(); - -	spin_lock(&bdev_lock); -	res = bd_prepare_to_claim(bdev, whole, holder); -	if (res == 0) -		__bd_claim(bdev, whole, holder); -	spin_unlock(&bdev_lock); - -	return res; -} -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) -{ -	spin_lock(&bdev_lock); -	if (!--bdev->bd_contains->bd_holders) -		bdev->bd_contains->bd_holder = NULL; -	if (!--bdev->bd_holders) -		bdev->bd_holder = NULL; -	spin_unlock(&bdev_lock); +	list_for_each_entry(holder, &bdev->bd_holder_disks, list) +		if (holder->disk == disk) +			return holder; +	return NULL;  } -EXPORT_SYMBOL(bd_release); - -#ifdef CONFIG_SYSFS -/* - * Functions for bd_claim_by_kobject / bd_release_from_kobject - * - *     If a kobject is passed to bd_claim_by_kobject() - *     and the kobject has a parent directory, - *     following symlinks are created: - *        o from the kobject to the claimed bdev - *        o from "holders" directory of the bdev to the parent of the kobject - *     bd_release_from_kobject() removes these symlinks. - * - *     Example: - *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to - *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: - *           /sys/block/dm-0/slaves/sda --> /sys/block/sda - *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - */ -  static int add_symlink(struct kobject *from, struct kobject *to)  { -	if (!from || !to) -		return 0;  	return sysfs_create_link(from, to, kobject_name(to));  }  static void del_symlink(struct kobject *from, struct kobject *to)  { -	if (!from || !to) -		return;  	sysfs_remove_link(from, kobject_name(to));  } -/* - * 'struct bd_holder' contains pointers to kobjects symlinked by - * bd_claim_by_kobject. - * It's connected to bd_holder_list which is protected by bdev->bd_sem. - */ -struct bd_holder { -	struct list_head list;	/* chain of holders of the bdev */ -	int count;		/* references from the holder */ -	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */ -	struct kobject *hdev;	/* e.g. "/block/dm-0" */ -	struct kobject *hdir;	/* e.g. "/block/sda/holders" */ -	struct kobject *sdev;	/* e.g. "/block/sda" */ -}; - -/* - * Get references of related kobjects at once. - * Returns 1 on success. 0 on failure. - * - * Should call bd_holder_release_dirs() after successful use. - */ -static int bd_holder_grab_dirs(struct block_device *bdev, -			struct bd_holder *bo) -{ -	if (!bdev || !bo) -		return 0; - -	bo->sdir = kobject_get(bo->sdir); -	if (!bo->sdir) -		return 0; - -	bo->hdev = kobject_get(bo->sdir->parent); -	if (!bo->hdev) -		goto fail_put_sdir; - -	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); -	if (!bo->sdev) -		goto fail_put_hdev; - -	bo->hdir = kobject_get(bdev->bd_part->holder_dir); -	if (!bo->hdir) -		goto fail_put_sdev; - -	return 1; - -fail_put_sdev: -	kobject_put(bo->sdev); -fail_put_hdev: -	kobject_put(bo->hdev); -fail_put_sdir: -	kobject_put(bo->sdir); - -	return 0; -} - -/* Put references of related kobjects at once. */ -static void bd_holder_release_dirs(struct bd_holder *bo) -{ -	kobject_put(bo->hdir); -	kobject_put(bo->sdev); -	kobject_put(bo->hdev); -	kobject_put(bo->sdir); -} - -static struct bd_holder *alloc_bd_holder(struct kobject *kobj) -{ -	struct bd_holder *bo; - -	bo = kzalloc(sizeof(*bo), GFP_KERNEL); -	if (!bo) -		return NULL; - -	bo->count = 1; -	bo->sdir = kobj; - -	return bo; -} - -static void free_bd_holder(struct bd_holder *bo) -{ -	kfree(bo); -} -  /** - * find_bd_holder - find matching struct bd_holder from the block device + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk   * - * @bdev:	struct block device to be searched - * @bo:		target struct bd_holder + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.   * - * Returns matching entry with @bo in @bdev->bd_holder_list. - * If found, increment the reference count and return the pointer. - * If not found, returns NULL. - */ -static struct bd_holder *find_bd_holder(struct block_device *bdev, -					struct bd_holder *bo) -{ -	struct bd_holder *tmp; - -	list_for_each_entry(tmp, &bdev->bd_holder_list, list) -		if (tmp->sdir == bo->sdir) { -			tmp->count++; -			return tmp; -		} - -	return NULL; -} - -/** - * add_bd_holder - create sysfs symlinks for bd_claim() relationship + * This functions creates the following sysfs symlinks.   * - * @bdev:	block device to be bd_claimed - * @bo:		preallocated and initialized by alloc_bd_holder() + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk   * - * Add @bo to @bdev->bd_holder_list, create symlinks. + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then:   * - * Returns 0 if symlinks are created. - * Returns -ve if something fails. + *   /sys/block/dm-0/slaves/sda --> /sys/block/sda + *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure.   */ -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)  { -	int err; +	struct bd_holder_disk *holder; +	int ret = 0; -	if (!bo) -		return -EINVAL; +	mutex_lock(&bdev->bd_mutex); -	if (!bd_holder_grab_dirs(bdev, bo)) -		return -EBUSY; +	WARN_ON_ONCE(!bdev->bd_holder); -	err = add_symlink(bo->sdir, bo->sdev); -	if (err) -		return err; +	/* FIXME: remove the following once add_disk() handles errors */ +	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) +		goto out_unlock; -	err = add_symlink(bo->hdir, bo->hdev); -	if (err) { -		del_symlink(bo->sdir, bo->sdev); -		return err; +	holder = bd_find_holder_disk(bdev, disk); +	if (holder) { +		holder->refcnt++; +		goto out_unlock;  	} -	list_add_tail(&bo->list, &bdev->bd_holder_list); -	return 0; -} - -/** - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship - * - * @bdev:	block device to be bd_claimed - * @kobj:	holder's kobject - * - * If there is matching entry with @kobj in @bdev->bd_holder_list - * and no other bd_claim() from the same kobject, - * remove the struct bd_holder from the list, delete symlinks for it. - * - * Returns a pointer to the struct bd_holder when it's removed from the list - * and ready to be freed. - * Returns NULL if matching claim isn't found or there is other bd_claim() - * by the same kobject. - */ -static struct bd_holder *del_bd_holder(struct block_device *bdev, -					struct kobject *kobj) -{ -	struct bd_holder *bo; - -	list_for_each_entry(bo, &bdev->bd_holder_list, list) { -		if (bo->sdir == kobj) { -			bo->count--; -			BUG_ON(bo->count < 0); -			if (!bo->count) { -				list_del(&bo->list); -				del_symlink(bo->sdir, bo->sdev); -				del_symlink(bo->hdir, bo->hdev); -				bd_holder_release_dirs(bo); -				return bo; -			} -			break; -		} +	holder = kzalloc(sizeof(*holder), GFP_KERNEL); +	if (!holder) { +		ret = -ENOMEM; +		goto out_unlock;  	} -	return NULL; -} - -/** - * bd_claim_by_kobject - bd_claim() with additional kobject signature - * - * @bdev:	block device to be claimed - * @holder:	holder's signature - * @kobj:	holder's kobject - * - * Do bd_claim() and if it succeeds, create sysfs symlinks between - * the bdev and the holder's kobject. - * Use bd_release_from_kobject() when relesing the claimed bdev. - * - * Returns 0 on success. (same as bd_claim()) - * Returns errno on failure. - */ -static int bd_claim_by_kobject(struct block_device *bdev, void *holder, -				struct kobject *kobj) -{ -	int err; -	struct bd_holder *bo, *found; - -	if (!kobj) -		return -EINVAL; +	INIT_LIST_HEAD(&holder->list); +	holder->disk = disk; +	holder->refcnt = 1; -	bo = alloc_bd_holder(kobj); -	if (!bo) -		return -ENOMEM; - -	mutex_lock(&bdev->bd_mutex); +	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +	if (ret) +		goto out_free; -	err = bd_claim(bdev, holder); -	if (err) -		goto fail; +	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); +	if (ret) +		goto out_del; +	/* +	 * bdev could be deleted beneath us which would implicitly destroy +	 * the holder directory.  Hold on to it. +	 */ +	kobject_get(bdev->bd_part->holder_dir); -	found = find_bd_holder(bdev, bo); -	if (found) -		goto fail; +	list_add(&holder->list, &bdev->bd_holder_disks); +	goto out_unlock; -	err = add_bd_holder(bdev, bo); -	if (err) -		bd_release(bdev); -	else -		bo = NULL; -fail: +out_del: +	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +out_free: +	kfree(holder); +out_unlock:  	mutex_unlock(&bdev->bd_mutex); -	free_bd_holder(bo); -	return err; +	return ret;  } +EXPORT_SYMBOL_GPL(bd_link_disk_holder);  /** - * bd_release_from_kobject - bd_release() with additional kobject signature + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk   * - * @bdev:	block device to be released - * @kobj:	holder's kobject + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.   * - * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). + * CONTEXT: + * Might sleep.   */ -static void bd_release_from_kobject(struct block_device *bdev, -					struct kobject *kobj) +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)  { -	if (!kobj) -		return; +	struct bd_holder_disk *holder;  	mutex_lock(&bdev->bd_mutex); -	bd_release(bdev); -	free_bd_holder(del_bd_holder(bdev, kobj)); -	mutex_unlock(&bdev->bd_mutex); -} -/** - * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() - * - * @bdev:	block device to be claimed - * @holder:	holder's signature - * @disk:	holder's gendisk - * - * Call bd_claim_by_kobject() with getting @disk->slave_dir. - */ -int bd_claim_by_disk(struct block_device *bdev, void *holder, -			struct gendisk *disk) -{ -	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); -} -EXPORT_SYMBOL_GPL(bd_claim_by_disk); +	holder = bd_find_holder_disk(bdev, disk); -/** - * bd_release_from_disk - wrapper function for bd_release_from_kobject() - * - * @bdev:	block device to be claimed - * @disk:	holder's gendisk - * - * Call bd_release_from_kobject() and put @disk->slave_dir. - */ -void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) -{ -	bd_release_from_kobject(bdev, disk->slave_dir); -	kobject_put(disk->slave_dir); -} -EXPORT_SYMBOL_GPL(bd_release_from_disk); -#endif +	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { +		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); +		del_symlink(bdev->bd_part->holder_dir, +			    &disk_to_dev(disk)->kobj); +		kobject_put(bdev->bd_part->holder_dir); +		list_del_init(&holder->list); +		kfree(holder); +	} -/* - * Tries to open block device by device number.  Use it ONLY if you - * really do not have anything better - i.e. when you are behind a - * truly sucky interface and all you are given is a device number.  _Never_ - * to be used for internal purposes.  If you ever need it - reconsider - * your API. - */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode) -{ -	struct block_device *bdev = bdget(dev); -	int err = -ENOMEM; -	if (bdev) -		err = blkdev_get(bdev, mode); -	return err ? ERR_PTR(err) : bdev; +	mutex_unlock(&bdev->bd_mutex);  } - -EXPORT_SYMBOL(open_by_devnum); +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#endif  /**   * flush_disk - invalidates all buffer-cache entries on a disk   *   * @bdev:      struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes   *   * Invalidates all buffer-cache entries on a disk. It should be called   * when a disk has been changed -- either by a media change or online   * resize.   */ -static void flush_disk(struct block_device *bdev) +static void flush_disk(struct block_device *bdev, bool kill_dirty)  { -	if (__invalidate_device(bdev)) { +	if (__invalidate_device(bdev, kill_dirty)) {  		char name[BDEVNAME_SIZE] = "";  		if (bdev->bd_disk) @@ -1231,7 +991,7 @@ static void flush_disk(struct block_device *bdev)  	if (!bdev->bd_disk)  		return; -	if (disk_partitionable(bdev->bd_disk)) +	if (disk_part_scan_enabled(bdev->bd_disk))  		bdev->bd_invalidated = 1;  } @@ -1257,7 +1017,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)  		       "%s: detected capacity change from %lld to %lld\n",  		       name, bdev_size, disk_size);  		i_size_write(bdev->bd_inode, disk_size); -		flush_disk(bdev); +		flush_disk(bdev, false);  	}  }  EXPORT_SYMBOL(check_disk_size_change); @@ -1284,6 +1044,7 @@ int revalidate_disk(struct gendisk *disk)  	mutex_lock(&bdev->bd_mutex);  	check_disk_size_change(disk, bdev); +	bdev->bd_invalidated = 0;  	mutex_unlock(&bdev->bd_mutex);  	bdput(bdev);  	return ret; @@ -1303,13 +1064,14 @@ int check_disk_change(struct block_device *bdev)  {  	struct gendisk *disk = bdev->bd_disk;  	const struct block_device_operations *bdops = disk->fops; +	unsigned int events; -	if (!bdops->media_changed) -		return 0; -	if (!bdops->media_changed(bdev->bd_disk)) +	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | +				   DISK_EVENT_EJECT_REQUEST); +	if (!(events & DISK_EVENT_MEDIA_CHANGE))  		return 0; -	flush_disk(bdev); +	flush_disk(bdev, true);  	if (bdops->revalidate_disk)  		bdops->revalidate_disk(bdev->bd_disk);  	return 1; @@ -1321,7 +1083,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)  {  	unsigned bsize = bdev_logical_block_size(bdev); -	bdev->bd_inode->i_size = size; +	mutex_lock(&bdev->bd_inode->i_mutex); +	i_size_write(bdev->bd_inode, size); +	mutex_unlock(&bdev->bd_inode->i_mutex);  	while (bsize < PAGE_CACHE_SIZE) {  		if (size & bsize)  			break; @@ -1332,7 +1096,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)  }  EXPORT_SYMBOL(bd_set_size); -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);  /*   * bd_mutex locking: @@ -1344,6 +1108,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  {  	struct gendisk *disk; +	struct module *owner;  	int ret;  	int partno;  	int perm = 0; @@ -1369,10 +1134,13 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  	disk = get_gendisk(bdev->bd_dev, &partno);  	if (!disk)  		goto out; +	owner = disk->fops->owner; +	disk_block_events(disk);  	mutex_lock_nested(&bdev->bd_mutex, for_part);  	if (!bdev->bd_openers) {  		bdev->bd_disk = disk; +		bdev->bd_queue = disk->queue;  		bdev->bd_contains = bdev;  		if (!partno) {  			struct backing_dev_info *bdi; @@ -1382,6 +1150,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  			if (!bdev->bd_part)  				goto out_clear; +			ret = 0;  			if (disk->fops->open) {  				ret = disk->fops->open(bdev, mode);  				if (ret == -ERESTARTSYS) { @@ -1391,24 +1160,38 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  					 */  					disk_put_part(bdev->bd_part);  					bdev->bd_part = NULL; -					module_put(disk->fops->owner); -					put_disk(disk);  					bdev->bd_disk = NULL; +					bdev->bd_queue = NULL;  					mutex_unlock(&bdev->bd_mutex); +					disk_unblock_events(disk); +					put_disk(disk); +					module_put(owner);  					goto restart;  				} -				if (ret) -					goto out_clear;  			} -			if (!bdev->bd_openers) { + +			if (!ret) {  				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);  				bdi = blk_get_backing_dev_info(bdev);  				if (bdi == NULL)  					bdi = &default_backing_dev_info;  				bdev_inode_switch_bdi(bdev->bd_inode, bdi);  			} -			if (bdev->bd_invalidated) -				rescan_partitions(disk, bdev); + +			/* +			 * If the device is invalidated, rescan partition +			 * if open succeeded or failed with -ENOMEDIUM. +			 * The latter is necessary to prevent ghost +			 * partitions on a removed medium. +			 */ +			if (bdev->bd_invalidated) { +				if (!ret) +					rescan_partitions(disk, bdev); +				else if (ret == -ENOMEDIUM) +					invalidate_partitions(disk, bdev); +			} +			if (ret) +				goto out_clear;  		} else {  			struct block_device *whole;  			whole = bdget_disk(disk, 0); @@ -1431,55 +1214,218 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);  		}  	} else { -		module_put(disk->fops->owner); -		put_disk(disk); -		disk = NULL;  		if (bdev->bd_contains == bdev) { -			if (bdev->bd_disk->fops->open) { +			ret = 0; +			if (bdev->bd_disk->fops->open)  				ret = bdev->bd_disk->fops->open(bdev, mode); -				if (ret) -					goto out_unlock_bdev; +			/* the same as first opener case, read comment there */ +			if (bdev->bd_invalidated) { +				if (!ret) +					rescan_partitions(bdev->bd_disk, bdev); +				else if (ret == -ENOMEDIUM) +					invalidate_partitions(bdev->bd_disk, bdev);  			} -			if (bdev->bd_invalidated) -				rescan_partitions(bdev->bd_disk, bdev); +			if (ret) +				goto out_unlock_bdev;  		} +		/* only one opener holds refs to the module and disk */ +		put_disk(disk); +		module_put(owner);  	}  	bdev->bd_openers++;  	if (for_part)  		bdev->bd_part_count++;  	mutex_unlock(&bdev->bd_mutex); +	disk_unblock_events(disk);  	return 0;   out_clear:  	disk_put_part(bdev->bd_part);  	bdev->bd_disk = NULL;  	bdev->bd_part = NULL; +	bdev->bd_queue = NULL;  	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);  	if (bdev != bdev->bd_contains)  		__blkdev_put(bdev->bd_contains, mode, 1);  	bdev->bd_contains = NULL;   out_unlock_bdev:  	mutex_unlock(&bdev->bd_mutex); - out: -	if (disk) -		module_put(disk->fops->owner); +	disk_unblock_events(disk);  	put_disk(disk); +	module_put(owner); + out:  	bdput(bdev);  	return ret;  } -int blkdev_get(struct block_device *bdev, fmode_t mode) +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access.  Specifying %FMODE_EXCL with %NULL + * @holder is invalid.  Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged.  On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)  { -	return __blkdev_get(bdev, mode, 0); +	struct block_device *whole = NULL; +	int res; + +	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + +	if ((mode & FMODE_EXCL) && holder) { +		whole = bd_start_claiming(bdev, holder); +		if (IS_ERR(whole)) { +			bdput(bdev); +			return PTR_ERR(whole); +		} +	} + +	res = __blkdev_get(bdev, mode, 0); + +	if (whole) { +		struct gendisk *disk = whole->bd_disk; + +		/* finish claiming */ +		mutex_lock(&bdev->bd_mutex); +		spin_lock(&bdev_lock); + +		if (!res) { +			BUG_ON(!bd_may_claim(bdev, whole, holder)); +			/* +			 * Note that for a whole device bd_holders +			 * will be incremented twice, and bd_holder +			 * will be set to bd_may_claim before being +			 * set to holder +			 */ +			whole->bd_holders++; +			whole->bd_holder = bd_may_claim; +			bdev->bd_holders++; +			bdev->bd_holder = holder; +		} + +		/* tell others that we're done */ +		BUG_ON(whole->bd_claiming != holder); +		whole->bd_claiming = NULL; +		wake_up_bit(&whole->bd_claiming, 0); + +		spin_unlock(&bdev_lock); + +		/* +		 * Block event polling for write claims if requested.  Any +		 * write holder makes the write_holder state stick until +		 * all are released.  This is good enough and tracking +		 * individual writeable reference is too fragile given the +		 * way @mode is used in blkdev_get/put(). +		 */ +		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder && +		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { +			bdev->bd_write_holder = true; +			disk_block_events(disk); +		} + +		mutex_unlock(&bdev->bd_mutex); +		bdput(whole); +	} + +	return res;  }  EXPORT_SYMBOL(blkdev_get); +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path.  @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, +					void *holder) +{ +	struct block_device *bdev; +	int err; + +	bdev = lookup_bdev(path); +	if (IS_ERR(bdev)) +		return bdev; + +	err = blkdev_get(bdev, mode, holder); +	if (err) +		return ERR_PTR(err); + +	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { +		blkdev_put(bdev, mode); +		return ERR_PTR(-EACCES); +	} + +	return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev.  @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number.  _Never_ to be used for internal purposes.  If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ +	struct block_device *bdev; +	int err; + +	bdev = bdget(dev); +	if (!bdev) +		return ERR_PTR(-ENOMEM); + +	err = blkdev_get(bdev, mode, holder); +	if (err) +		return ERR_PTR(err); + +	return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); +  static int blkdev_open(struct inode * inode, struct file * filp)  { -	struct block_device *whole = NULL;  	struct block_device *bdev; -	int res;  	/*  	 * Preserve backwards compatibility and allow large file access @@ -1500,31 +1446,13 @@ static int blkdev_open(struct inode * inode, struct file * filp)  	if (bdev == NULL)  		return -ENOMEM; -	if (filp->f_mode & FMODE_EXCL) { -		whole = bd_start_claiming(bdev, filp); -		if (IS_ERR(whole)) { -			bdput(bdev); -			return PTR_ERR(whole); -		} -	} -  	filp->f_mapping = bdev->bd_inode->i_mapping; -	res = blkdev_get(bdev, filp->f_mode); - -	if (whole) { -		if (res == 0) -			bd_finish_claiming(bdev, whole, filp); -		else -			bd_abort_claiming(whole, filp); -	} - -	return res; +	return blkdev_get(bdev, filp->f_mode, filp);  } -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)  { -	int ret = 0;  	struct gendisk *disk = bdev->bd_disk;  	struct block_device *victim = NULL; @@ -1533,46 +1461,91 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)  		bdev->bd_part_count--;  	if (!--bdev->bd_openers) { +		WARN_ON_ONCE(bdev->bd_holders);  		sync_blockdev(bdev);  		kill_bdev(bdev); +		/* ->release can cause the old bdi to disappear, +		 * so must switch it out first +		 */ +		bdev_inode_switch_bdi(bdev->bd_inode, +					&default_backing_dev_info);  	}  	if (bdev->bd_contains == bdev) {  		if (disk->fops->release) -			ret = disk->fops->release(disk, mode); +			disk->fops->release(disk, mode);  	}  	if (!bdev->bd_openers) {  		struct module *owner = disk->fops->owner; -		put_disk(disk); -		module_put(owner);  		disk_put_part(bdev->bd_part);  		bdev->bd_part = NULL;  		bdev->bd_disk = NULL; -		bdev_inode_switch_bdi(bdev->bd_inode, -					&default_backing_dev_info);  		if (bdev != bdev->bd_contains)  			victim = bdev->bd_contains;  		bdev->bd_contains = NULL; + +		put_disk(disk); +		module_put(owner);  	}  	mutex_unlock(&bdev->bd_mutex);  	bdput(bdev);  	if (victim)  		__blkdev_put(victim, mode, 1); -	return ret;  } -int blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, fmode_t mode)  { -	return __blkdev_put(bdev, mode, 0); +	mutex_lock(&bdev->bd_mutex); + +	if (mode & FMODE_EXCL) { +		bool bdev_free; + +		/* +		 * Release a claim on the device.  The holder fields +		 * are protected with bdev_lock.  bd_mutex is to +		 * synchronize disk_holder unlinking. +		 */ +		spin_lock(&bdev_lock); + +		WARN_ON_ONCE(--bdev->bd_holders < 0); +		WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + +		/* bd_contains might point to self, check in a separate step */ +		if ((bdev_free = !bdev->bd_holders)) +			bdev->bd_holder = NULL; +		if (!bdev->bd_contains->bd_holders) +			bdev->bd_contains->bd_holder = NULL; + +		spin_unlock(&bdev_lock); + +		/* +		 * If this was the last claim, remove holder link and +		 * unblock evpoll if it was a write holder. +		 */ +		if (bdev_free && bdev->bd_write_holder) { +			disk_unblock_events(bdev->bd_disk); +			bdev->bd_write_holder = false; +		} +	} + +	/* +	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE +	 * event.  This is to ensure detection of media removal commanded +	 * from userland - e.g. eject(1). +	 */ +	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + +	mutex_unlock(&bdev->bd_mutex); + +	__blkdev_put(bdev, mode, 0);  }  EXPORT_SYMBOL(blkdev_put);  static int blkdev_close(struct inode * inode, struct file * filp)  {  	struct block_device *bdev = I_BDEV(filp->f_mapping->host); -	if (bdev->bd_holder == filp) -		bd_release(bdev); -	return blkdev_put(bdev, filp->f_mode); +	blkdev_put(bdev, filp->f_mode); +	return 0;  }  static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) @@ -1599,25 +1572,39 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)   * Does not take i_mutex for the write and thus is not for general purpose   * use.   */ -ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, -			 unsigned long nr_segs, loff_t pos) +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)  {  	struct file *file = iocb->ki_filp; +	struct blk_plug plug;  	ssize_t ret; -	BUG_ON(iocb->ki_pos != pos); - -	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); -	if (ret > 0 || ret == -EIOCBQUEUED) { +	blk_start_plug(&plug); +	ret = __generic_file_write_iter(iocb, from); +	if (ret > 0) {  		ssize_t err; - -		err = generic_write_sync(file, pos, ret); -		if (err < 0 && ret > 0) +		err = generic_write_sync(file, iocb->ki_pos - ret, ret); +		if (err < 0)  			ret = err;  	} +	blk_finish_plug(&plug);  	return ret;  } -EXPORT_SYMBOL_GPL(blkdev_aio_write); +EXPORT_SYMBOL_GPL(blkdev_write_iter); + +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ +	struct file *file = iocb->ki_filp; +	struct inode *bd_inode = file->f_mapping->host; +	loff_t size = i_size_read(bd_inode); +	loff_t pos = iocb->ki_pos; + +	if (pos >= size) +		return 0; + +	size -= pos; +	iov_iter_truncate(to, size); +	return generic_file_read_iter(iocb, to); +}  /*   * Try to release a page associated with block device when the system @@ -1636,22 +1623,22 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)  static const struct address_space_operations def_blk_aops = {  	.readpage	= blkdev_readpage,  	.writepage	= blkdev_writepage, -	.sync_page	= block_sync_page,  	.write_begin	= blkdev_write_begin,  	.write_end	= blkdev_write_end,  	.writepages	= generic_writepages,  	.releasepage	= blkdev_releasepage,  	.direct_IO	= blkdev_direct_IO, +	.is_dirty_writeback = buffer_check_dirty_writeback,  };  const struct file_operations def_blk_fops = {  	.open		= blkdev_open,  	.release	= blkdev_close,  	.llseek		= block_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, -  	.aio_read	= generic_file_aio_read, -	.aio_write	= blkdev_aio_write, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	= blkdev_read_iter, +	.write_iter	= blkdev_write_iter,  	.mmap		= generic_file_mmap,  	.fsync		= blkdev_fsync,  	.unlocked_ioctl	= block_ioctl, @@ -1659,7 +1646,7 @@ const struct file_operations def_blk_fops = {  	.compat_ioctl	= compat_blkdev_ioctl,  #endif  	.splice_read	= generic_file_splice_read, -	.splice_write	= generic_file_splice_write, +	.splice_write	= iter_file_splice_write,  };  int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) @@ -1716,68 +1703,7 @@ fail:  }  EXPORT_SYMBOL(lookup_bdev); -/** - * open_bdev_exclusive  -  open a block device by name and set it up for use - * - * @path:	special file representing the block device - * @mode:	FMODE_... combination to pass be used - * @holder:	owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. - */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ -	struct block_device *bdev, *whole; -	int error; - -	bdev = lookup_bdev(path); -	if (IS_ERR(bdev)) -		return bdev; - -	whole = bd_start_claiming(bdev, holder); -	if (IS_ERR(whole)) { -		bdput(bdev); -		return whole; -	} - -	error = blkdev_get(bdev, mode); -	if (error) -		goto out_abort_claiming; - -	error = -EACCES; -	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) -		goto out_blkdev_put; - -	bd_finish_claiming(bdev, whole, holder); -	return bdev; - -out_blkdev_put: -	blkdev_put(bdev, mode); -out_abort_claiming: -	bd_abort_claiming(whole, holder); -	return ERR_PTR(error); -} - -EXPORT_SYMBOL(open_bdev_exclusive); - -/** - * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive() - * - * @bdev:	blockdevice to close - * @mode:	mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). - */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ -	bd_release(bdev); -	blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - -int __invalidate_device(struct block_device *bdev) +int __invalidate_device(struct block_device *bdev, bool kill_dirty)  {  	struct super_block *sb = get_super(bdev);  	int res = 0; @@ -1790,10 +1716,46 @@ int __invalidate_device(struct block_device *bdev)  		 * hold).  		 */  		shrink_dcache_sb(sb); -		res = invalidate_inodes(sb); +		res = invalidate_inodes(sb, kill_dirty);  		drop_super(sb);  	}  	invalidate_bdev(bdev);  	return res;  }  EXPORT_SYMBOL(__invalidate_device); + +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +{ +	struct inode *inode, *old_inode = NULL; + +	spin_lock(&inode_sb_list_lock); +	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { +		struct address_space *mapping = inode->i_mapping; + +		spin_lock(&inode->i_lock); +		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || +		    mapping->nrpages == 0) { +			spin_unlock(&inode->i_lock); +			continue; +		} +		__iget(inode); +		spin_unlock(&inode->i_lock); +		spin_unlock(&inode_sb_list_lock); +		/* +		 * We hold a reference to 'inode' so it couldn't have been +		 * removed from s_inodes list while we dropped the +		 * inode_sb_list_lock.  We cannot iput the inode now as we can +		 * be holding the last reference and we cannot iput it under +		 * inode_sb_list_lock. So we keep the reference and iput it +		 * later. +		 */ +		iput(old_inode); +		old_inode = inode; + +		func(I_BDEV(inode), arg); + +		spin_lock(&inode_sb_list_lock); +	} +	spin_unlock(&inode_sb_list_lock); +	iput(old_inode); +}  | 
