Diffstat (limited to 'block')
46 files changed, 7809 insertions, 726 deletions
diff --git a/block/Kconfig b/block/Kconfig index 7f38e40fee0..2429515c05c 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,11 +99,16 @@ config BLK_DEV_THROTTLING  	See Documentation/cgroups/blkio-controller.txt for more information. -config CMDLINE_PARSER +config BLK_CMDLINE_PARSER  	bool "Block device command line partition parser"  	default n  	---help--- -	Parsing command line, get the partitions information. +	Enabling this option allows you to specify the partition layout from +	the kernel boot args.  This is typically of use for embedded devices +	which don't otherwise have any standardized method for listing the +	partitions on a block device. + +	See Documentation/block/cmdline-partition.txt for more information.  menu "Partition Types" diff --git a/block/Makefile b/block/Makefile index 4fa4be544ec..a2ce6ac935e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,12 +2,15 @@  # Makefile for the kernel block layer  # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \  			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \  			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ -			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ -			partition-generic.o partitions/ +			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ +			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ +			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ +			partitions/ +obj-$(CONFIG_BOUNCE)	+= bounce.o  obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o  obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o  obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o @@ -18,4 +21,5 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o  obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o  obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o -obj-$(CONFIG_CMDLINE_PARSER)	+= cmdline-parser.o +obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o diff --git a/block/bio-integrity.c b/block/bio-integrity.c new file mode 100644 index 00000000000..9e241063a61 --- /dev/null +++ b/block/bio-integrity.c @@ -0,0 +1,657 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING.  If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. 
+ * + */ + +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/export.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/slab.h> + +#define BIP_INLINE_VECS	4 + +static struct kmem_cache *bip_slab; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio:	bio to attach integrity metadata to + * @gfp_mask:	Memory allocation mask + * @nr_vecs:	Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata.  nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, +						  gfp_t gfp_mask, +						  unsigned int nr_vecs) +{ +	struct bio_integrity_payload *bip; +	struct bio_set *bs = bio->bi_pool; +	unsigned long idx = BIO_POOL_NONE; +	unsigned inline_vecs; + +	if (!bs) { +		bip = kmalloc(sizeof(struct bio_integrity_payload) + +			      sizeof(struct bio_vec) * nr_vecs, gfp_mask); +		inline_vecs = nr_vecs; +	} else { +		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); +		inline_vecs = BIP_INLINE_VECS; +	} + +	if (unlikely(!bip)) +		return NULL; + +	memset(bip, 0, sizeof(*bip)); + +	if (nr_vecs > inline_vecs) { +		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, +					  bs->bvec_integrity_pool); +		if (!bip->bip_vec) +			goto err; +	} else { +		bip->bip_vec = bip->bip_inline_vecs; +	} + +	bip->bip_slab = idx; +	bip->bip_bio = bio; +	bio->bi_integrity = bip; + +	return bip; +err: +	mempool_free(bip, bs->bio_integrity_pool); +	return NULL; +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio:	bio containing bip to be freed + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio) +{ +	struct bio_integrity_payload *bip = bio->bi_integrity; +	struct bio_set *bs = bio->bi_pool; + +	if (bip->bip_owns_buf) +		kfree(bip->bip_buf); + +	if (bs) { +		if (bip->bip_slab != BIO_POOL_NONE) +			bvec_free(bs->bvec_integrity_pool, bip->bip_vec, +				  bip->bip_slab); + +		mempool_free(bip, bs->bio_integrity_pool); +	} else { +		kfree(bip); +	} + +	bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) +{ +	if (bip->bip_slab == BIO_POOL_NONE) +		return BIP_INLINE_VECS; + +	return bvec_nr_vecs(bip->bip_slab); +} + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio:	bio to update + * @page:	page containing integrity metadata + * @len:	number of bytes of integrity metadata in page + * @offset:	start offset within page + * + * Description: Attach a page containing integrity metadata to bio. 
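+ *
+ * A minimal usage sketch (hypothetical caller; @meta_page and @meta_len are
+ * placeholders for metadata the caller already owns):
+ *
+ *	struct bio_integrity_payload *bip;
+ *
+ *	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+ *	if (!bip)
+ *		return -ENOMEM;
+ *	if (bio_integrity_add_page(bio, meta_page, meta_len, 0) < meta_len)
+ *		return -EIO;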
+ */ +int bio_integrity_add_page(struct bio *bio, struct page *page, +			   unsigned int len, unsigned int offset) +{ +	struct bio_integrity_payload *bip = bio->bi_integrity; +	struct bio_vec *iv; + +	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { +		printk(KERN_ERR "%s: bip_vec full\n", __func__); +		return 0; +	} + +	iv = bip->bip_vec + bip->bip_vcnt; + +	iv->bv_page = page; +	iv->bv_len = len; +	iv->bv_offset = offset; +	bip->bip_vcnt++; + +	return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ +	struct blk_integrity *bi = bdev_get_integrity(bdev); + +	if (bi == NULL) +		return 0; + +	if (rw == READ && bi->verify_fn != NULL && +	    (bi->flags & INTEGRITY_FLAG_READ)) +		return 1; + +	if (rw == WRITE && bi->generate_fn != NULL && +	    (bi->flags & INTEGRITY_FLAG_WRITE)) +		return 1; + +	return 0; +} + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio:	bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not.	bio data direction and target device must be + * set prior to calling.  The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ +	if (!bio_is_rw(bio)) +		return 0; + +	/* Already protected? */ +	if (bio_integrity(bio)) +		return 0; + +	return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi:		blk_integrity profile for device + * @sectors:	Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device.  Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, +						    unsigned int sectors) +{ +	/* At this point there are only 512b or 4096b DIF/EPP devices */ +	if (bi->sector_size == 4096) +		return sectors >>= 3; + +	return sectors; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, +					       unsigned int sectors) +{ +	return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio:	bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. 
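+ *
+ * For instance, a write path could size and attach its tag data like this
+ * (sketch only; @tag_buf is a placeholder buffer owned by the caller):
+ *
+ *	unsigned int tag_len = bio_integrity_tag_size(bio);
+ *	void *tag_buf = kmalloc(tag_len, GFP_NOIO);
+ *
+ *	if (tag_buf) {
+ *		... fill tag_buf with tag_len bytes of tag data ...
+ *		bio_integrity_set_tag(bio, tag_buf, tag_len);
+ *		kfree(tag_buf);
+ *	}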
+ */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ +	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + +	BUG_ON(bio->bi_iter.bi_size == 0); + +	return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, +			     int set) +{ +	struct bio_integrity_payload *bip = bio->bi_integrity; +	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); +	unsigned int nr_sectors; + +	BUG_ON(bip->bip_buf == NULL); + +	if (bi->tag_size == 0) +		return -1; + +	nr_sectors = bio_integrity_hw_sectors(bi, +					DIV_ROUND_UP(len, bi->tag_size)); + +	if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { +		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, +		       nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); +		return -1; +	} + +	if (set) +		bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); +	else +		bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + +	return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio:	bio to attach buffer to + * @tag_buf:	Pointer to a buffer containing tag data + * @len:	Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection.  The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ +	BUG_ON(bio_data_dir(bio) != WRITE); + +	return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio:	bio to retrieve buffer from + * @tag_buf:	Pointer to a buffer for the tag data + * @len:	Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). 
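+ *
+ * Typical use once a tagged READ has completed (sketch; @tag_buf and
+ * @tag_len are placeholders sized via bio_integrity_tag_size()):
+ *
+ *	if (bio_integrity_get_tag(bio, tag_buf, tag_len) < 0)
+ *		pr_err("failed to read back integrity tags\n");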
+ */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ +	BUG_ON(bio_data_dir(bio) != READ); + +	return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio + * @bio:	bio to generate/verify integrity metadata for + * @operate:	operate number, 1 for generate, 0 for verify + */ +static int bio_integrity_generate_verify(struct bio *bio, int operate) +{ +	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); +	struct blk_integrity_exchg bix; +	struct bio_vec *bv; +	sector_t sector; +	unsigned int sectors, ret = 0, i; +	void *prot_buf = bio->bi_integrity->bip_buf; + +	if (operate) +		sector = bio->bi_iter.bi_sector; +	else +		sector = bio->bi_integrity->bip_iter.bi_sector; + +	bix.disk_name = bio->bi_bdev->bd_disk->disk_name; +	bix.sector_size = bi->sector_size; + +	bio_for_each_segment_all(bv, bio, i) { +		void *kaddr = kmap_atomic(bv->bv_page); +		bix.data_buf = kaddr + bv->bv_offset; +		bix.data_size = bv->bv_len; +		bix.prot_buf = prot_buf; +		bix.sector = sector; + +		if (operate) +			bi->generate_fn(&bix); +		else { +			ret = bi->verify_fn(&bix); +			if (ret) { +				kunmap_atomic(kaddr); +				return ret; +			} +		} + +		sectors = bv->bv_len / bi->sector_size; +		sector += sectors; +		prot_buf += sectors * bi->tuple_size; + +		kunmap_atomic(kaddr); +	} +	return ret; +} + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio:	bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function.  The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ +	bio_integrity_generate_verify(bio, 1); +} + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ +	if (bi) +		return bi->tuple_size; + +	return 0; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio:	bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio.  The bio must have data + * direction, target device and start sector set priot to calling.  In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function.  In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. 
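+ *
+ * A plausible submission-path pattern, run before the bio is mapped to a
+ * request (sketch only; error handling simplified):
+ *
+ *	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+ *		bio_endio(bio, -EIO);
+ *		return;
+ *	}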
+ */ +int bio_integrity_prep(struct bio *bio) +{ +	struct bio_integrity_payload *bip; +	struct blk_integrity *bi; +	struct request_queue *q; +	void *buf; +	unsigned long start, end; +	unsigned int len, nr_pages; +	unsigned int bytes, offset, i; +	unsigned int sectors; + +	bi = bdev_get_integrity(bio->bi_bdev); +	q = bdev_get_queue(bio->bi_bdev); +	BUG_ON(bi == NULL); +	BUG_ON(bio_integrity(bio)); + +	sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + +	/* Allocate kernel buffer for protection data */ +	len = sectors * blk_integrity_tuple_size(bi); +	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); +	if (unlikely(buf == NULL)) { +		printk(KERN_ERR "could not allocate integrity buffer\n"); +		return -ENOMEM; +	} + +	end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; +	start = ((unsigned long) buf) >> PAGE_SHIFT; +	nr_pages = end - start; + +	/* Allocate bio integrity payload and integrity vectors */ +	bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); +	if (unlikely(bip == NULL)) { +		printk(KERN_ERR "could not allocate data integrity bioset\n"); +		kfree(buf); +		return -EIO; +	} + +	bip->bip_owns_buf = 1; +	bip->bip_buf = buf; +	bip->bip_iter.bi_size = len; +	bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; + +	/* Map it */ +	offset = offset_in_page(buf); +	for (i = 0 ; i < nr_pages ; i++) { +		int ret; +		bytes = PAGE_SIZE - offset; + +		if (len <= 0) +			break; + +		if (bytes > len) +			bytes = len; + +		ret = bio_integrity_add_page(bio, virt_to_page(buf), +					     bytes, offset); + +		if (ret == 0) +			return 0; + +		if (ret < bytes) +			break; + +		buf += bytes; +		len -= bytes; +		offset = 0; +	} + +	/* Install custom I/O completion handler if read verify is enabled */ +	if (bio_data_dir(bio) == READ) { +		bip->bip_end_io = bio->bi_end_io; +		bio->bi_end_io = bio_integrity_endio; +	} + +	/* Auto-generate integrity metadata if this is a write */ +	if (bio_data_dir(bio) == WRITE) +		bio_integrity_generate(bio); + +	return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio:	bio to verify + * + * Description: This function is called to verify the integrity of a + * bio.	 The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ +	return bio_integrity_generate_verify(bio, 0); +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work:	Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request.  The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ +	struct bio_integrity_payload *bip = +		container_of(work, struct bio_integrity_payload, bip_work); +	struct bio *bio = bip->bip_bio; +	int error; + +	error = bio_integrity_verify(bio); + +	/* Restore original bio completion handler */ +	bio->bi_end_io = bip->bip_end_io; +	bio_endio_nodec(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio:	Protected bio + * @error:	Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context.  However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context.	This function postpones completion + * accordingly. 
+ */ +void bio_integrity_endio(struct bio *bio, int error) +{ +	struct bio_integrity_payload *bip = bio->bi_integrity; + +	BUG_ON(bip->bip_bio != bio); + +	/* In case of an I/O error there is no point in verifying the +	 * integrity metadata.  Restore original bio end_io handler +	 * and run it. +	 */ +	if (error) { +		bio->bi_end_io = bip->bip_end_io; +		bio_endio(bio, error); + +		return; +	} + +	INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); +	queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_advance - Advance integrity vector + * @bio:	bio whose integrity vector to update + * @bytes_done:	number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ +	struct bio_integrity_payload *bip = bio->bi_integrity; +	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); +	unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); + +	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio:	bio whose integrity vector to update + * @offset:	offset to first data sector + * @sectors:	number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, +			unsigned int sectors) +{ +	struct bio_integrity_payload *bip = bio->bi_integrity; +	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + +	bio_integrity_advance(bio, offset << 9); +	bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio:	New bio + * @bio_src:	Original bio + * @gfp_mask:	Memory allocation mask + * + * Description:	Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, +			gfp_t gfp_mask) +{ +	struct bio_integrity_payload *bip_src = bio_src->bi_integrity; +	struct bio_integrity_payload *bip; + +	BUG_ON(bip_src == NULL); + +	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + +	if (bip == NULL) +		return -EIO; + +	memcpy(bip->bip_vec, bip_src->bip_vec, +	       bip_src->bip_vcnt * sizeof(struct bio_vec)); + +	bip->bip_vcnt = bip_src->bip_vcnt; +	bip->bip_iter = bip_src->bip_iter; + +	return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ +	if (bs->bio_integrity_pool) +		return 0; + +	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); +	if (!bs->bio_integrity_pool) +		return -1; + +	bs->bvec_integrity_pool = biovec_create_pool(pool_size); +	if (!bs->bvec_integrity_pool) { +		mempool_destroy(bs->bio_integrity_pool); +		return -1; +	} + +	return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ +	if (bs->bio_integrity_pool) +		mempool_destroy(bs->bio_integrity_pool); + +	if (bs->bvec_integrity_pool) +		mempool_destroy(bs->bvec_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init(void) +{ +	/* +	 * kintegrityd won't 
block much but may burn a lot of CPU cycles. +	 * Make it highpri CPU intensive wq with max concurrency of 1. +	 */ +	kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | +					 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); +	if (!kintegrityd_wq) +		panic("Failed to create kintegrityd\n"); + +	bip_slab = kmem_cache_create("bio_integrity_payload", +				     sizeof(struct bio_integrity_payload) + +				     sizeof(struct bio_vec) * BIP_INLINE_VECS, +				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +	if (!bip_slab) +		panic("Failed to create slab\n"); +} diff --git a/block/bio.c b/block/bio.c new file mode 100644 index 00000000000..0ec61c9e536 --- /dev/null +++ b/block/bio.c @@ -0,0 +1,2052 @@ +/* + * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111- + * + */ +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/uio.h> +#include <linux/iocontext.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mempool.h> +#include <linux/workqueue.h> +#include <linux/cgroup.h> +#include <scsi/sg.h>		/* for struct sg_iovec */ + +#include <trace/events/block.h> + +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS		4 + +/* + * if you change this list, also change bvec_alloc or things will + * break badly! cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { +	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), +}; +#undef BV + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools. 
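+ *
+ * Such code can allocate straight from it, e.g. (sketch):
+ *
+ *	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, fs_bio_set);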
+ */ +struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); + +/* + * Our slab pool management + */ +struct bio_slab { +	struct kmem_cache *slab; +	unsigned int slab_ref; +	unsigned int slab_size; +	char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ +	unsigned int sz = sizeof(struct bio) + extra_size; +	struct kmem_cache *slab = NULL; +	struct bio_slab *bslab, *new_bio_slabs; +	unsigned int new_bio_slab_max; +	unsigned int i, entry = -1; + +	mutex_lock(&bio_slab_lock); + +	i = 0; +	while (i < bio_slab_nr) { +		bslab = &bio_slabs[i]; + +		if (!bslab->slab && entry == -1) +			entry = i; +		else if (bslab->slab_size == sz) { +			slab = bslab->slab; +			bslab->slab_ref++; +			break; +		} +		i++; +	} + +	if (slab) +		goto out_unlock; + +	if (bio_slab_nr == bio_slab_max && entry == -1) { +		new_bio_slab_max = bio_slab_max << 1; +		new_bio_slabs = krealloc(bio_slabs, +					 new_bio_slab_max * sizeof(struct bio_slab), +					 GFP_KERNEL); +		if (!new_bio_slabs) +			goto out_unlock; +		bio_slab_max = new_bio_slab_max; +		bio_slabs = new_bio_slabs; +	} +	if (entry == -1) +		entry = bio_slab_nr++; + +	bslab = &bio_slabs[entry]; + +	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); +	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); +	if (!slab) +		goto out_unlock; + +	bslab->slab = slab; +	bslab->slab_ref = 1; +	bslab->slab_size = sz; +out_unlock: +	mutex_unlock(&bio_slab_lock); +	return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ +	struct bio_slab *bslab = NULL; +	unsigned int i; + +	mutex_lock(&bio_slab_lock); + +	for (i = 0; i < bio_slab_nr; i++) { +		if (bs->bio_slab == bio_slabs[i].slab) { +			bslab = &bio_slabs[i]; +			break; +		} +	} + +	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) +		goto out; + +	WARN_ON(!bslab->slab_ref); + +	if (--bslab->slab_ref) +		goto out; + +	kmem_cache_destroy(bslab->slab); +	bslab->slab = NULL; + +out: +	mutex_unlock(&bio_slab_lock); +} + +unsigned int bvec_nr_vecs(unsigned short idx) +{ +	return bvec_slabs[idx].nr_vecs; +} + +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) +{ +	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); + +	if (idx == BIOVEC_MAX_IDX) +		mempool_free(bv, pool); +	else { +		struct biovec_slab *bvs = bvec_slabs + idx; + +		kmem_cache_free(bvs->slab, bv); +	} +} + +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, +			   mempool_t *pool) +{ +	struct bio_vec *bvl; + +	/* +	 * see comment near bvec_array define! +	 */ +	switch (nr) { +	case 1: +		*idx = 0; +		break; +	case 2 ... 4: +		*idx = 1; +		break; +	case 5 ... 16: +		*idx = 2; +		break; +	case 17 ... 64: +		*idx = 3; +		break; +	case 65 ... 128: +		*idx = 4; +		break; +	case 129 ... BIO_MAX_PAGES: +		*idx = 5; +		break; +	default: +		return NULL; +	} + +	/* +	 * idx now points to the pool we want to allocate from. only the +	 * 1-vec entry pool is mempool backed. +	 */ +	if (*idx == BIOVEC_MAX_IDX) { +fallback: +		bvl = mempool_alloc(pool, gfp_mask); +	} else { +		struct biovec_slab *bvs = bvec_slabs + *idx; +		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + +		/* +		 * Make this allocation restricted and don't dump info on +		 * allocation failures, since we'll fallback to the mempool +		 * in case of failure. +		 */ +		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + +		/* +		 * Try a slab allocation. 
If this fails and __GFP_WAIT +		 * is set, retry with the 1-entry mempool +		 */ +		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); +		if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { +			*idx = BIOVEC_MAX_IDX; +			goto fallback; +		} +	} + +	return bvl; +} + +static void __bio_free(struct bio *bio) +{ +	bio_disassociate_task(bio); + +	if (bio_integrity(bio)) +		bio_integrity_free(bio); +} + +static void bio_free(struct bio *bio) +{ +	struct bio_set *bs = bio->bi_pool; +	void *p; + +	__bio_free(bio); + +	if (bs) { +		if (bio_flagged(bio, BIO_OWNS_VEC)) +			bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); + +		/* +		 * If we have front padding, adjust the bio pointer before freeing +		 */ +		p = bio; +		p -= bs->front_pad; + +		mempool_free(p, bs->bio_pool); +	} else { +		/* Bio was allocated by bio_kmalloc() */ +		kfree(bio); +	} +} + +void bio_init(struct bio *bio) +{ +	memset(bio, 0, sizeof(*bio)); +	bio->bi_flags = 1 << BIO_UPTODATE; +	atomic_set(&bio->bi_remaining, 1); +	atomic_set(&bio->bi_cnt, 1); +} +EXPORT_SYMBOL(bio_init); + +/** + * bio_reset - reinitialize a bio + * @bio:	bio to reset + * + * Description: + *   After calling bio_reset(), @bio will be in the same state as a freshly + *   allocated bio returned bio bio_alloc_bioset() - the only fields that are + *   preserved are the ones that are initialized by bio_alloc_bioset(). See + *   comment in struct bio. + */ +void bio_reset(struct bio *bio) +{ +	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + +	__bio_free(bio); + +	memset(bio, 0, BIO_RESET_BYTES); +	bio->bi_flags = flags|(1 << BIO_UPTODATE); +	atomic_set(&bio->bi_remaining, 1); +} +EXPORT_SYMBOL(bio_reset); + +static void bio_chain_endio(struct bio *bio, int error) +{ +	bio_endio(bio->bi_private, error); +	bio_put(bio); +} + +/** + * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. + */ +void bio_chain(struct bio *bio, struct bio *parent) +{ +	BUG_ON(bio->bi_private || bio->bi_end_io); + +	bio->bi_private = parent; +	bio->bi_end_io	= bio_chain_endio; +	atomic_inc(&parent->bi_remaining); +} +EXPORT_SYMBOL(bio_chain); + +static void bio_alloc_rescue(struct work_struct *work) +{ +	struct bio_set *bs = container_of(work, struct bio_set, rescue_work); +	struct bio *bio; + +	while (1) { +		spin_lock(&bs->rescue_lock); +		bio = bio_list_pop(&bs->rescue_list); +		spin_unlock(&bs->rescue_lock); + +		if (!bio) +			break; + +		generic_make_request(bio); +	} +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ +	struct bio_list punt, nopunt; +	struct bio *bio; + +	/* +	 * In order to guarantee forward progress we must punt only bios that +	 * were allocated from this bio_set; otherwise, if there was a bio on +	 * there for a stacking driver higher up in the stack, processing it +	 * could require allocating bios from this bio_set, and doing that from +	 * our own rescuer would be bad. +	 * +	 * Since bio lists are singly linked, pop them all instead of trying to +	 * remove from the middle of the list: +	 */ + +	bio_list_init(&punt); +	bio_list_init(&nopunt); + +	while ((bio = bio_list_pop(current->bio_list))) +		bio_list_add(bio->bi_pool == bs ? 
&punt : &nopunt, bio); + +	*current->bio_list = nopunt; + +	spin_lock(&bs->rescue_lock); +	bio_list_merge(&bs->rescue_list, &punt); +	spin_unlock(&bs->rescue_lock); + +	queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + +/** + * bio_alloc_bioset - allocate a bio for I/O + * @gfp_mask:   the GFP_ mask given to the slab allocator + * @nr_iovecs:	number of iovecs to pre-allocate + * @bs:		the bio_set to allocate from. + * + * Description: + *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + *   backed by the @bs's mempool. + * + *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be + *   able to allocate a bio. This is due to the mempool guarantees. To make this + *   work, callers must never allocate more than 1 bio at a time from this pool. + *   Callers that need to allocate more than 1 bio must always submit the + *   previously allocated bio for IO before attempting to allocate a new one. + *   Failure to do so can cause deadlocks under memory pressure. + * + *   Note that when running under generic_make_request() (i.e. any block + *   driver), bios are not submitted until after you return - see the code in + *   generic_make_request() that converts recursion into iteration, to prevent + *   stack overflows. + * + *   This would normally mean allocating multiple bios under + *   generic_make_request() would be susceptible to deadlocks, but we have + *   deadlock avoidance code that resubmits any blocked bios from a rescuer + *   thread. + * + *   However, we do not guarantee forward progress for allocations from other + *   mempools. Doing multiple allocations from the same mempool under + *   generic_make_request() should be avoided - instead, use bio_set's front_pad + *   for per bio allocations. + * + *   RETURNS: + *   Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +{ +	gfp_t saved_gfp = gfp_mask; +	unsigned front_pad; +	unsigned inline_vecs; +	unsigned long idx = BIO_POOL_NONE; +	struct bio_vec *bvl = NULL; +	struct bio *bio; +	void *p; + +	if (!bs) { +		if (nr_iovecs > UIO_MAXIOV) +			return NULL; + +		p = kmalloc(sizeof(struct bio) + +			    nr_iovecs * sizeof(struct bio_vec), +			    gfp_mask); +		front_pad = 0; +		inline_vecs = nr_iovecs; +	} else { +		/* +		 * generic_make_request() converts recursion to iteration; this +		 * means if we're running beneath it, any bios we allocate and +		 * submit will not be submitted (and thus freed) until after we +		 * return. +		 * +		 * This exposes us to a potential deadlock if we allocate +		 * multiple bios from the same bio_set() while running +		 * underneath generic_make_request(). If we were to allocate +		 * multiple bios (say a stacking block driver that was splitting +		 * bios), we would deadlock if we exhausted the mempool's +		 * reserve. +		 * +		 * We solve this, and guarantee forward progress, with a rescuer +		 * workqueue per bio_set. If we go to allocate and there are +		 * bios on current->bio_list, we first try the allocation +		 * without __GFP_WAIT; if that fails, we punt those bios we +		 * would be blocking to the rescuer workqueue before we retry +		 * with the original gfp_flags. 
+		 */ + +		if (current->bio_list && !bio_list_empty(current->bio_list)) +			gfp_mask &= ~__GFP_WAIT; + +		p = mempool_alloc(bs->bio_pool, gfp_mask); +		if (!p && gfp_mask != saved_gfp) { +			punt_bios_to_rescuer(bs); +			gfp_mask = saved_gfp; +			p = mempool_alloc(bs->bio_pool, gfp_mask); +		} + +		front_pad = bs->front_pad; +		inline_vecs = BIO_INLINE_VECS; +	} + +	if (unlikely(!p)) +		return NULL; + +	bio = p + front_pad; +	bio_init(bio); + +	if (nr_iovecs > inline_vecs) { +		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); +		if (!bvl && gfp_mask != saved_gfp) { +			punt_bios_to_rescuer(bs); +			gfp_mask = saved_gfp; +			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); +		} + +		if (unlikely(!bvl)) +			goto err_free; + +		bio->bi_flags |= 1 << BIO_OWNS_VEC; +	} else if (nr_iovecs) { +		bvl = bio->bi_inline_vecs; +	} + +	bio->bi_pool = bs; +	bio->bi_flags |= idx << BIO_POOL_OFFSET; +	bio->bi_max_vecs = nr_iovecs; +	bio->bi_io_vec = bvl; +	return bio; + +err_free: +	mempool_free(p, bs->bio_pool); +	return NULL; +} +EXPORT_SYMBOL(bio_alloc_bioset); + +void zero_fill_bio(struct bio *bio) +{ +	unsigned long flags; +	struct bio_vec bv; +	struct bvec_iter iter; + +	bio_for_each_segment(bv, bio, iter) { +		char *data = bvec_kmap_irq(&bv, &flags); +		memset(data, 0, bv.bv_len); +		flush_dcache_page(bv.bv_page); +		bvec_kunmap_irq(data, &flags); +	} +} +EXPORT_SYMBOL(zero_fill_bio); + +/** + * bio_put - release a reference to a bio + * @bio:   bio to release reference to + * + * Description: + *   Put a reference to a &struct bio, either one you have gotten with + *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it. + **/ +void bio_put(struct bio *bio) +{ +	BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); + +	/* +	 * last put frees it +	 */ +	if (atomic_dec_and_test(&bio->bi_cnt)) +		bio_free(bio); +} +EXPORT_SYMBOL(bio_put); + +inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +{ +	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) +		blk_recount_segments(q, bio); + +	return bio->bi_phys_segments; +} +EXPORT_SYMBOL(bio_phys_segments); + +/** + * 	__bio_clone_fast - clone a bio that shares the original bio's biovec + * 	@bio: destination bio + * 	@bio_src: bio to clone + * + *	Clone a &bio. Caller will own the returned bio, but not + *	the actual data it points to. Reference count of returned + * 	bio will be one. + * + * 	Caller must ensure that @bio_src is not freed before @bio. 
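+ *
+ * 	A stacking-driver style sketch (assumes a caller-owned bio_set @bs,
+ * 	target device @target_bdev and completion handler @my_endio):
+ *
+ *	struct bio *clone = bio_alloc_bioset(GFP_NOIO, 0, bs);
+ *
+ *	__bio_clone_fast(clone, bio);
+ *	clone->bi_bdev = target_bdev;
+ *	clone->bi_end_io = my_endio;
+ *	generic_make_request(clone);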
+ */ +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +{ +	BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); + +	/* +	 * most users will be overriding ->bi_bdev with a new target, +	 * so we don't set nor calculate new physical/hw segment counts here +	 */ +	bio->bi_bdev = bio_src->bi_bdev; +	bio->bi_flags |= 1 << BIO_CLONED; +	bio->bi_rw = bio_src->bi_rw; +	bio->bi_iter = bio_src->bi_iter; +	bio->bi_io_vec = bio_src->bi_io_vec; +} +EXPORT_SYMBOL(__bio_clone_fast); + +/** + *	bio_clone_fast - clone a bio that shares the original bio's biovec + *	@bio: bio to clone + *	@gfp_mask: allocation priority + *	@bs: bio_set to allocate from + * + * 	Like __bio_clone_fast, only also allocates the returned bio + */ +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +{ +	struct bio *b; + +	b = bio_alloc_bioset(gfp_mask, 0, bs); +	if (!b) +		return NULL; + +	__bio_clone_fast(b, bio); + +	if (bio_integrity(bio)) { +		int ret; + +		ret = bio_integrity_clone(b, bio, gfp_mask); + +		if (ret < 0) { +			bio_put(b); +			return NULL; +		} +	} + +	return b; +} +EXPORT_SYMBOL(bio_clone_fast); + +/** + * 	bio_clone_bioset - clone a bio + * 	@bio_src: bio to clone + *	@gfp_mask: allocation priority + *	@bs: bio_set to allocate from + * + *	Clone bio. Caller will own the returned bio, but not the actual data it + *	points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, +			     struct bio_set *bs) +{ +	struct bvec_iter iter; +	struct bio_vec bv; +	struct bio *bio; + +	/* +	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from +	 * bio_src->bi_io_vec to bio->bi_io_vec. +	 * +	 * We can't do that anymore, because: +	 * +	 *  - The point of cloning the biovec is to produce a bio with a biovec +	 *    the caller can modify: bi_idx and bi_bvec_done should be 0. +	 * +	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if +	 *    we tried to clone the whole thing bio_alloc_bioset() would fail. +	 *    But the clone should succeed as long as the number of biovecs we +	 *    actually need to allocate is fewer than BIO_MAX_PAGES. +	 * +	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code +	 *    that does not own the bio - reason being drivers don't use it for +	 *    iterating over the biovec anymore, so expecting it to be kept up +	 *    to date (i.e. for clones that share the parent biovec) is just +	 *    asking for trouble and would force extra work on +	 *    __bio_clone_fast() anyways. +	 */ + +	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); +	if (!bio) +		return NULL; + +	bio->bi_bdev		= bio_src->bi_bdev; +	bio->bi_rw		= bio_src->bi_rw; +	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector; +	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size; + +	if (bio->bi_rw & REQ_DISCARD) +		goto integrity_clone; + +	if (bio->bi_rw & REQ_WRITE_SAME) { +		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; +		goto integrity_clone; +	} + +	bio_for_each_segment(bv, bio_src, iter) +		bio->bi_io_vec[bio->bi_vcnt++] = bv; + +integrity_clone: +	if (bio_integrity(bio_src)) { +		int ret; + +		ret = bio_integrity_clone(bio, bio_src, gfp_mask); +		if (ret < 0) { +			bio_put(bio); +			return NULL; +		} +	} + +	return bio; +} +EXPORT_SYMBOL(bio_clone_bioset); + +/** + *	bio_get_nr_vecs		- return approx number of vecs + *	@bdev:  I/O target + * + *	Return the approximate number of pages we can send to this target. 
+ *	There's no guarantee that you will be able to fit this number of pages + *	into a bio, it does not account for dynamic restrictions that vary + *	on offset. + */ +int bio_get_nr_vecs(struct block_device *bdev) +{ +	struct request_queue *q = bdev_get_queue(bdev); +	int nr_pages; + +	nr_pages = min_t(unsigned, +		     queue_max_segments(q), +		     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); + +	return min_t(unsigned, nr_pages, BIO_MAX_PAGES); + +} +EXPORT_SYMBOL(bio_get_nr_vecs); + +static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page +			  *page, unsigned int len, unsigned int offset, +			  unsigned int max_sectors) +{ +	int retried_segments = 0; +	struct bio_vec *bvec; + +	/* +	 * cloned bio must not modify vec list +	 */ +	if (unlikely(bio_flagged(bio, BIO_CLONED))) +		return 0; + +	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) +		return 0; + +	/* +	 * For filesystems with a blocksize smaller than the pagesize +	 * we will often be called with the same page as last time and +	 * a consecutive offset.  Optimize this special case. +	 */ +	if (bio->bi_vcnt > 0) { +		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + +		if (page == prev->bv_page && +		    offset == prev->bv_offset + prev->bv_len) { +			unsigned int prev_bv_len = prev->bv_len; +			prev->bv_len += len; + +			if (q->merge_bvec_fn) { +				struct bvec_merge_data bvm = { +					/* prev_bvec is already charged in +					   bi_size, discharge it in order to +					   simulate merging updated prev_bvec +					   as new bvec. */ +					.bi_bdev = bio->bi_bdev, +					.bi_sector = bio->bi_iter.bi_sector, +					.bi_size = bio->bi_iter.bi_size - +						prev_bv_len, +					.bi_rw = bio->bi_rw, +				}; + +				if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { +					prev->bv_len -= len; +					return 0; +				} +			} + +			goto done; +		} + +		/* +		 * If the queue doesn't support SG gaps and adding this +		 * offset would create a gap, disallow it. +		 */ +		if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && +		    bvec_gap_to_prev(prev, offset)) +			return 0; +	} + +	if (bio->bi_vcnt >= bio->bi_max_vecs) +		return 0; + +	/* +	 * we might lose a segment or two here, but rather that than +	 * make this too complex. 
+	 */ + +	while (bio->bi_phys_segments >= queue_max_segments(q)) { + +		if (retried_segments) +			return 0; + +		retried_segments = 1; +		blk_recount_segments(q, bio); +	} + +	/* +	 * setup the new entry, we might clear it again later if we +	 * cannot add the page +	 */ +	bvec = &bio->bi_io_vec[bio->bi_vcnt]; +	bvec->bv_page = page; +	bvec->bv_len = len; +	bvec->bv_offset = offset; + +	/* +	 * if queue has other restrictions (eg varying max sector size +	 * depending on offset), it can specify a merge_bvec_fn in the +	 * queue to get further control +	 */ +	if (q->merge_bvec_fn) { +		struct bvec_merge_data bvm = { +			.bi_bdev = bio->bi_bdev, +			.bi_sector = bio->bi_iter.bi_sector, +			.bi_size = bio->bi_iter.bi_size, +			.bi_rw = bio->bi_rw, +		}; + +		/* +		 * merge_bvec_fn() returns number of bytes it can accept +		 * at this offset +		 */ +		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { +			bvec->bv_page = NULL; +			bvec->bv_len = 0; +			bvec->bv_offset = 0; +			return 0; +		} +	} + +	/* If we may be able to merge these biovecs, force a recount */ +	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) +		bio->bi_flags &= ~(1 << BIO_SEG_VALID); + +	bio->bi_vcnt++; +	bio->bi_phys_segments++; + done: +	bio->bi_iter.bi_size += len; +	return len; +} + +/** + *	bio_add_pc_page	-	attempt to add page to bio + *	@q: the target queue + *	@bio: destination bio + *	@page: page to add + *	@len: vec entry length + *	@offset: vec entry offset + * + *	Attempt to add a page to the bio_vec maplist. This can fail for a + *	number of reasons, such as the bio being full or target block device + *	limitations. The target block device must allow bio's up to PAGE_SIZE, + *	so it is always possible to add a single page to an empty bio. + * + *	This should only be used by REQ_PC bios. + */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, +		    unsigned int len, unsigned int offset) +{ +	return __bio_add_page(q, bio, page, len, offset, +			      queue_max_hw_sectors(q)); +} +EXPORT_SYMBOL(bio_add_pc_page); + +/** + *	bio_add_page	-	attempt to add page to bio + *	@bio: destination bio + *	@page: page to add + *	@len: vec entry length + *	@offset: vec entry offset + * + *	Attempt to add a page to the bio_vec maplist. This can fail for a + *	number of reasons, such as the bio being full or target block device + *	limitations. The target block device must allow bio's up to PAGE_SIZE, + *	so it is always possible to add a single page to an empty bio. + */ +int bio_add_page(struct bio *bio, struct page *page, unsigned int len, +		 unsigned int offset) +{ +	struct request_queue *q = bdev_get_queue(bio->bi_bdev); +	unsigned int max_sectors; + +	max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); +	if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) +		max_sectors = len >> 9; + +	return __bio_add_page(q, bio, page, len, offset, max_sectors); +} +EXPORT_SYMBOL(bio_add_page); + +struct submit_bio_ret { +	struct completion event; +	int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ +	struct submit_bio_ret *ret = bio->bi_private; + +	ret->error = error; +	complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. 
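+ *
+ * A synchronous single-page write could look like this (sketch; @bdev,
+ * @sector and @page are placeholders owned by the caller):
+ *
+ *	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, fs_bio_set);
+ *	int err;
+ *
+ *	bio->bi_bdev = bdev;
+ *	bio->bi_iter.bi_sector = sector;
+ *	bio_add_page(bio, page, PAGE_SIZE, 0);
+ *	err = submit_bio_wait(WRITE, bio);
+ *	bio_put(bio);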
+ */ +int submit_bio_wait(int rw, struct bio *bio) +{ +	struct submit_bio_ret ret; + +	rw |= REQ_SYNC; +	init_completion(&ret.event); +	bio->bi_private = &ret; +	bio->bi_end_io = submit_bio_wait_endio; +	submit_bio(rw, bio); +	wait_for_completion(&ret.event); + +	return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio:	bio to advance + * @bytes:	number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ +	if (bio_integrity(bio)) +		bio_integrity_advance(bio, bytes); + +	bio_advance_iter(bio, &bio->bi_iter, bytes); +} +EXPORT_SYMBOL(bio_advance); + +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ +	int i; +	struct bio_vec *bv; + +	bio_for_each_segment_all(bv, bio, i) { +		bv->bv_page = alloc_page(gfp_mask); +		if (!bv->bv_page) { +			while (--bv >= bio->bi_io_vec) +				__free_page(bv->bv_page); +			return -ENOMEM; +		} +	} + +	return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
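+ *
+ * One way to give a bio a private copy of its payload (sketch; @bs is a
+ * caller-owned bio_set):
+ *
+ *	struct bio *copy = bio_clone_bioset(src, GFP_NOIO, bs);
+ *
+ *	if (copy && !bio_alloc_pages(copy, GFP_NOIO))
+ *		bio_copy_data(copy, src);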
+ */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ +	struct bvec_iter src_iter, dst_iter; +	struct bio_vec src_bv, dst_bv; +	void *src_p, *dst_p; +	unsigned bytes; + +	src_iter = src->bi_iter; +	dst_iter = dst->bi_iter; + +	while (1) { +		if (!src_iter.bi_size) { +			src = src->bi_next; +			if (!src) +				break; + +			src_iter = src->bi_iter; +		} + +		if (!dst_iter.bi_size) { +			dst = dst->bi_next; +			if (!dst) +				break; + +			dst_iter = dst->bi_iter; +		} + +		src_bv = bio_iter_iovec(src, src_iter); +		dst_bv = bio_iter_iovec(dst, dst_iter); + +		bytes = min(src_bv.bv_len, dst_bv.bv_len); + +		src_p = kmap_atomic(src_bv.bv_page); +		dst_p = kmap_atomic(dst_bv.bv_page); + +		memcpy(dst_p + dst_bv.bv_offset, +		       src_p + src_bv.bv_offset, +		       bytes); + +		kunmap_atomic(dst_p); +		kunmap_atomic(src_p); + +		bio_advance_iter(src, &src_iter, bytes); +		bio_advance_iter(dst, &dst_iter, bytes); +	} +} +EXPORT_SYMBOL(bio_copy_data); + +struct bio_map_data { +	int nr_sgvecs; +	int is_our_pages; +	struct sg_iovec sgvecs[]; +}; + +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, +			     const struct sg_iovec *iov, int iov_count, +			     int is_our_pages) +{ +	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); +	bmd->nr_sgvecs = iov_count; +	bmd->is_our_pages = is_our_pages; +	bio->bi_private = bmd; +} + +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, +					       gfp_t gfp_mask) +{ +	if (iov_count > UIO_MAXIOV) +		return NULL; + +	return kmalloc(sizeof(struct bio_map_data) + +		       sizeof(struct sg_iovec) * iov_count, gfp_mask); +} + +static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, +			  int to_user, int from_user, int do_free_page) +{ +	int ret = 0, i; +	struct bio_vec *bvec; +	int iov_idx = 0; +	unsigned int iov_off = 0; + +	bio_for_each_segment_all(bvec, bio, i) { +		char *bv_addr = page_address(bvec->bv_page); +		unsigned int bv_len = bvec->bv_len; + +		while (bv_len && iov_idx < iov_count) { +			unsigned int bytes; +			char __user *iov_addr; + +			bytes = min_t(unsigned int, +				      iov[iov_idx].iov_len - iov_off, bv_len); +			iov_addr = iov[iov_idx].iov_base + iov_off; + +			if (!ret) { +				if (to_user) +					ret = copy_to_user(iov_addr, bv_addr, +							   bytes); + +				if (from_user) +					ret = copy_from_user(bv_addr, iov_addr, +							     bytes); + +				if (ret) +					ret = -EFAULT; +			} + +			bv_len -= bytes; +			bv_addr += bytes; +			iov_addr += bytes; +			iov_off += bytes; + +			if (iov[iov_idx].iov_len == iov_off) { +				iov_idx++; +				iov_off = 0; +			} +		} + +		if (do_free_page) +			__free_page(bvec->bv_page); +	} + +	return ret; +} + +/** + *	bio_uncopy_user	-	finish previously mapped bio + *	@bio: bio being terminated + * + *	Free pages allocated from bio_copy_user() and write back data + *	to user space in case of a read. + */ +int bio_uncopy_user(struct bio *bio) +{ +	struct bio_map_data *bmd = bio->bi_private; +	struct bio_vec *bvec; +	int ret = 0, i; + +	if (!bio_flagged(bio, BIO_NULL_MAPPED)) { +		/* +		 * if we're in a workqueue, the request is orphaned, so +		 * don't copy into a random user address space, just free. 
+		 */ +		if (current->mm) +			ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, +					     bio_data_dir(bio) == READ, +					     0, bmd->is_our_pages); +		else if (bmd->is_our_pages) +			bio_for_each_segment_all(bvec, bio, i) +				__free_page(bvec->bv_page); +	} +	kfree(bmd); +	bio_put(bio); +	return ret; +} +EXPORT_SYMBOL(bio_uncopy_user); + +/** + *	bio_copy_user_iov	-	copy user data to bio + *	@q: destination block queue + *	@map_data: pointer to the rq_map_data holding pages (if necessary) + *	@iov:	the iovec. + *	@iov_count: number of elements in the iovec + *	@write_to_vm: bool indicating writing to pages or not + *	@gfp_mask: memory allocation flags + * + *	Prepares and returns a bio for indirect user io, bouncing data + *	to/from kernel pages as necessary. Must be paired with + *	call bio_uncopy_user() on io completion. + */ +struct bio *bio_copy_user_iov(struct request_queue *q, +			      struct rq_map_data *map_data, +			      const struct sg_iovec *iov, int iov_count, +			      int write_to_vm, gfp_t gfp_mask) +{ +	struct bio_map_data *bmd; +	struct bio_vec *bvec; +	struct page *page; +	struct bio *bio; +	int i, ret; +	int nr_pages = 0; +	unsigned int len = 0; +	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + +	for (i = 0; i < iov_count; i++) { +		unsigned long uaddr; +		unsigned long end; +		unsigned long start; + +		uaddr = (unsigned long)iov[i].iov_base; +		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; +		start = uaddr >> PAGE_SHIFT; + +		/* +		 * Overflow, abort +		 */ +		if (end < start) +			return ERR_PTR(-EINVAL); + +		nr_pages += end - start; +		len += iov[i].iov_len; +	} + +	if (offset) +		nr_pages++; + +	bmd = bio_alloc_map_data(iov_count, gfp_mask); +	if (!bmd) +		return ERR_PTR(-ENOMEM); + +	ret = -ENOMEM; +	bio = bio_kmalloc(gfp_mask, nr_pages); +	if (!bio) +		goto out_bmd; + +	if (!write_to_vm) +		bio->bi_rw |= REQ_WRITE; + +	ret = 0; + +	if (map_data) { +		nr_pages = 1 << map_data->page_order; +		i = map_data->offset / PAGE_SIZE; +	} +	while (len) { +		unsigned int bytes = PAGE_SIZE; + +		bytes -= offset; + +		if (bytes > len) +			bytes = len; + +		if (map_data) { +			if (i == map_data->nr_entries * nr_pages) { +				ret = -ENOMEM; +				break; +			} + +			page = map_data->pages[i / nr_pages]; +			page += (i % nr_pages); + +			i++; +		} else { +			page = alloc_page(q->bounce_gfp | gfp_mask); +			if (!page) { +				ret = -ENOMEM; +				break; +			} +		} + +		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) +			break; + +		len -= bytes; +		offset = 0; +	} + +	if (ret) +		goto cleanup; + +	/* +	 * success +	 */ +	if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || +	    (map_data && map_data->from_user)) { +		ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); +		if (ret) +			goto cleanup; +	} + +	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); +	return bio; +cleanup: +	if (!map_data) +		bio_for_each_segment_all(bvec, bio, i) +			__free_page(bvec->bv_page); + +	bio_put(bio); +out_bmd: +	kfree(bmd); +	return ERR_PTR(ret); +} + +/** + *	bio_copy_user	-	copy user data to bio + *	@q: destination block queue + *	@map_data: pointer to the rq_map_data holding pages (if necessary) + *	@uaddr: start of user address + *	@len: length in bytes + *	@write_to_vm: bool indicating writing to pages or not + *	@gfp_mask: memory allocation flags + * + *	Prepares and returns a bio for indirect user io, bouncing data + *	to/from kernel pages as necessary. 
Must be paired with + *	call bio_uncopy_user() on io completion. + */ +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, +			  unsigned long uaddr, unsigned int len, +			  int write_to_vm, gfp_t gfp_mask) +{ +	struct sg_iovec iov; + +	iov.iov_base = (void __user *)uaddr; +	iov.iov_len = len; + +	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_copy_user); + +static struct bio *__bio_map_user_iov(struct request_queue *q, +				      struct block_device *bdev, +				      const struct sg_iovec *iov, int iov_count, +				      int write_to_vm, gfp_t gfp_mask) +{ +	int i, j; +	int nr_pages = 0; +	struct page **pages; +	struct bio *bio; +	int cur_page = 0; +	int ret, offset; + +	for (i = 0; i < iov_count; i++) { +		unsigned long uaddr = (unsigned long)iov[i].iov_base; +		unsigned long len = iov[i].iov_len; +		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; +		unsigned long start = uaddr >> PAGE_SHIFT; + +		/* +		 * Overflow, abort +		 */ +		if (end < start) +			return ERR_PTR(-EINVAL); + +		nr_pages += end - start; +		/* +		 * buffer must be aligned to at least hardsector size for now +		 */ +		if (uaddr & queue_dma_alignment(q)) +			return ERR_PTR(-EINVAL); +	} + +	if (!nr_pages) +		return ERR_PTR(-EINVAL); + +	bio = bio_kmalloc(gfp_mask, nr_pages); +	if (!bio) +		return ERR_PTR(-ENOMEM); + +	ret = -ENOMEM; +	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); +	if (!pages) +		goto out; + +	for (i = 0; i < iov_count; i++) { +		unsigned long uaddr = (unsigned long)iov[i].iov_base; +		unsigned long len = iov[i].iov_len; +		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; +		unsigned long start = uaddr >> PAGE_SHIFT; +		const int local_nr_pages = end - start; +		const int page_limit = cur_page + local_nr_pages; + +		ret = get_user_pages_fast(uaddr, local_nr_pages, +				write_to_vm, &pages[cur_page]); +		if (ret < local_nr_pages) { +			ret = -EFAULT; +			goto out_unmap; +		} + +		offset = uaddr & ~PAGE_MASK; +		for (j = cur_page; j < page_limit; j++) { +			unsigned int bytes = PAGE_SIZE - offset; + +			if (len <= 0) +				break; +			 +			if (bytes > len) +				bytes = len; + +			/* +			 * sorry... +			 */ +			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < +					    bytes) +				break; + +			len -= bytes; +			offset = 0; +		} + +		cur_page = j; +		/* +		 * release the pages we didn't map into the bio, if any +		 */ +		while (j < page_limit) +			page_cache_release(pages[j++]); +	} + +	kfree(pages); + +	/* +	 * set data direction, and check if mapped pages need bouncing +	 */ +	if (!write_to_vm) +		bio->bi_rw |= REQ_WRITE; + +	bio->bi_bdev = bdev; +	bio->bi_flags |= (1 << BIO_USER_MAPPED); +	return bio; + + out_unmap: +	for (i = 0; i < nr_pages; i++) { +		if(!pages[i]) +			break; +		page_cache_release(pages[i]); +	} + out: +	kfree(pages); +	bio_put(bio); +	return ERR_PTR(ret); +} + +/** + *	bio_map_user	-	map user address into bio + *	@q: the struct request_queue for the bio + *	@bdev: destination block device + *	@uaddr: start of user address + *	@len: length in bytes + *	@write_to_vm: bool indicating writing to pages or not + *	@gfp_mask: memory allocation flags + * + *	Map the user space address into a bio suitable for io to a block + *	device. Returns an error pointer in case of error. 
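+ *
+ *	Direct-IO style sketch (placeholders: @uaddr and @count come from
+ *	userspace, @q and @bdev from the target device; the 1 means the
+ *	device read lands in the user pages):
+ *
+ *	struct bio *bio = bio_map_user(q, bdev, uaddr, count, 1, GFP_KERNEL);
+ *
+ *	if (IS_ERR(bio))
+ *		return PTR_ERR(bio);
+ *	... submit the bio and wait for completion ...
+ *	bio_unmap_user(bio);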
+ */ +struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, +			 unsigned long uaddr, unsigned int len, int write_to_vm, +			 gfp_t gfp_mask) +{ +	struct sg_iovec iov; + +	iov.iov_base = (void __user *)uaddr; +	iov.iov_len = len; + +	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_map_user); + +/** + *	bio_map_user_iov - map user sg_iovec table into bio + *	@q: the struct request_queue for the bio + *	@bdev: destination block device + *	@iov:	the iovec. + *	@iov_count: number of elements in the iovec + *	@write_to_vm: bool indicating writing to pages or not + *	@gfp_mask: memory allocation flags + * + *	Map the user space address into a bio suitable for io to a block + *	device. Returns an error pointer in case of error. + */ +struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, +			     const struct sg_iovec *iov, int iov_count, +			     int write_to_vm, gfp_t gfp_mask) +{ +	struct bio *bio; + +	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, +				 gfp_mask); +	if (IS_ERR(bio)) +		return bio; + +	/* +	 * subtle -- if __bio_map_user() ended up bouncing a bio, +	 * it would normally disappear when its bi_end_io is run. +	 * however, we need it for the unmap, so grab an extra +	 * reference to it +	 */ +	bio_get(bio); + +	return bio; +} + +static void __bio_unmap_user(struct bio *bio) +{ +	struct bio_vec *bvec; +	int i; + +	/* +	 * make sure we dirty pages we wrote to +	 */ +	bio_for_each_segment_all(bvec, bio, i) { +		if (bio_data_dir(bio) == READ) +			set_page_dirty_lock(bvec->bv_page); + +		page_cache_release(bvec->bv_page); +	} + +	bio_put(bio); +} + +/** + *	bio_unmap_user	-	unmap a bio + *	@bio:		the bio being unmapped + * + *	Unmap a bio previously mapped by bio_map_user(). Must be called with + *	a process context. + * + *	bio_unmap_user() may sleep. + */ +void bio_unmap_user(struct bio *bio) +{ +	__bio_unmap_user(bio); +	bio_put(bio); +} +EXPORT_SYMBOL(bio_unmap_user); + +static void bio_map_kern_endio(struct bio *bio, int err) +{ +	bio_put(bio); +} + +static struct bio *__bio_map_kern(struct request_queue *q, void *data, +				  unsigned int len, gfp_t gfp_mask) +{ +	unsigned long kaddr = (unsigned long)data; +	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; +	unsigned long start = kaddr >> PAGE_SHIFT; +	const int nr_pages = end - start; +	int offset, i; +	struct bio *bio; + +	bio = bio_kmalloc(gfp_mask, nr_pages); +	if (!bio) +		return ERR_PTR(-ENOMEM); + +	offset = offset_in_page(kaddr); +	for (i = 0; i < nr_pages; i++) { +		unsigned int bytes = PAGE_SIZE - offset; + +		if (len <= 0) +			break; + +		if (bytes > len) +			bytes = len; + +		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, +				    offset) < bytes) +			break; + +		data += bytes; +		len -= bytes; +		offset = 0; +	} + +	bio->bi_end_io = bio_map_kern_endio; +	return bio; +} + +/** + *	bio_map_kern	-	map kernel address into bio + *	@q: the struct request_queue for the bio + *	@data: pointer to buffer to map + *	@len: length in bytes + *	@gfp_mask: allocation flags for bio allocation + * + *	Map the kernel address into a bio suitable for io to a block + *	device. Returns an error pointer in case of error. 
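A sketch of the kernel-buffer variant documented above — illustrative, not part of the patch. It assumes the buffer lives in the kernel direct mapping; for buffers that cannot be mapped that way, bio_copy_kern() further below bounces the data instead.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustrative only: wrap a lowmem kernel buffer in a bio. */
static struct bio *example_map_kern(struct request_queue *q, void *buf,
				    unsigned int len)
{
	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);

	/* partial mappings are rejected with -EINVAL, see bio_map_kern() */
	if (IS_ERR(bio))
		return bio;

	/* the caller typically hands the bio to blk_make_request() next */
	return bio;
}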
+ */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, +			 gfp_t gfp_mask) +{ +	struct bio *bio; + +	bio = __bio_map_kern(q, data, len, gfp_mask); +	if (IS_ERR(bio)) +		return bio; + +	if (bio->bi_iter.bi_size == len) +		return bio; + +	/* +	 * Don't support partial mappings. +	 */ +	bio_put(bio); +	return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL(bio_map_kern); + +static void bio_copy_kern_endio(struct bio *bio, int err) +{ +	struct bio_vec *bvec; +	const int read = bio_data_dir(bio) == READ; +	struct bio_map_data *bmd = bio->bi_private; +	int i; +	char *p = bmd->sgvecs[0].iov_base; + +	bio_for_each_segment_all(bvec, bio, i) { +		char *addr = page_address(bvec->bv_page); + +		if (read) +			memcpy(p, addr, bvec->bv_len); + +		__free_page(bvec->bv_page); +		p += bvec->bv_len; +	} + +	kfree(bmd); +	bio_put(bio); +} + +/** + *	bio_copy_kern	-	copy kernel address into bio + *	@q: the struct request_queue for the bio + *	@data: pointer to buffer to copy + *	@len: length in bytes + *	@gfp_mask: allocation flags for bio and page allocation + *	@reading: data direction is READ + * + *	copy the kernel address into a bio suitable for io to a block + *	device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, +			  gfp_t gfp_mask, int reading) +{ +	struct bio *bio; +	struct bio_vec *bvec; +	int i; + +	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); +	if (IS_ERR(bio)) +		return bio; + +	if (!reading) { +		void *p = data; + +		bio_for_each_segment_all(bvec, bio, i) { +			char *addr = page_address(bvec->bv_page); + +			memcpy(addr, p, bvec->bv_len); +			p += bvec->bv_len; +		} +	} + +	bio->bi_end_io = bio_copy_kern_endio; + +	return bio; +} +EXPORT_SYMBOL(bio_copy_kern); + +/* + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions + * for performing direct-IO in BIOs. + * + * The problem is that we cannot run set_page_dirty() from interrupt context + * because the required locks are not interrupt-safe.  So what we can do is to + * mark the pages dirty _before_ performing IO.  And in interrupt context, + * check that the pages are still dirty.   If so, fine.  If not, redirty them + * in process context. + * + * We special-case compound pages here: normally this means reads into hugetlb + * pages.  The logic in here doesn't really work right for compound pages + * because the VM does not uniformly chase down the head page in all cases. + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't + * handle them at all.  So we skip compound pages here at an early stage. + * + * Note that this code is very hard to test under normal circumstances because + * direct-io pins the pages with get_user_pages().  This makes + * is_page_cache_freeable return false, and the VM will not clean the pages. + * But other code (eg, flusher threads) could clean the pages if they are mapped + * pagecache. + * + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the + * deferred bio dirtying paths. + */ + +/* + * bio_set_pages_dirty() will mark all the bio's pages as dirty. 
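The comment above describes a two-step protocol; the sketch below shows its shape for a direct-IO read. It is illustrative only (not part of the patch) and assumes the bio already carries the pinned user pages.

/* Illustrative only: completion handler, may run in interrupt context. */
static void example_dio_read_endio(struct bio *bio, int error)
{
	/* re-dirties any cleaned pages from process context and wholly owns
	 * the bio: it drops the page references and the final bio reference */
	bio_check_pages_dirty(bio);
}

static void example_submit_dio_read(struct bio *bio)
{
	bio->bi_end_io = example_dio_read_endio;
	bio_set_pages_dirty(bio);	/* process context, before the I/O starts */
	submit_bio(READ, bio);
}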
+ */
+void bio_set_pages_dirty(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		if (page && !PageCompound(page))
+			set_page_dirty_lock(page);
+	}
+}
+
+static void bio_release_pages(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		if (page)
+			put_page(page);
+	}
+}
+
+/*
+ * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
+ * If they are, then fine.  If, however, some pages are clean then they must
+ * have been written out during the direct-IO read.  So we take another ref on
+ * the BIO and the offending pages and re-dirty the pages in process context.
+ *
+ * It is expected that bio_check_pages_dirty() will wholly own the BIO from
+ * here on.  It will run one page_cache_release() against each page and will
+ * run one bio_put() against the BIO.
+ */
+
+static void bio_dirty_fn(struct work_struct *work);
+
+static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
+static DEFINE_SPINLOCK(bio_dirty_lock);
+static struct bio *bio_dirty_list;
+
+/*
+ * This runs in process context
+ */
+static void bio_dirty_fn(struct work_struct *work)
+{
+	unsigned long flags;
+	struct bio *bio;
+
+	spin_lock_irqsave(&bio_dirty_lock, flags);
+	bio = bio_dirty_list;
+	bio_dirty_list = NULL;
+	spin_unlock_irqrestore(&bio_dirty_lock, flags);
+
+	while (bio) {
+		struct bio *next = bio->bi_private;
+
+		bio_set_pages_dirty(bio);
+		bio_release_pages(bio);
+		bio_put(bio);
+		bio = next;
+	}
+}
+
+void bio_check_pages_dirty(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int nr_clean_pages = 0;
+	int i;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		if (PageDirty(page) || PageCompound(page)) {
+			page_cache_release(page);
+			bvec->bv_page = NULL;
+		} else {
+			nr_clean_pages++;
+		}
+	}
+
+	if (nr_clean_pages) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&bio_dirty_lock, flags);
+		bio->bi_private = bio_dirty_list;
+		bio_dirty_list = bio;
+		spin_unlock_irqrestore(&bio_dirty_lock, flags);
+		schedule_work(&bio_dirty_work);
+	} else {
+		bio_put(bio);
+	}
+}
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void bio_flush_dcache_pages(struct bio *bi)
+{
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	bio_for_each_segment(bvec, bi, iter)
+		flush_dcache_page(bvec.bv_page);
+}
+EXPORT_SYMBOL(bio_flush_dcache_pages);
+#endif
+
+/**
+ * bio_endio - end I/O on a bio
+ * @bio:	bio
+ * @error:	error, if any
+ *
+ * Description:
+ *   bio_endio() will end I/O on the whole bio. bio_endio() is the
+ *   preferred way to end I/O on a bio, it takes care of clearing
+ *   BIO_UPTODATE on error. @error is 0 on success, and one of the
+ *   established -Exxxx (-EIO, for instance) error values in case
+ *   something went wrong. No one should call bi_end_io() directly on a
+ *   bio unless they own it and thus know that it has an end_io
+ *   function.
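As a small illustration of the rule above — only the owner of a bio calls its bi_end_io(), everyone else goes through bio_endio() — a stacking driver's completion for a privately cloned bio might look like the sketch below. It is not part of the patch; the clone's bi_private is assumed to have been pointed at the original bio when the clone was made.

/* Illustrative only: complete a private clone and propagate the result. */
static void example_clone_endio(struct bio *clone, int error)
{
	struct bio *orig = clone->bi_private;	/* set up at clone time */

	bio_put(clone);				/* the clone belongs to us */
	bio_endio(orig, error);			/* clears BIO_UPTODATE on error */
}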
+ **/
+void bio_endio(struct bio *bio, int error)
+{
+	while (bio) {
+		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
+
+		if (error)
+			clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			error = -EIO;
+
+		if (!atomic_dec_and_test(&bio->bi_remaining))
+			return;
+
+		/*
+		 * Need to have a real endio function for chained bios,
+		 * otherwise various corner cases will break (like stacking
+		 * block devices that save/restore bi_end_io) - however, we want
+		 * to avoid unbounded recursion and blowing the stack. Tail call
+		 * optimization would handle this, but compiling with frame
+		 * pointers also disables gcc's sibling call optimization.
+		 */
+		if (bio->bi_end_io == bio_chain_endio) {
+			struct bio *parent = bio->bi_private;
+			bio_put(bio);
+			bio = parent;
+		} else {
+			if (bio->bi_end_io)
+				bio->bi_end_io(bio, error);
+			bio = NULL;
+		}
+	}
+}
+EXPORT_SYMBOL(bio_endio);
+
+/**
+ * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
+ * @bio:	bio
+ * @error:	error, if any
+ *
+ * For code that has saved and restored bi_end_io; think hard before using this
+ * function, probably you should've cloned the entire bio.
+ **/
+void bio_endio_nodec(struct bio *bio, int error)
+{
+	atomic_inc(&bio->bi_remaining);
+	bio_endio(bio, error);
+}
+EXPORT_SYMBOL(bio_endio_nodec);
+
+/**
+ * bio_split - split a bio
+ * @bio:	bio to split
+ * @sectors:	number of sectors to split from the front of @bio
+ * @gfp:	gfp mask
+ * @bs:		bio set to allocate from
+ *
+ * Allocates and returns a new bio which represents @sectors from the start of
+ * @bio, and updates @bio to represent the remaining sectors.
+ *
+ * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
+ * responsibility to ensure that @bio is not freed before the split.
+ */
+struct bio *bio_split(struct bio *bio, int sectors,
+		      gfp_t gfp, struct bio_set *bs)
+{
+	struct bio *split = NULL;
+
+	BUG_ON(sectors <= 0);
+	BUG_ON(sectors >= bio_sectors(bio));
+
+	split = bio_clone_fast(bio, gfp, bs);
+	if (!split)
+		return NULL;
+
+	split->bi_iter.bi_size = sectors << 9;
+
+	if (bio_integrity(split))
+		bio_integrity_trim(split, 0, sectors);
+
+	bio_advance(bio, split->bi_iter.bi_size);
+
+	return split;
+}
+EXPORT_SYMBOL(bio_split);
+
+/**
+ * bio_trim - trim a bio
+ * @bio:	bio to trim
+ * @offset:	number of sectors to trim from the front of @bio
+ * @size:	size we want to trim @bio to, in sectors
+ */
+void bio_trim(struct bio *bio, int offset, int size)
+{
+	/* 'bio' is a cloned bio which we need to trim to match
+	 * the given offset and size.
+	 */
+
+	size <<= 9;
+	if (offset == 0 && size == bio->bi_iter.bi_size)
+		return;
+
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+	bio_advance(bio, offset << 9);
+
+	bio->bi_iter.bi_size = size;
+}
+EXPORT_SYMBOL_GPL(bio_trim);
+
+/*
+ * create memory pools for biovec's in a bio_set.
+ * use the global biovec slabs created for general use.
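A sketch of how a driver might use bio_split() from just above together with bio_chain(), which is defined earlier in this file outside this excerpt — illustrative only, with max_sectors and the bio_set assumed to come from the caller.

/* Illustrative only: split the front off an oversized bio and submit both. */
static void example_split_and_submit(struct bio *bio, int max_sectors,
				     struct bio_set *bs)
{
	if (bio_sectors(bio) > max_sectors) {
		/* a mempool-backed bio_set makes this allocation reliable */
		struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, bs);

		bio_chain(split, bio);		/* bio now completes after split */
		generic_make_request(bio);	/* requeue the remainder */
		bio = split;
	}

	generic_make_request(bio);
}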
+ */ +mempool_t *biovec_create_pool(int pool_entries) +{ +	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; + +	return mempool_create_slab_pool(pool_entries, bp->slab); +} + +void bioset_free(struct bio_set *bs) +{ +	if (bs->rescue_workqueue) +		destroy_workqueue(bs->rescue_workqueue); + +	if (bs->bio_pool) +		mempool_destroy(bs->bio_pool); + +	if (bs->bvec_pool) +		mempool_destroy(bs->bvec_pool); + +	bioset_integrity_free(bs); +	bio_put_slab(bs); + +	kfree(bs); +} +EXPORT_SYMBOL(bioset_free); + +/** + * bioset_create  - Create a bio_set + * @pool_size:	Number of bio and bio_vecs to cache in the mempool + * @front_pad:	Number of bytes to allocate in front of the returned bio + * + * Description: + *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + *    to ask for a number of bytes to be allocated in front of the bio. + *    Front pad allocation is useful for embedding the bio inside + *    another structure, to avoid allocating extra data to go with the bio. + *    Note that the bio must be embedded at the END of that structure always, + *    or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +{ +	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); +	struct bio_set *bs; + +	bs = kzalloc(sizeof(*bs), GFP_KERNEL); +	if (!bs) +		return NULL; + +	bs->front_pad = front_pad; + +	spin_lock_init(&bs->rescue_lock); +	bio_list_init(&bs->rescue_list); +	INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + +	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); +	if (!bs->bio_slab) { +		kfree(bs); +		return NULL; +	} + +	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); +	if (!bs->bio_pool) +		goto bad; + +	bs->bvec_pool = biovec_create_pool(pool_size); +	if (!bs->bvec_pool) +		goto bad; + +	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); +	if (!bs->rescue_workqueue) +		goto bad; + +	return bs; +bad: +	bioset_free(bs); +	return NULL; +} +EXPORT_SYMBOL(bioset_create); + +#ifdef CONFIG_BLK_CGROUP +/** + * bio_associate_current - associate a bio with %current + * @bio: target bio + * + * Associate @bio with %current if it hasn't been associated yet.  Block + * layer will treat @bio as if it were issued by %current no matter which + * task actually issues it. + * + * This function takes an extra reference of @task's io_context and blkcg + * which will be put when @bio is released.  The caller must own @bio, + * ensure %current->io_context exists, and is responsible for synchronizing + * calls to this function. 
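The front_pad mechanism described above is what lets a driver embed the bio at the tail of its own per-I/O structure. A minimal sketch follows (not part of the patch; struct example_io and example_bio_set are made-up names).

struct example_io {			/* hypothetical per-I/O driver state */
	void		*private;
	struct bio	bio;		/* must be the last member */
};

static struct bio_set *example_bio_set;

static int example_init(void)
{
	example_bio_set = bioset_create(64 /* cached bios */,
					offsetof(struct example_io, bio));
	return example_bio_set ? 0 : -ENOMEM;
}

static struct example_io *example_io_alloc(unsigned int nr_vecs)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, example_bio_set);

	return bio ? container_of(bio, struct example_io, bio) : NULL;
}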
+ */ +int bio_associate_current(struct bio *bio) +{ +	struct io_context *ioc; +	struct cgroup_subsys_state *css; + +	if (bio->bi_ioc) +		return -EBUSY; + +	ioc = current->io_context; +	if (!ioc) +		return -ENOENT; + +	/* acquire active ref on @ioc and associate */ +	get_io_context_active(ioc); +	bio->bi_ioc = ioc; + +	/* associate blkcg if exists */ +	rcu_read_lock(); +	css = task_css(current, blkio_cgrp_id); +	if (css && css_tryget_online(css)) +		bio->bi_css = css; +	rcu_read_unlock(); + +	return 0; +} + +/** + * bio_disassociate_task - undo bio_associate_current() + * @bio: target bio + */ +void bio_disassociate_task(struct bio *bio) +{ +	if (bio->bi_ioc) { +		put_io_context(bio->bi_ioc); +		bio->bi_ioc = NULL; +	} +	if (bio->bi_css) { +		css_put(bio->bi_css); +		bio->bi_css = NULL; +	} +} + +#endif /* CONFIG_BLK_CGROUP */ + +static void __init biovec_init_slabs(void) +{ +	int i; + +	for (i = 0; i < BIOVEC_NR_POOLS; i++) { +		int size; +		struct biovec_slab *bvs = bvec_slabs + i; + +		if (bvs->nr_vecs <= BIO_INLINE_VECS) { +			bvs->slab = NULL; +			continue; +		} + +		size = bvs->nr_vecs * sizeof(struct bio_vec); +		bvs->slab = kmem_cache_create(bvs->name, size, 0, +                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +	} +} + +static int __init init_bio(void) +{ +	bio_slab_max = 2; +	bio_slab_nr = 0; +	bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); +	if (!bio_slabs) +		panic("bio: can't allocate bios\n"); + +	bio_integrity_init(); +	biovec_init_slabs(); + +	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); +	if (!fs_bio_set) +		panic("bio: can't allocate bios\n"); + +	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) +		panic("bio: can't create integrity pool\n"); + +	return 0; +} +subsys_initcall(init_bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e90c7c164c8..28d227c5ca7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -80,7 +80,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,  	blkg->q = q;  	INIT_LIST_HEAD(&blkg->q_node);  	blkg->blkcg = blkcg; -	blkg->refcnt = 1; +	atomic_set(&blkg->refcnt, 1);  	/* root blkg uses @q->root_rl, init rl only for !root blkgs */  	if (blkcg != &blkcg_root) { @@ -185,7 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,  	lockdep_assert_held(q->queue_lock);  	/* blkg holds a reference to blkcg */ -	if (!css_tryget(&blkcg->css)) { +	if (!css_tryget_online(&blkcg->css)) {  		ret = -EINVAL;  		goto err_free_blkg;  	} @@ -235,8 +235,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,  	blkg->online = true;  	spin_unlock(&blkcg->lock); -	if (!ret) +	if (!ret) { +		if (blkcg == &blkcg_root) { +			q->root_blkg = blkg; +			q->root_rl.blkg = blkg; +		}  		return blkg; +	}  	/* @blkg failed fully initialized, use the usual release path */  	blkg_put(blkg); @@ -331,10 +336,19 @@ static void blkg_destroy(struct blkcg_gq *blkg)  	 * under queue_lock.  If it's not pointing to @blkg now, it never  	 * will.  Hint assignment itself can race safely.  	 */ -	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) +	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)  		rcu_assign_pointer(blkcg->blkg_hint, NULL);  	/* +	 * If root blkg is destroyed.  Just clear the pointer since root_rl +	 * does not take reference on root blkg. 
+	 */ +	if (blkcg == &blkcg_root) { +		blkg->q->root_blkg = NULL; +		blkg->q->root_rl.blkg = NULL; +	} + +	/*  	 * Put the reference taken at the time of creation so that when all  	 * queues are gone, group can be destroyed.  	 */ @@ -360,13 +374,6 @@ static void blkg_destroy_all(struct request_queue *q)  		blkg_destroy(blkg);  		spin_unlock(&blkcg->lock);  	} - -	/* -	 * root blkg is destroyed.  Just clear the pointer since -	 * root_rl does not take reference on root blkg. -	 */ -	q->root_blkg = NULL; -	q->root_rl.blkg = NULL;  }  /* @@ -392,11 +399,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)  	/* release the blkcg and parent blkg refs this blkg has been holding */  	css_put(&blkg->blkcg->css); -	if (blkg->parent) { -		spin_lock_irq(blkg->q->queue_lock); +	if (blkg->parent)  		blkg_put(blkg->parent); -		spin_unlock_irq(blkg->q->queue_lock); -	}  	blkg_free(blkg);  } @@ -444,7 +448,20 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,  	struct blkcg_gq *blkg;  	int i; -	mutex_lock(&blkcg_pol_mutex); +	/* +	 * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex +	 * which ends up putting cgroup's internal cgroup_tree_mutex under +	 * it; however, cgroup_tree_mutex is nested above cgroup file +	 * active protection and grabbing blkcg_pol_mutex from a cgroup +	 * file operation creates a possible circular dependency.  cgroup +	 * internal locking is planned to go through further simplification +	 * and this issue should go away soon.  For now, let's trylock +	 * blkcg_pol_mutex and restart the write on failure. +	 * +	 * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com +	 */ +	if (!mutex_trylock(&blkcg_pol_mutex)) +		return restart_syscall();  	spin_lock_irq(&blkcg->lock);  	/* @@ -855,6 +872,13 @@ void blkcg_drain_queue(struct request_queue *q)  {  	lockdep_assert_held(q->queue_lock); +	/* +	 * @q could be exiting and already have destroyed all blkgs as +	 * indicated by NULL root_blkg.  If so, don't confuse policies. 
+	 */ +	if (!q->root_blkg) +		return; +  	blk_throtl_drain(q);  } @@ -887,7 +911,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,  	int ret = 0;  	/* task_lock() is needed to avoid races with exit_io_context() */ -	cgroup_taskset_for_each(task, css, tset) { +	cgroup_taskset_for_each(task, tset) {  		task_lock(task);  		ioc = task->io_context;  		if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -899,17 +923,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,  	return ret;  } -struct cgroup_subsys blkio_subsys = { -	.name = "blkio", +struct cgroup_subsys blkio_cgrp_subsys = {  	.css_alloc = blkcg_css_alloc,  	.css_offline = blkcg_css_offline,  	.css_free = blkcg_css_free,  	.can_attach = blkcg_can_attach, -	.subsys_id = blkio_subsys_id,  	.base_cftypes = blkcg_files, -	.module = THIS_MODULE,  }; -EXPORT_SYMBOL_GPL(blkio_subsys); +EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);  /**   * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -970,8 +991,6 @@ int blkcg_activate_policy(struct request_queue *q,  		ret = PTR_ERR(blkg);  		goto out_unlock;  	} -	q->root_blkg = blkg; -	q->root_rl.blkg = blkg;  	list_for_each_entry(blkg, &q->blkg_list, q_node)  		cnt++; @@ -1101,7 +1120,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)  	/* everything is in place, add intf files for the new policy */  	if (pol->cftypes) -		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); +		WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));  	ret = 0;  out_unlock:  	mutex_unlock(&blkcg_pol_mutex); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index ae6969a7ffd..d3fd7aa3d2a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -18,6 +18,7 @@  #include <linux/seq_file.h>  #include <linux/radix-tree.h>  #include <linux/blkdev.h> +#include <linux/atomic.h>  /* Max limits for throttle policy */  #define THROTL_IOPS_MAX		UINT_MAX @@ -104,7 +105,7 @@ struct blkcg_gq {  	struct request_list		rl;  	/* reference count */ -	int				refcnt; +	atomic_t			refcnt;  	/* is this blkg online? protected by both blkcg and q locks */  	bool				online; @@ -186,7 +187,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)  static inline struct blkcg *task_blkcg(struct task_struct *tsk)  { -	return css_to_blkcg(task_css(tsk, blkio_subsys_id)); +	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));  }  static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -204,7 +205,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)   */  static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)  { -	return css_to_blkcg(css_parent(&blkcg->css)); +	return css_to_blkcg(blkcg->css.parent);  }  /** @@ -241,25 +242,28 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)   */  static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)  { -	int ret; +	char *p; -	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -	if (ret) +	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); +	if (!p) {  		strncpy(buf, "<unavailable>", buflen); -	return ret; +		return -ENAMETOOLONG; +	} + +	memmove(buf, p, buf + buflen - p); +	return 0;  }  /**   * blkg_get - get a blkg reference   * @blkg: blkg to get   * - * The caller should be holding queue_lock and an existing reference. + * The caller should be holding an existing reference.   
*/  static inline void blkg_get(struct blkcg_gq *blkg)  { -	lockdep_assert_held(blkg->q->queue_lock); -	WARN_ON_ONCE(!blkg->refcnt); -	blkg->refcnt++; +	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); +	atomic_inc(&blkg->refcnt);  }  void __blkg_release_rcu(struct rcu_head *rcu); @@ -267,14 +271,11 @@ void __blkg_release_rcu(struct rcu_head *rcu);  /**   * blkg_put - put a blkg reference   * @blkg: blkg to put - * - * The caller should be holding queue_lock.   */  static inline void blkg_put(struct blkcg_gq *blkg)  { -	lockdep_assert_held(blkg->q->queue_lock); -	WARN_ON_ONCE(blkg->refcnt <= 0); -	if (!--blkg->refcnt) +	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); +	if (atomic_dec_and_test(&blkg->refcnt))  		call_rcu(&blkg->rcu_head, __blkg_release_rcu);  } @@ -402,6 +403,11 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,  #define blk_queue_for_each_rl(rl, q)	\  	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) +static inline void blkg_stat_init(struct blkg_stat *stat) +{ +	u64_stats_init(&stat->syncp); +} +  /**   * blkg_stat_add - add a value to a blkg_stat   * @stat: target blkg_stat @@ -430,9 +436,9 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)  	uint64_t v;  	do { -		start = u64_stats_fetch_begin(&stat->syncp); +		start = u64_stats_fetch_begin_irq(&stat->syncp);  		v = stat->cnt; -	} while (u64_stats_fetch_retry(&stat->syncp, start)); +	} while (u64_stats_fetch_retry_irq(&stat->syncp, start));  	return v;  } @@ -458,6 +464,11 @@ static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)  	blkg_stat_add(to, blkg_stat_read(from));  } +static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +{ +	u64_stats_init(&rwstat->syncp); +} +  /**   * blkg_rwstat_add - add a value to a blkg_rwstat   * @rwstat: target blkg_rwstat @@ -498,9 +509,9 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)  	struct blkg_rwstat tmp;  	do { -		start = u64_stats_fetch_begin(&rwstat->syncp); +		start = u64_stats_fetch_begin_irq(&rwstat->syncp);  		tmp = *rwstat; -	} while (u64_stats_fetch_retry(&rwstat->syncp, start)); +	} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));  	return tmp;  } diff --git a/block/blk-core.c b/block/blk-core.c index c0450535834..6f8dba161bf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@  #include <linux/backing-dev.h>  #include <linux/bio.h>  #include <linux/blkdev.h> +#include <linux/blk-mq.h>  #include <linux/highmem.h>  #include <linux/mm.h>  #include <linux/kernel_stat.h> @@ -37,10 +38,12 @@  #include "blk.h"  #include "blk-cgroup.h" +#include "blk-mq.h"  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);  EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);  DEFINE_IDA(blk_queue_ida); @@ -48,7 +51,7 @@ DEFINE_IDA(blk_queue_ida);  /*   * For the allocated request tables   */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL;  /*   * For queue allocation @@ -60,42 +63,6 @@ struct kmem_cache *blk_requestq_cachep;   */  static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ -	struct hd_struct *part; -	int rw = rq_data_dir(rq); -	int cpu; - -	if (!blk_do_io_stat(rq)) -		return; - -	cpu = part_stat_lock(); - -	if (!new_io) { -		part = rq->part; -		part_stat_inc(cpu, part, merges[rw]); -	} else { -		part = 
disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); -		if (!hd_struct_try_get(part)) { -			/* -			 * The partition is already being removed, -			 * the request will be accounted on the disk only -			 * -			 * We take a reference on disk->part0 although that -			 * partition will never be deleted, so we can treat -			 * it as any other partition. -			 */ -			part = &rq->rq_disk->part0; -			hd_struct_get(part); -		} -		part_round_stats(cpu, part); -		part_inc_in_flight(part, rw); -		rq->part = part; -	} - -	part_stat_unlock(); -} -  void blk_queue_congestion_threshold(struct request_queue *q)  {  	int nr; @@ -145,7 +112,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)  	rq->cmd = rq->__cmd;  	rq->cmd_len = BLK_MAX_CDB;  	rq->tag = -1; -	rq->ref_count = 1;  	rq->start_time = jiffies;  	set_start_time_ns(rq);  	rq->part = NULL; @@ -166,7 +132,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,  	bio_advance(bio, nbytes);  	/* don't actually finish bio if it's part of flush sequence */ -	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) +	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))  		bio_endio(bio, error);  } @@ -174,15 +140,15 @@ void blk_dump_rq_flags(struct request *rq, char *msg)  {  	int bit; -	printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, +	printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,  		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, -		rq->cmd_flags); +		(unsigned long long) rq->cmd_flags);  	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",  	       (unsigned long long)blk_rq_pos(rq),  	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); -	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n", -	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); +	printk(KERN_INFO "  bio %p, biotail %p, len %u\n", +	       rq->bio, rq->biotail, blk_rq_bytes(rq));  	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {  		printk(KERN_INFO "  cdb: "); @@ -281,7 +247,18 @@ EXPORT_SYMBOL(blk_stop_queue);  void blk_sync_queue(struct request_queue *q)  {  	del_timer_sync(&q->timeout); -	cancel_delayed_work_sync(&q->delay_work); + +	if (q->mq_ops) { +		struct blk_mq_hw_ctx *hctx; +		int i; + +		queue_for_each_hw_ctx(q, hctx, i) { +			cancel_delayed_work_sync(&hctx->run_work); +			cancel_delayed_work_sync(&hctx->delay_work); +		} +	} else { +		cancel_delayed_work_sync(&q->delay_work); +	}  }  EXPORT_SYMBOL(blk_sync_queue); @@ -533,8 +510,13 @@ void blk_cleanup_queue(struct request_queue *q)  	 * Drain all requests queued before DYING marking. Set DEAD flag to  	 * prevent that q->request_fn() gets invoked after draining finished.  	 
*/ -	spin_lock_irq(lock); -	__blk_drain_queue(q, true); +	if (q->mq_ops) { +		blk_mq_drain_queue(q); +		spin_lock_irq(lock); +	} else { +		spin_lock_irq(lock); +		__blk_drain_queue(q, true); +	}  	queue_flag_set(QUEUE_FLAG_DEAD, q);  	spin_unlock_irq(lock); @@ -644,11 +626,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)  	q->bypass_depth = 1;  	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); +	init_waitqueue_head(&q->mq_freeze_wq); +  	if (blkcg_init_queue(q)) -		goto fail_id; +		goto fail_bdi;  	return q; +fail_bdi: +	bdi_destroy(&q->backing_dev_info);  fail_id:  	ida_simple_remove(&blk_queue_ida, q->id);  fail_q: @@ -720,9 +706,13 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,  	if (!q)  		return NULL; -	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) +	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); +	if (!q->flush_rq)  		return NULL; +	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) +		goto fail; +  	q->request_fn		= rfn;  	q->prep_rq_fn		= NULL;  	q->unprep_rq_fn		= NULL; @@ -739,10 +729,22 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,  	q->sg_reserved_size = INT_MAX; +	/* Protect q->elevator from elevator_change */ +	mutex_lock(&q->sysfs_lock); +  	/* init elevator */ -	if (elevator_init(q, NULL)) -		return NULL; +	if (elevator_init(q, NULL)) { +		mutex_unlock(&q->sysfs_lock); +		goto fail; +	} + +	mutex_unlock(&q->sysfs_lock); +  	return q; + +fail: +	kfree(q->flush_rq); +	return NULL;  }  EXPORT_SYMBOL(blk_init_allocated_queue); @@ -842,6 +844,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)  		__freed_request(rl, sync ^ 1);  } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ +	struct request_list *rl; + +	spin_lock_irq(q->queue_lock); +	q->nr_requests = nr; +	blk_queue_congestion_threshold(q); + +	/* congestion isn't cgroup aware and follows root blkcg for now */ +	rl = &q->root_rl; + +	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) +		blk_set_queue_congested(q, BLK_RW_SYNC); +	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) +		blk_clear_queue_congested(q, BLK_RW_SYNC); + +	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) +		blk_set_queue_congested(q, BLK_RW_ASYNC); +	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) +		blk_clear_queue_congested(q, BLK_RW_ASYNC); + +	blk_queue_for_each_rl(rl, q) { +		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { +			blk_set_rl_full(rl, BLK_RW_SYNC); +		} else { +			blk_clear_rl_full(rl, BLK_RW_SYNC); +			wake_up(&rl->wait[BLK_RW_SYNC]); +		} + +		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { +			blk_set_rl_full(rl, BLK_RW_ASYNC); +		} else { +			blk_clear_rl_full(rl, BLK_RW_ASYNC); +			wake_up(&rl->wait[BLK_RW_ASYNC]); +		} +	} + +	spin_unlock_irq(q->queue_lock); +	return 0; +} +  /*   * Determine if elevator data should be initialized when allocating the   * request associated with @bio. 
@@ -1109,7 +1152,8 @@ retry:
 	goto retry;
 }
 
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+static struct request *blk_old_get_request(struct request_queue *q, int rw,
+		gfp_t gfp_mask)
 {
 	struct request *rq;
 
@@ -1126,6 +1170,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 
 	return rq;
 }
+
+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+{
+	if (q->mq_ops)
+		return blk_mq_alloc_request(q, rw, gfp_mask, false);
+	else
+		return blk_old_get_request(q, rw, gfp_mask);
+}
 EXPORT_SYMBOL(blk_get_request);
 
 /**
@@ -1167,6 +1219,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio,
 	if (unlikely(!rq))
 		return ERR_PTR(-ENOMEM);
 
+	blk_rq_set_block_pc(rq);
+
 	for_each_bio(bio) {
 		struct bio *bounce_bio = bio;
 		int ret;
@@ -1184,6 +1238,22 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio,
 EXPORT_SYMBOL(blk_make_request);
 
 /**
+ * blk_rq_set_block_pc - initialize a request to type BLOCK_PC
+ * @rq:		request to be initialized
+ *
+ */
+void blk_rq_set_block_pc(struct request *rq)
+{
+	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+	rq->__data_len = 0;
+	rq->__sector = (sector_t) -1;
+	rq->bio = rq->biotail = NULL;
+	memset(rq->__cmd, 0, sizeof(rq->__cmd));
+	rq->cmd = rq->__cmd;
+}
+EXPORT_SYMBOL(blk_rq_set_block_pc);
+
+/**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
  * @rq:		request to be inserted
@@ -1211,19 +1281,22 @@ EXPORT_SYMBOL(blk_requeue_request);
 static void add_acct_request(struct request_queue *q, struct request *rq,
 			     int where)
 {
-	drive_stat_acct(rq, 1);
+	blk_account_io_start(rq, true);
 	__elv_add_request(q, rq, where);
 }
 
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
+	int inflight;
+
 	if (now == part->stamp)
 		return;
 
-	if (part_in_flight(part)) {
+	inflight = part_in_flight(part);
+	if (inflight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part_in_flight(part) * (now - part->stamp));
+				inflight * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@@ -1272,8 +1345,11 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 {
 	if (unlikely(!q))
 		return;
-	if (unlikely(--req->ref_count))
+
+	if (q->mq_ops) {
+		blk_mq_free_request(req);
 		return;
+	}
 
 	blk_pm_put_request(req);
 
@@ -1291,7 +1367,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 		struct request_list *rl = blk_rq_rl(req);
 
 		BUG_ON(!list_empty(&req->queuelist));
-		BUG_ON(!hlist_unhashed(&req->hash));
+		BUG_ON(ELV_ON_HASH(req));
 
 		blk_free_request(rl, req);
 		freed_request(rl, flags);
@@ -1302,12 +1378,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
 
 void blk_put_request(struct request *req)
 {
-	unsigned long flags;
 	struct request_queue *q = req->q;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_put_request(q, req);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	if (q->mq_ops)
+		blk_mq_free_request(req);
+	else {
+		unsigned long flags;
+
+		spin_lock_irqsave(q->queue_lock, flags);
+		__blk_put_request(q, req);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+	}
 }
 EXPORT_SYMBOL(blk_put_request);
 
@@ -1333,18 +1414,17 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 	bio->bi_io_vec->bv_offset = 0;
 	bio->bi_io_vec->bv_len = len;
 
-	bio->bi_size = len;
+	bio->bi_iter.bi_size = len;
 	
bio->bi_vcnt = 1;  	bio->bi_phys_segments = 1;  	rq->__data_len = rq->resid_len = len;  	rq->nr_phys_segments = 1; -	rq->buffer = bio_data(bio);  }  EXPORT_SYMBOL_GPL(blk_add_request_payload); -static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, -				   struct bio *bio) +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, +			    struct bio *bio)  {  	const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1358,15 +1438,15 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,  	req->biotail->bi_next = bio;  	req->biotail = bio; -	req->__data_len += bio->bi_size; +	req->__data_len += bio->bi_iter.bi_size;  	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); -	drive_stat_acct(req, 0); +	blk_account_io_start(req, false);  	return true;  } -static bool bio_attempt_front_merge(struct request_queue *q, -				    struct request *req, struct bio *bio) +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, +			     struct bio *bio)  {  	const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1381,22 +1461,16 @@ static bool bio_attempt_front_merge(struct request_queue *q,  	bio->bi_next = req->bio;  	req->bio = bio; -	/* -	 * may not be valid. if the low level driver said -	 * it didn't need a bounce buffer then it better -	 * not touch req->buffer either... -	 */ -	req->buffer = bio_data(bio); -	req->__sector = bio->bi_sector; -	req->__data_len += bio->bi_size; +	req->__sector = bio->bi_iter.bi_sector; +	req->__data_len += bio->bi_iter.bi_size;  	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); -	drive_stat_acct(req, 0); +	blk_account_io_start(req, false);  	return true;  }  /** - * attempt_plug_merge - try to merge with %current's plugged list + * blk_attempt_plug_merge - try to merge with %current's plugged list   * @q: request_queue new bio is being queued at   * @bio: new bio being queued   * @request_count: out parameter for number of traversed plugged requests @@ -1411,20 +1485,28 @@ static bool bio_attempt_front_merge(struct request_queue *q,   * added on the elevator at this point.  In addition, we don't have   * reliable access to the elevator outside queue lock.  Only check basic   * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand.   */ -static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, -			       unsigned int *request_count) +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, +			    unsigned int *request_count)  {  	struct blk_plug *plug;  	struct request *rq;  	bool ret = false; +	struct list_head *plug_list;  	plug = current->plug;  	if (!plug)  		goto out;  	*request_count = 0; -	list_for_each_entry_reverse(rq, &plug->list, queuelist) { +	if (q->mq_ops) +		plug_list = &plug->mq_list; +	else +		plug_list = &plug->list; + +	list_for_each_entry_reverse(rq, plug_list, queuelist) {  		int el_ret;  		if (rq->q == q) @@ -1457,7 +1539,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)  		req->cmd_flags |= REQ_FAILFAST_MASK;  	req->errors = 0; -	req->__sector = bio->bi_sector; +	req->__sector = bio->bi_iter.bi_sector;  	req->ioprio = bio_prio(bio);  	blk_rq_bio_prep(req->q, req, bio);  } @@ -1492,7 +1574,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)  	 * Check if we can merge with the plugged list before grabbing  	 * any locks.  	 
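For the new blk_rq_set_block_pc() helper introduced above in this file's changes, a caller-side sketch — illustrative only; it assumes blk_get_request() still returns NULL on failure in this kernel, and the CDB contents are placeholders.

/* Illustrative only: prepare a SCSI passthrough (BLOCK_PC) request. */
static struct request *example_prep_pc_rq(struct request_queue *q, gfp_t gfp)
{
	struct request *rq;

	rq = blk_get_request(q, WRITE, gfp);
	if (!rq)
		return NULL;

	blk_rq_set_block_pc(rq);	/* replaces open-coded REQ_TYPE_BLOCK_PC setup */
	rq->cmd[0] = 0x00;		/* e.g. TEST UNIT READY */
	rq->cmd_len = 6;
	rq->timeout = 60 * HZ;

	return rq;
}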
*/ -	if (attempt_plug_merge(q, bio, &request_count)) +	if (!blk_queue_nomerges(q) && +	    blk_attempt_plug_merge(q, bio, &request_count))  		return;  	spin_lock_irq(q->queue_lock); @@ -1549,11 +1632,9 @@ get_rq:  	if (plug) {  		/*  		 * If this is the first request added after a plug, fire -		 * of a plug trace. If others have been added before, check -		 * if we have multiple devices in this plug. If so, make a -		 * note to sort the list before dispatch. +		 * of a plug trace.  		 */ -		if (list_empty(&plug->list)) +		if (!request_count)  			trace_block_plug(q);  		else {  			if (request_count >= BLK_MAX_REQUEST_COUNT) { @@ -1562,7 +1643,7 @@ get_rq:  			}  		}  		list_add_tail(&req->queuelist, &plug->list); -		drive_stat_acct(req, 1); +		blk_account_io_start(req, true);  	} else {  		spin_lock_irq(q->queue_lock);  		add_acct_request(q, req, where); @@ -1583,12 +1664,12 @@ static inline void blk_partition_remap(struct bio *bio)  	if (bio_sectors(bio) && bdev != bdev->bd_contains) {  		struct hd_struct *p = bdev->bd_part; -		bio->bi_sector += p->start_sect; +		bio->bi_iter.bi_sector += p->start_sect;  		bio->bi_bdev = bdev->bd_contains;  		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,  				      bdev->bd_dev, -				      bio->bi_sector - p->start_sect); +				      bio->bi_iter.bi_sector - p->start_sect);  	}  } @@ -1626,7 +1707,7 @@ static int __init fail_make_request_debugfs(void)  	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",  						NULL, &fail_make_request); -	return IS_ERR(dir) ? PTR_ERR(dir) : 0; +	return PTR_ERR_OR_ZERO(dir);  }  late_initcall(fail_make_request_debugfs); @@ -1654,7 +1735,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)  	/* Test device or partition size, when known. */  	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;  	if (maxsector) { -		sector_t sector = bio->bi_sector; +		sector_t sector = bio->bi_iter.bi_sector;  		if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {  			/* @@ -1690,7 +1771,7 @@ generic_make_request_checks(struct bio *bio)  		       "generic_make_request: Trying to access "  			"nonexistent block-device %s (%Lu)\n",  			bdevname(bio->bi_bdev, b), -			(long long) bio->bi_sector); +			(long long) bio->bi_iter.bi_sector);  		goto end_io;  	} @@ -1704,9 +1785,9 @@ generic_make_request_checks(struct bio *bio)  	}  	part = bio->bi_bdev->bd_part; -	if (should_fail_request(part, bio->bi_size) || +	if (should_fail_request(part, bio->bi_iter.bi_size) ||  	    should_fail_request(&part_to_disk(part)->part0, -				bio->bi_size)) +				bio->bi_iter.bi_size))  		goto end_io;  	/* @@ -1865,7 +1946,7 @@ void submit_bio(int rw, struct bio *bio)  		if (rw & WRITE) {  			count_vm_events(PGPGOUT, count);  		} else { -			task_io_account_read(bio->bi_size); +			task_io_account_read(bio->bi_iter.bi_size);  			count_vm_events(PGPGIN, count);  		} @@ -1874,7 +1955,7 @@ void submit_bio(int rw, struct bio *bio)  			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",  			current->comm, task_pid_nr(current),  				(rw & WRITE) ? "WRITE" : "READ", -				(unsigned long long)bio->bi_sector, +				(unsigned long long)bio->bi_iter.bi_sector,  				bdevname(bio->bi_bdev, b),  				count);  		} @@ -1900,7 +1981,7 @@ EXPORT_SYMBOL(submit_bio);   *    in some cases below, so export this function.   *    Request stacking drivers like request-based dm may change the queue   *    limits while requests are in the queue (e.g. dm's table swapping). 
- *    Such request stacking drivers should check those requests agaist + *    Such request stacking drivers should check those requests against   *    the new queue limits again when they dispatch those requests,   *    although such checkings are also done against the old queue limits   *    when submitting requests. @@ -2007,7 +2088,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)  	for (bio = rq->bio; bio; bio = bio->bi_next) {  		if ((bio->bi_rw & ff) != ff)  			break; -		bytes += bio->bi_size; +		bytes += bio->bi_iter.bi_size;  	}  	/* this could lead to infinite loop */ @@ -2016,7 +2097,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)  }  EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void blk_account_io_completion(struct request *req, unsigned int bytes)  {  	if (blk_do_io_stat(req)) {  		const int rw = rq_data_dir(req); @@ -2030,7 +2111,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)  	}  } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req)  {  	/*  	 * Account IO completion.  flush_rq isn't accounted as a @@ -2078,6 +2159,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,  }  #endif +void blk_account_io_start(struct request *rq, bool new_io) +{ +	struct hd_struct *part; +	int rw = rq_data_dir(rq); +	int cpu; + +	if (!blk_do_io_stat(rq)) +		return; + +	cpu = part_stat_lock(); + +	if (!new_io) { +		part = rq->part; +		part_stat_inc(cpu, part, merges[rw]); +	} else { +		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); +		if (!hd_struct_try_get(part)) { +			/* +			 * The partition is already being removed, +			 * the request will be accounted on the disk only +			 * +			 * We take a reference on disk->part0 although that +			 * partition will never be deleted, so we can treat +			 * it as any other partition. 
+			 */ +			part = &rq->rq_disk->part0; +			hd_struct_get(part); +		} +		part_round_stats(cpu, part); +		part_inc_in_flight(part, rw); +		rq->part = part; +	} + +	part_stat_unlock(); +} +  /**   * blk_peek_request - peek at the top of a request queue   * @q: request queue to peek at @@ -2229,6 +2346,7 @@ void blk_start_request(struct request *req)  	if (unlikely(blk_bidi_rq(req)))  		req->next_rq->resid_len = blk_rq_bytes(req->next_rq); +	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));  	blk_add_timer(req);  }  EXPORT_SYMBOL(blk_start_request); @@ -2288,7 +2406,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)  	if (!req->bio)  		return false; -	trace_block_rq_complete(req->q, req); +	trace_block_rq_complete(req->q, req, nr_bytes);  	/*  	 * For fs requests, rq is just carrier of independent bio's @@ -2341,9 +2459,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)  	total_bytes = 0;  	while (req->bio) {  		struct bio *bio = req->bio; -		unsigned bio_bytes = min(bio->bi_size, nr_bytes); +		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); -		if (bio_bytes == bio->bi_size) +		if (bio_bytes == bio->bi_iter.bi_size)  			req->bio = bio->bi_next;  		req_bio_endio(req, bio, bio_bytes, error); @@ -2369,7 +2487,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)  	}  	req->__data_len -= total_bytes; -	req->buffer = bio_data(req->bio);  	/* update sector only for requests with clear definition of sector */  	if (req->cmd_type == REQ_TYPE_FS) @@ -2438,7 +2555,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);  /*   * queue lock must be held   */ -static void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, int error)  {  	if (blk_rq_tagged(req))  		blk_queue_end_tag(req->q, req); @@ -2453,7 +2570,6 @@ static void blk_finish_request(struct request *req, int error)  	if (req->cmd_flags & REQ_DONTPREP)  		blk_unprep_request(req); -  	blk_account_io_done(req);  	if (req->end_io) @@ -2465,6 +2581,7 @@ static void blk_finish_request(struct request *req, int error)  		__blk_put_request(req->q, req);  	}  } +EXPORT_SYMBOL(blk_finish_request);  /**   * blk_end_bidi_request - Complete a bidi request @@ -2688,11 +2805,10 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,  	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */  	rq->cmd_flags |= bio->bi_rw & REQ_WRITE; -	if (bio_has_data(bio)) { +	if (bio_has_data(bio))  		rq->nr_phys_segments = bio_phys_segments(q, bio); -		rq->buffer = bio_data(bio); -	} -	rq->__data_len = bio->bi_size; + +	rq->__data_len = bio->bi_iter.bi_size;  	rq->bio = rq->biotail = bio;  	if (bio->bi_bdev) @@ -2710,10 +2826,10 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,  void rq_flush_dcache_pages(struct request *rq)  {  	struct req_iterator iter; -	struct bio_vec *bvec; +	struct bio_vec bvec;  	rq_for_each_segment(bvec, rq, iter) -		flush_dcache_page(bvec->bv_page); +		flush_dcache_page(bvec.bv_page);  }  EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);  #endif @@ -2767,7 +2883,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);  /*   * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + * The actual data parts (e.g. ->cmd, ->sense) are not copied.   
 */
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
@@ -2793,7 +2909,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
  *
  * Description:
  *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
  *     are not copied, and copying such parts is the caller's responsibility.
  *     Also, pages which the original bios are pointing to are not copied
  *     and the cloned bios just point same pages.
@@ -2840,20 +2956,25 @@ free_and_out:
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
+int kblockd_schedule_work(struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-int kblockd_schedule_delayed_work(struct request_queue *q,
-			struct delayed_work *dwork, unsigned long delay)
+int kblockd_schedule_delayed_work(struct delayed_work *dwork,
+				  unsigned long delay)
 {
 	return queue_delayed_work(kblockd_workqueue, dwork, delay);
 }
 EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
-#define PLUG_MAGIC	0x91827364
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+				     unsigned long delay)
+{
+	return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
 
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
@@ -2873,8 +2994,8 @@ void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;
 
-	plug->magic = PLUG_MAGIC;
 	INIT_LIST_HEAD(&plug->list);
+	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
 
 	/*
@@ -2969,9 +3090,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	LIST_HEAD(list);
 	unsigned int depth;
 
-	BUG_ON(plug->magic != PLUG_MAGIC);
-
 	flush_plug_callbacks(plug, from_schedule);
+
+	if (!list_empty(&plug->mq_list))
+		blk_mq_flush_plug_list(plug, from_schedule);
+
 	if (list_empty(&plug->list))
 		return;
@@ -3189,8 +3312,7 @@ int __init blk_dev_init(void)
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
-					    WQ_MEM_RECLAIM | WQ_HIGHPRI |
-					    WQ_POWER_EFFICIENT, 0);
+					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e7062139612..f4d27b12c90 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
@@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error)
 	struct completion *waiting = rq->end_io_data;
 
 	rq->end_io_data = NULL;
-	__blk_put_request(rq->q, rq);
 
 	/*
 	 * complete last, if this is a stack request the process (and thus
@@ -59,6 +59,16 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq->rq_disk = bd_disk;
 	rq->end_io = done;
+
+	/*
+	 * don't check dying flag for MQ because the request won't
+	 * be reused after dying flag is set
+	 */
+	if (q->mq_ops) {
+		blk_mq_insert_request(rq, at_head, true, false);
+		return;
+	}
+
 	/*
 	 * need to check this before __blk_run_queue(), because rq can
 	 * be freed before that returns.
@@ -68,9 +78,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,  	spin_lock_irq(q->queue_lock);  	if (unlikely(blk_queue_dying(q))) { +		rq->cmd_flags |= REQ_QUIET;   		rq->errors = -ENXIO; -		if (rq->end_io) -			rq->end_io(rq, rq->errors); +		__blk_end_request_all(rq, rq->errors);  		spin_unlock_irq(q->queue_lock);  		return;  	} @@ -103,12 +113,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,  	int err = 0;  	unsigned long hang_check; -	/* -	 * we need an extra reference to the request, so we can look at -	 * it after io completion -	 */ -	rq->ref_count++; -  	if (!rq->sense) {  		memset(sense, 0, sizeof(sense));  		rq->sense = sense; @@ -128,6 +132,11 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,  	if (rq->errors)  		err = -EIO; +	if (rq->sense == sense)	{ +		rq->sense = NULL; +		rq->sense_len = 0; +	} +  	return err;  }  EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827a853..3cb5e9e7108 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,8 +69,10 @@  #include <linux/bio.h>  #include <linux/blkdev.h>  #include <linux/gfp.h> +#include <linux/blk-mq.h>  #include "blk.h" +#include "blk-mq.h"  /* FLUSH/FUA sequences */  enum { @@ -124,6 +126,25 @@ static void blk_flush_restore_request(struct request *rq)  	/* make @rq a normal request */  	rq->cmd_flags &= ~REQ_FLUSH_SEQ;  	rq->end_io = rq->flush.saved_end_io; + +	blk_clear_rq_complete(rq); +} + +static bool blk_flush_queue_rq(struct request *rq, bool add_front) +{ +	if (rq->q->mq_ops) { +		struct request_queue *q = rq->q; + +		blk_mq_add_to_requeue_list(rq, add_front); +		blk_mq_kick_requeue_list(q); +		return false; +	} else { +		if (add_front) +			list_add(&rq->queuelist, &rq->q->queue_head); +		else +			list_add_tail(&rq->queuelist, &rq->q->queue_head); +		return true; +	}  }  /** @@ -136,7 +157,7 @@ static void blk_flush_restore_request(struct request *rq)   * completion and trigger the next step.   *   * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock)   *   * RETURNS:   * %true if requests were added to the dispatch queue, %false otherwise. 
@@ -146,7 +167,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,  {  	struct request_queue *q = rq->q;  	struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; -	bool queued = false; +	bool queued = false, kicked;  	BUG_ON(rq->flush.seq & seq);  	rq->flush.seq |= seq; @@ -167,8 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,  	case REQ_FSEQ_DATA:  		list_move_tail(&rq->flush.list, &q->flush_data_in_flight); -		list_add(&rq->queuelist, &q->queue_head); -		queued = true; +		queued = blk_flush_queue_rq(rq, true);  		break;  	case REQ_FSEQ_DONE: @@ -181,28 +201,41 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,  		BUG_ON(!list_empty(&rq->queuelist));  		list_del_init(&rq->flush.list);  		blk_flush_restore_request(rq); -		__blk_end_request_all(rq, error); +		if (q->mq_ops) +			blk_mq_end_io(rq, error); +		else +			__blk_end_request_all(rq, error);  		break;  	default:  		BUG();  	} -	return blk_kick_flush(q) | queued; +	kicked = blk_kick_flush(q); +	return kicked | queued;  }  static void flush_end_io(struct request *flush_rq, int error)  {  	struct request_queue *q = flush_rq->q; -	struct list_head *running = &q->flush_queue[q->flush_running_idx]; +	struct list_head *running;  	bool queued = false;  	struct request *rq, *n; +	unsigned long flags = 0; +	if (q->mq_ops) { +		spin_lock_irqsave(&q->mq_flush_lock, flags); +		q->flush_rq->tag = -1; +	} + +	running = &q->flush_queue[q->flush_running_idx];  	BUG_ON(q->flush_pending_idx == q->flush_running_idx);  	/* account completion of the flush request */  	q->flush_running_idx ^= 1; -	elv_completed_request(q, flush_rq); + +	if (!q->mq_ops) +		elv_completed_request(q, flush_rq);  	/* and push the waiting requests to the next stage */  	list_for_each_entry_safe(rq, n, running, flush.list) { @@ -223,9 +256,13 @@ static void flush_end_io(struct request *flush_rq, int error)  	 * directly into request_fn may confuse the driver.  Always use  	 * kblockd.  	 */ -	if (queued || q->flush_queue_delayed) +	if (queued || q->flush_queue_delayed) { +		WARN_ON(q->mq_ops);  		blk_run_queue_async(q); +	}  	q->flush_queue_delayed = 0; +	if (q->mq_ops) +		spin_unlock_irqrestore(&q->mq_flush_lock, flags);  }  /** @@ -236,7 +273,7 @@ static void flush_end_io(struct request *flush_rq, int error)   * Please read the comment at the top of this file for more info.   *   * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock)   *   * RETURNS:   * %true if flush was issued, %false otherwise. @@ -261,15 +298,18 @@ static bool blk_kick_flush(struct request_queue *q)  	 * Issue flush and toggle pending_idx.  This makes pending_idx  	 * different from running_idx, which means flush is in flight.  	 
*/ -	blk_rq_init(q, &q->flush_rq); -	q->flush_rq.cmd_type = REQ_TYPE_FS; -	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; -	q->flush_rq.rq_disk = first_rq->rq_disk; -	q->flush_rq.end_io = flush_end_io; -  	q->flush_pending_idx ^= 1; -	list_add_tail(&q->flush_rq.queuelist, &q->queue_head); -	return true; + +	blk_rq_init(q, q->flush_rq); +	if (q->mq_ops) +		blk_mq_clone_flush_request(q->flush_rq, first_rq); + +	q->flush_rq->cmd_type = REQ_TYPE_FS; +	q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; +	q->flush_rq->rq_disk = first_rq->rq_disk; +	q->flush_rq->end_io = flush_end_io; + +	return blk_flush_queue_rq(q->flush_rq, false);  }  static void flush_data_end_io(struct request *rq, int error) @@ -284,16 +324,37 @@ static void flush_data_end_io(struct request *rq, int error)  		blk_run_queue_async(q);  } +static void mq_flush_data_end_io(struct request *rq, int error) +{ +	struct request_queue *q = rq->q; +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx; +	unsigned long flags; + +	ctx = rq->mq_ctx; +	hctx = q->mq_ops->map_queue(q, ctx->cpu); + +	/* +	 * After populating an empty queue, kick it to avoid stall.  Read +	 * the comment in flush_end_io(). +	 */ +	spin_lock_irqsave(&q->mq_flush_lock, flags); +	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) +		blk_mq_run_hw_queue(hctx, true); +	spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} +  /**   * blk_insert_flush - insert a new FLUSH/FUA request   * @rq: request to insert   *   * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request.   * @rq is being submitted.  Analyze what needs to be done and put it on the   * right queue.   *   * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock) in !mq case   */  void blk_insert_flush(struct request *rq)  { @@ -316,7 +377,10 @@ void blk_insert_flush(struct request *rq)  	 * complete the request.  	 */  	if (!policy) { -		__blk_end_bidi_request(rq, 0, 0, 0); +		if (q->mq_ops) +			blk_mq_end_io(rq, 0); +		else +			__blk_end_bidi_request(rq, 0, 0, 0);  		return;  	} @@ -329,7 +393,10 @@ void blk_insert_flush(struct request *rq)  	 */  	if ((policy & REQ_FSEQ_DATA) &&  	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { -		list_add_tail(&rq->queuelist, &q->queue_head); +		if (q->mq_ops) { +			blk_mq_insert_request(rq, false, false, true); +		} else +			list_add_tail(&rq->queuelist, &q->queue_head);  		return;  	} @@ -341,56 +408,17 @@ void blk_insert_flush(struct request *rq)  	INIT_LIST_HEAD(&rq->flush.list);  	rq->cmd_flags |= REQ_FLUSH_SEQ;  	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ -	rq->end_io = flush_data_end_io; +	if (q->mq_ops) { +		rq->end_io = mq_flush_data_end_io; -	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); -} - -/** - * blk_abort_flushes - @q is being aborted, abort flush requests - * @q: request_queue being aborted - * - * To be called from elv_abort_queue().  @q is being aborted.  Prepare all - * FLUSH/FUA requests for abortion. - * - * CONTEXT: - * spin_lock_irq(q->queue_lock) - */ -void blk_abort_flushes(struct request_queue *q) -{ -	struct request *rq, *n; -	int i; - -	/* -	 * Requests in flight for data are already owned by the dispatch -	 * queue or the device driver.  Just restore for normal completion. -	 */ -	list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { -		list_del_init(&rq->flush.list); -		blk_flush_restore_request(rq); -	} - -	/* -	 * We need to give away requests on flush queues.  
Restore for -	 * normal completion and put them on the dispatch queue. -	 */ -	for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { -		list_for_each_entry_safe(rq, n, &q->flush_queue[i], -					 flush.list) { -			list_del_init(&rq->flush.list); -			blk_flush_restore_request(rq); -			list_add_tail(&rq->queuelist, &q->queue_head); -		} +		spin_lock_irq(&q->mq_flush_lock); +		blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); +		spin_unlock_irq(&q->mq_flush_lock); +		return;  	} -} +	rq->end_io = flush_data_end_io; -static void bio_end_flush(struct bio *bio, int err) -{ -	if (err) -		clear_bit(BIO_UPTODATE, &bio->bi_flags); -	if (bio->bi_private) -		complete(bio->bi_private); -	bio_put(bio); +	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);  }  /** @@ -408,7 +436,6 @@ static void bio_end_flush(struct bio *bio, int err)  int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,  		sector_t *error_sector)  { -	DECLARE_COMPLETION_ONSTACK(wait);  	struct request_queue *q;  	struct bio *bio;  	int ret = 0; @@ -430,13 +457,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,  		return -ENXIO;  	bio = bio_alloc(gfp_mask, 0); -	bio->bi_end_io = bio_end_flush;  	bio->bi_bdev = bdev; -	bio->bi_private = &wait; -	bio_get(bio); -	submit_bio(WRITE_FLUSH, bio); -	wait_for_completion_io(&wait); +	ret = submit_bio_wait(WRITE_FLUSH, bio);  	/*  	 * The driver must store the error location in ->bi_sector, if @@ -444,12 +467,14 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,  	 * copied from blk_rq_pos(rq).  	 */  	if (error_sector) -		*error_sector = bio->bi_sector; - -	if (!bio_flagged(bio, BIO_UPTODATE)) -		ret = -EIO; +		*error_sector = bio->bi_iter.bi_sector;  	bio_put(bio);  	return ret;  }  EXPORT_SYMBOL(blkdev_issue_flush); + +void blk_mq_init_flush(struct request_queue *q) +{ +	spin_lock_init(&q->mq_flush_lock); +} diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 03cf7179e8e..7fbab84399e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -43,30 +43,32 @@ static const char *bi_unsupported_name = "unsupported";   */  int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)  { -	struct bio_vec *iv, *ivprv = NULL; +	struct bio_vec iv, ivprv = { NULL };  	unsigned int segments = 0;  	unsigned int seg_size = 0; -	unsigned int i = 0; +	struct bvec_iter iter; +	int prev = 0; -	bio_for_each_integrity_vec(iv, bio, i) { +	bio_for_each_integrity_vec(iv, bio, iter) { -		if (ivprv) { -			if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) +		if (prev) { +			if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))  				goto new_segment; -			if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) +			if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))  				goto new_segment; -			if (seg_size + iv->bv_len > queue_max_segment_size(q)) +			if (seg_size + iv.bv_len > queue_max_segment_size(q))  				goto new_segment; -			seg_size += iv->bv_len; +			seg_size += iv.bv_len;  		} else {  new_segment:  			segments++; -			seg_size = iv->bv_len; +			seg_size = iv.bv_len;  		} +		prev = 1;  		ivprv = iv;  	} @@ -87,24 +89,25 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);  int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,  			    struct scatterlist *sglist)  { -	struct bio_vec *iv, *ivprv = NULL; +	struct bio_vec iv, ivprv = { NULL };  	struct scatterlist *sg = NULL;  	unsigned int segments = 0; -	unsigned int i = 0; +	struct bvec_iter iter; +	int prev = 0; -	bio_for_each_integrity_vec(iv, bio, i) { +	bio_for_each_integrity_vec(iv, bio, iter) { -		if (ivprv) { -	
		if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) +		if (prev) { +			if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))  				goto new_segment; -			if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) +			if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))  				goto new_segment; -			if (sg->length + iv->bv_len > queue_max_segment_size(q)) +			if (sg->length + iv.bv_len > queue_max_segment_size(q))  				goto new_segment; -			sg->length += iv->bv_len; +			sg->length += iv.bv_len;  		} else {  new_segment:  			if (!sg) @@ -114,10 +117,11 @@ new_segment:  				sg = sg_next(sg);  			} -			sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); +			sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset);  			segments++;  		} +		prev = 1;  		ivprv = iv;  	} diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 46cd7bd18b3..1a27f45ec77 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -6,7 +6,6 @@  #include <linux/init.h>  #include <linux/bio.h>  #include <linux/blkdev.h> -#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */  #include <linux/slab.h>  #include "blk.h" @@ -69,7 +68,7 @@ static void ioc_destroy_icq(struct io_cq *icq)  	 * under queue_lock.  If it's not pointing to @icq now, it never  	 * will.  Hint assignment itself can race safely.  	 */ -	if (rcu_dereference_raw(ioc->icq_hint) == icq) +	if (rcu_access_pointer(ioc->icq_hint) == icq)  		rcu_assign_pointer(ioc->icq_hint, NULL);  	ioc_exit_icq(icq); diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 4b8d9b54111..0736729d649 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -14,9 +14,6 @@  #include "blk.h" -int blk_iopoll_enabled = 1; -EXPORT_SYMBOL(blk_iopoll_enabled); -  static unsigned int blk_iopoll_budget __read_mostly = 256;  static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); @@ -35,7 +32,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)  	unsigned long flags;  	local_irq_save(flags); -	list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); +	list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));  	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);  	local_irq_restore(flags);  } @@ -52,7 +49,7 @@ EXPORT_SYMBOL(blk_iopoll_sched);  void __blk_iopoll_complete(struct blk_iopoll *iop)  {  	list_del(&iop->list); -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);  }  EXPORT_SYMBOL(__blk_iopoll_complete); @@ -67,19 +64,19 @@ EXPORT_SYMBOL(__blk_iopoll_complete);   *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()   *     is called.   
**/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) +void blk_iopoll_complete(struct blk_iopoll *iop)  {  	unsigned long flags;  	local_irq_save(flags); -	__blk_iopoll_complete(iopoll); +	__blk_iopoll_complete(iop);  	local_irq_restore(flags);  }  EXPORT_SYMBOL(blk_iopoll_complete);  static void blk_iopoll_softirq(struct softirq_action *h)  { -	struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); +	struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);  	int rearm = 0, budget = blk_iopoll_budget;  	unsigned long start_time = jiffies; @@ -164,7 +161,7 @@ EXPORT_SYMBOL(blk_iopoll_disable);  void blk_iopoll_enable(struct blk_iopoll *iop)  {  	BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);  }  EXPORT_SYMBOL(blk_iopoll_enable); @@ -201,7 +198,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,  		local_irq_disable();  		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), -				 &__get_cpu_var(blk_cpu_iopoll)); +				 this_cpu_ptr(&blk_cpu_iopoll));  		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);  		local_irq_enable();  	} diff --git a/block/blk-lib.c b/block/blk-lib.c index d6f50d57256..8411be3c19d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,  	DECLARE_COMPLETION_ONSTACK(wait);  	struct request_queue *q = bdev_get_queue(bdev);  	int type = REQ_WRITE | REQ_DISCARD; -	sector_t max_discard_sectors; -	sector_t granularity, alignment; +	unsigned int max_discard_sectors, granularity; +	int alignment;  	struct bio_batch bb;  	struct bio *bio;  	int ret = 0; @@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,  	/* Zero-sector (unknown) and one-sector granularities are the same.  */  	granularity = max(q->limits.discard_granularity >> 9, 1U); -	alignment = bdev_discard_alignment(bdev) >> 9; -	alignment = sector_div(alignment, granularity); +	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;  	/*  	 * Ensure that max_discard_sectors is of the proper  	 * granularity, so that requests stay aligned after a split.  	 */  	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); -	sector_div(max_discard_sectors, granularity); -	max_discard_sectors *= granularity; +	max_discard_sectors -= max_discard_sectors % granularity;  	if (unlikely(!max_discard_sectors)) {  		/* Avoid infinite loop below. Being cautious never hurts. */  		return -EOPNOTSUPP; @@ -110,17 +108,25 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,  			req_sects = end_sect - sector;  		} -		bio->bi_sector = sector; +		bio->bi_iter.bi_sector = sector;  		bio->bi_end_io = bio_batch_end_io;  		bio->bi_bdev = bdev;  		bio->bi_private = &bb; -		bio->bi_size = req_sects << 9; +		bio->bi_iter.bi_size = req_sects << 9;  		nr_sects -= req_sects;  		sector = end_sect;  		atomic_inc(&bb.done);  		submit_bio(type, bio); + +		/* +		 * We can loop for a long time in here, if someone does +		 * full device discards (like mkfs). Be nice and allow +		 * us to schedule out to avoid softlocking if preempt +		 * is disabled. 
+		 */ +		cond_resched();  	}  	blk_finish_plug(&plug); @@ -176,7 +182,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,  			break;  		} -		bio->bi_sector = sector; +		bio->bi_iter.bi_sector = sector;  		bio->bi_end_io = bio_batch_end_io;  		bio->bi_bdev = bdev;  		bio->bi_private = &bb; @@ -186,11 +192,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,  		bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);  		if (nr_sects > max_write_same_sectors) { -			bio->bi_size = max_write_same_sectors << 9; +			bio->bi_iter.bi_size = max_write_same_sectors << 9;  			nr_sects -= max_write_same_sectors;  			sector += max_write_same_sectors;  		} else { -			bio->bi_size = nr_sects << 9; +			bio->bi_iter.bi_size = nr_sects << 9;  			nr_sects = 0;  		} @@ -220,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);   *  Generate and issue number of bios with zerofiled pages.   */ -int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, -			sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, +				  sector_t nr_sects, gfp_t gfp_mask)  {  	int ret;  	struct bio *bio; @@ -242,7 +248,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,  			break;  		} -		bio->bi_sector = sector; +		bio->bi_iter.bi_sector = sector;  		bio->bi_bdev   = bdev;  		bio->bi_end_io = bio_batch_end_io;  		bio->bi_private = &bb; diff --git a/block/blk-map.c b/block/blk-map.c index 623e1cd4cff..f890d4345b0 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,  		rq->biotail->bi_next = bio;  		rq->biotail = bio; -		rq->__data_len += bio->bi_size; +		rq->__data_len += bio->bi_iter.bi_size;  	}  	return 0;  } @@ -76,7 +76,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,  	ret = blk_rq_append_bio(q, rq, bio);  	if (!ret) -		return bio->bi_size; +		return bio->bi_iter.bi_size;  	/* if it was boucned we must call the end io function */  	bio_endio(bio, 0); @@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,  	if (!bio_flagged(bio, BIO_USER_MAPPED))  		rq->cmd_flags |= REQ_COPY_USER; -	rq->buffer = NULL;  	return 0;  unmap_rq:  	blk_rq_unmap_user(bio); @@ -188,7 +187,7 @@ EXPORT_SYMBOL(blk_rq_map_user);   *    unmapping.   */  int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, -			struct rq_map_data *map_data, struct sg_iovec *iov, +			struct rq_map_data *map_data, const struct sg_iovec *iov,  			int iov_count, unsigned int len, gfp_t gfp_mask)  {  	struct bio *bio; @@ -220,7 +219,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,  	if (IS_ERR(bio))  		return PTR_ERR(bio); -	if (bio->bi_size != len) { +	if (bio->bi_iter.bi_size != len) {  		/*  		 * Grab an extra reference to this bio, as bio_unmap_user()  		 * expects to be able to drop it twice as it happens on the @@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,  	blk_queue_bounce(q, &bio);  	bio_get(bio);  	blk_rq_bio_prep(q, rq, bio); -	rq->buffer = NULL;  	return 0;  }  EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -285,7 +283,7 @@ EXPORT_SYMBOL(blk_rq_unmap_user);   *   * Description:   *    Data will be mapped directly if possible. Otherwise a bounce - *    buffer is used. Can be called multple times to append multple + *    buffer is used. Can be called multiple times to append multiple   *    buffers.   
*/  int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, @@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,  	}  	blk_queue_bounce(q, &rq->bio); -	rq->buffer = NULL;  	return 0;  }  EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index 5f244825379..54535831f1e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,38 +12,56 @@  static unsigned int __blk_recalc_rq_segments(struct request_queue *q,  					     struct bio *bio)  { -	struct bio_vec *bv, *bvprv = NULL; -	int cluster, i, high, highprv = 1; +	struct bio_vec bv, bvprv = { NULL }; +	int cluster, high, highprv = 1, no_sg_merge;  	unsigned int seg_size, nr_phys_segs;  	struct bio *fbio, *bbio; +	struct bvec_iter iter;  	if (!bio)  		return 0; +	/* +	 * This should probably be returning 0, but blk_add_request_payload() +	 * (Christoph!!!!) +	 */ +	if (bio->bi_rw & REQ_DISCARD) +		return 1; + +	if (bio->bi_rw & REQ_WRITE_SAME) +		return 1; +  	fbio = bio;  	cluster = blk_queue_cluster(q);  	seg_size = 0;  	nr_phys_segs = 0; +	no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); +	high = 0;  	for_each_bio(bio) { -		bio_for_each_segment(bv, bio, i) { +		bio_for_each_segment(bv, bio, iter) {  			/* -			 * the trick here is making sure that a high page is -			 * never considered part of another segment, since that -			 * might change with the bounce page. +			 * If SG merging is disabled, each bio vector is +			 * a segment  			 */ -			high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q); -			if (high || highprv) +			if (no_sg_merge)  				goto new_segment; -			if (cluster) { -				if (seg_size + bv->bv_len + +			/* +			 * the trick here is making sure that a high page is +			 * never considered part of another segment, since +			 * that might change with the bounce page. 
+			 */ +			high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); +			if (!high && !highprv && cluster) { +				if (seg_size + bv.bv_len  				    > queue_max_segment_size(q))  					goto new_segment; -				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) +				if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))  					goto new_segment; -				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) +				if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))  					goto new_segment; -				seg_size += bv->bv_len; +				seg_size += bv.bv_len;  				bvprv = bv;  				continue;  			} @@ -54,7 +72,7 @@ new_segment:  			nr_phys_segs++;  			bvprv = bv; -			seg_size = bv->bv_len; +			seg_size = bv.bv_len;  			highprv = high;  		}  		bbio = bio; @@ -75,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)  void blk_recount_segments(struct request_queue *q, struct bio *bio)  { -	struct bio *nxt = bio->bi_next; +	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags)) +		bio->bi_phys_segments = bio->bi_vcnt; +	else { +		struct bio *nxt = bio->bi_next; + +		bio->bi_next = NULL; +		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); +		bio->bi_next = nxt; +	} -	bio->bi_next = NULL; -	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); -	bio->bi_next = nxt;  	bio->bi_flags |= (1 << BIO_SEG_VALID);  }  EXPORT_SYMBOL(blk_recount_segments); @@ -87,6 +110,9 @@ EXPORT_SYMBOL(blk_recount_segments);  static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,  				   struct bio *nxt)  { +	struct bio_vec end_bv = { NULL }, nxt_bv; +	struct bvec_iter iter; +  	if (!blk_queue_cluster(q))  		return 0; @@ -97,34 +123,40 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,  	if (!bio_has_data(bio))  		return 1; -	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) +	bio_for_each_segment(end_bv, bio, iter) +		if (end_bv.bv_len == iter.bi_size) +			break; + +	nxt_bv = bio_iovec(nxt); + +	if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))  		return 0;  	/*  	 * bio and nxt are contiguous in memory; check if the queue allows  	 * these two to be merged into one  	 */ -	if (BIO_SEG_BOUNDARY(q, bio, nxt)) +	if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))  		return 1;  	return 0;  } -static void +static inline void  __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, -		     struct scatterlist *sglist, struct bio_vec **bvprv, +		     struct scatterlist *sglist, struct bio_vec *bvprv,  		     struct scatterlist **sg, int *nsegs, int *cluster)  {  	int nbytes = bvec->bv_len; -	if (*bvprv && *cluster) { +	if (*sg && *cluster) {  		if ((*sg)->length + nbytes > queue_max_segment_size(q))  			goto new_segment; -		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) +		if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))  			goto new_segment; -		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) +		if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))  			goto new_segment;  		(*sg)->length += nbytes; @@ -150,7 +182,49 @@ new_segment:  		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);  		(*nsegs)++;  	} -	*bvprv = bvec; +	*bvprv = *bvec; +} + +static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, +			     struct scatterlist *sglist, +			     struct scatterlist **sg) +{ +	struct bio_vec bvec, bvprv = { NULL }; +	struct bvec_iter iter; +	int nsegs, cluster; + +	nsegs = 0; +	cluster = blk_queue_cluster(q); + +	if (bio->bi_rw & REQ_DISCARD) { +		/* +		 * This is a hack - drivers should be neither modifying the +		 * biovec, nor relying on bi_vcnt - but because of +		 * blk_add_request_payload(), a discard bio may or may not have +		
 * a payload we need to set up here (thank you Christoph) and +		 * bi_vcnt is really the only way of telling if we need to. +		 */ + +		if (bio->bi_vcnt) +			goto single_segment; + +		return 0; +	} + +	if (bio->bi_rw & REQ_WRITE_SAME) { +single_segment: +		*sg = sglist; +		bvec = bio_iovec(bio); +		sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); +		return 1; +	} + +	for_each_bio(bio) +		bio_for_each_segment(bvec, bio, iter) +			__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, +					     &nsegs, &cluster); + +	return nsegs;  }  /* @@ -160,24 +234,11 @@ new_segment:  int blk_rq_map_sg(struct request_queue *q, struct request *rq,  		  struct scatterlist *sglist)  { -	struct bio_vec *bvec, *bvprv; -	struct req_iterator iter; -	struct scatterlist *sg; -	int nsegs, cluster; - -	nsegs = 0; -	cluster = blk_queue_cluster(q); - -	/* -	 * for each bio in rq -	 */ -	bvprv = NULL; -	sg = NULL; -	rq_for_each_segment(bvec, rq, iter) { -		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, -				     &nsegs, &cluster); -	} /* segments in rq */ +	struct scatterlist *sg = NULL; +	int nsegs = 0; +	if (rq->bio) +		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);  	if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&  	    (blk_rq_bytes(rq) & q->dma_pad_mask)) { @@ -223,21 +284,13 @@ EXPORT_SYMBOL(blk_rq_map_sg);  int blk_bio_map_sg(struct request_queue *q, struct bio *bio,  		   struct scatterlist *sglist)  { -	struct bio_vec *bvec, *bvprv; -	struct scatterlist *sg; -	int nsegs, cluster; -	unsigned long i; - -	nsegs = 0; -	cluster = blk_queue_cluster(q); - -	bvprv = NULL; -	sg = NULL; -	bio_for_each_segment(bvec, bio, i) { -		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, -				     &nsegs, &cluster); -	} /* segments in bio */ +	struct scatterlist *sg = NULL; +	int nsegs; +	struct bio *next = bio->bi_next; +	bio->bi_next = NULL; +	nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); +	bio->bi_next = next;  	if (sg)  		sg_mark_end(sg); @@ -308,6 +361,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,  	return ll_new_hw_segment(q, req, bio);  } +/* + * blk-mq uses req->special to carry normal driver per-request payload, it + * does not indicate a prepared command that we cannot merge with. + */ +static bool req_no_special_merge(struct request *req) +{ +	struct request_queue *q = req->q; + +	return !q->mq_ops && req->special; +} +  static int ll_merge_requests_fn(struct request_queue *q, struct request *req,  				struct request *next)  { @@ -319,7 +383,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,  	 * First check if the either of the requests are re-queued  	 * requests.  Can't merge them if they are.  	 
*/ -	if (req->special || next->special) +	if (req_no_special_merge(req) || req_no_special_merge(next))  		return 0;  	/* @@ -416,7 +480,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,  	if (rq_data_dir(req) != rq_data_dir(next)  	    || req->rq_disk != next->rq_disk -	    || next->special) +	    || req_no_special_merge(next))  		return 0;  	if (req->cmd_flags & REQ_WRITE_SAME && @@ -504,6 +568,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,  bool blk_rq_merge_ok(struct request *rq, struct bio *bio)  { +	struct request_queue *q = rq->q; +  	if (!rq_mergeable(rq) || !bio_mergeable(bio))  		return false; @@ -515,7 +581,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)  		return false;  	/* must be same device and not a special request */ -	if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) +	if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))  		return false;  	/* only merge integrity protected bio into ditto rq */ @@ -527,14 +593,22 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)  	    !blk_write_same_mergeable(rq->bio, bio))  		return false; +	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { +		struct bio_vec *bprev; + +		bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; +		if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) +			return false; +	} +  	return true;  }  int blk_try_merge(struct request *rq, struct bio *bio)  { -	if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector) +	if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)  		return ELEVATOR_BACK_MERGE; -	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector) +	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)  		return ELEVATOR_FRONT_MERGE;  	return ELEVATOR_NO_MERGE;  } diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c new file mode 100644 index 00000000000..bb3ed488f7b --- /dev/null +++ b/block/blk-mq-cpu.c @@ -0,0 +1,67 @@ +/* + * CPU notifier helper code for blk-mq + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" + +static LIST_HEAD(blk_mq_cpu_notify_list); +static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock); + +static int blk_mq_main_cpu_notify(struct notifier_block *self, +				  unsigned long action, void *hcpu) +{ +	unsigned int cpu = (unsigned long) hcpu; +	struct blk_mq_cpu_notifier *notify; +	int ret = NOTIFY_OK; + +	raw_spin_lock(&blk_mq_cpu_notify_lock); + +	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { +		ret = notify->notify(notify->data, action, cpu); +		if (ret != NOTIFY_OK) +			break; +	} + +	raw_spin_unlock(&blk_mq_cpu_notify_lock); +	return ret; +} + +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ +	BUG_ON(!notifier->notify); + +	raw_spin_lock(&blk_mq_cpu_notify_lock); +	list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); +	raw_spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ +	raw_spin_lock(&blk_mq_cpu_notify_lock); +	list_del(¬ifier->list); +	raw_spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, +			      int (*fn)(void *, unsigned long, unsigned int), +			      void *data) +{ +	notifier->notify = fn; +	notifier->data = 
data; +} + +void __init blk_mq_cpu_init(void) +{ +	hotcpu_notifier(blk_mq_main_cpu_notify, 0); +} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 00000000000..1065d7c65fa --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,119 @@ +/* + * CPU <-> hardware queue mapping helpers + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" + +static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, +			      const int cpu) +{ +	return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ +	unsigned int ret; + +	ret = cpumask_first(topology_thread_cpumask(cpu)); +	if (ret < nr_cpu_ids) +		return ret; + +	return cpu; +} + +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +{ +	unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; +	cpumask_var_t cpus; + +	if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) +		return 1; + +	cpumask_clear(cpus); +	nr_cpus = nr_uniq_cpus = 0; +	for_each_online_cpu(i) { +		nr_cpus++; +		first_sibling = get_first_sibling(i); +		if (!cpumask_test_cpu(first_sibling, cpus)) +			nr_uniq_cpus++; +		cpumask_set_cpu(i, cpus); +	} + +	queue = 0; +	for_each_possible_cpu(i) { +		if (!cpu_online(i)) { +			map[i] = 0; +			continue; +		} + +		/* +		 * Easy case - we have equal or more hardware queues. Or +		 * there are no thread siblings to take into account. Do +		 * 1:1 if enough, or sequential mapping if less. +		 */ +		if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { +			map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); +			queue++; +			continue; +		} + +		/* +		 * Less then nr_cpus queues, and we have some number of +		 * threads per cores. Map sibling threads to the same +		 * queue. +		 */ +		first_sibling = get_first_sibling(i); +		if (first_sibling == i) { +			map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, +							queue); +			queue++; +		} else +			map[i] = map[first_sibling]; +	} + +	free_cpumask_var(cpus); +	return 0; +} + +unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) +{ +	unsigned int *map; + +	/* If cpus are offline, map them to first hctx */ +	map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, +				set->numa_node); +	if (!map) +		return NULL; + +	if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) +		return map; + +	kfree(map); +	return NULL; +} + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. 
+ */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ +	int i; + +	for_each_possible_cpu(i) { +		if (index == mq_map[i]) +			return cpu_to_node(i); +	} + +	return NUMA_NO_NODE; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 00000000000..ed521786755 --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,456 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ +} + +struct blk_mq_ctx_sysfs_entry { +	struct attribute attr; +	ssize_t (*show)(struct blk_mq_ctx *, char *); +	ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { +	struct attribute attr; +	ssize_t (*show)(struct blk_mq_hw_ctx *, char *); +	ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, +				 char *page) +{ +	struct blk_mq_ctx_sysfs_entry *entry; +	struct blk_mq_ctx *ctx; +	struct request_queue *q; +	ssize_t res; + +	entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); +	ctx = container_of(kobj, struct blk_mq_ctx, kobj); +	q = ctx->queue; + +	if (!entry->show) +		return -EIO; + +	res = -ENOENT; +	mutex_lock(&q->sysfs_lock); +	if (!blk_queue_dying(q)) +		res = entry->show(ctx, page); +	mutex_unlock(&q->sysfs_lock); +	return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, +				  const char *page, size_t length) +{ +	struct blk_mq_ctx_sysfs_entry *entry; +	struct blk_mq_ctx *ctx; +	struct request_queue *q; +	ssize_t res; + +	entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); +	ctx = container_of(kobj, struct blk_mq_ctx, kobj); +	q = ctx->queue; + +	if (!entry->store) +		return -EIO; + +	res = -ENOENT; +	mutex_lock(&q->sysfs_lock); +	if (!blk_queue_dying(q)) +		res = entry->store(ctx, page, length); +	mutex_unlock(&q->sysfs_lock); +	return res; +} + +static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, +				    struct attribute *attr, char *page) +{ +	struct blk_mq_hw_ctx_sysfs_entry *entry; +	struct blk_mq_hw_ctx *hctx; +	struct request_queue *q; +	ssize_t res; + +	entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); +	hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); +	q = hctx->queue; + +	if (!entry->show) +		return -EIO; + +	res = -ENOENT; +	mutex_lock(&q->sysfs_lock); +	if (!blk_queue_dying(q)) +		res = entry->show(hctx, page); +	mutex_unlock(&q->sysfs_lock); +	return res; +} + +static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, +				     struct attribute *attr, const char *page, +				     size_t length) +{ +	struct blk_mq_hw_ctx_sysfs_entry *entry; +	struct blk_mq_hw_ctx *hctx; +	struct request_queue *q; +	ssize_t res; + +	entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); +	hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); +	q = hctx->queue; + +	if (!entry->store) +		return -EIO; + +	res = -ENOENT; +	mutex_lock(&q->sysfs_lock); +	if (!blk_queue_dying(q)) +		res = entry->store(hctx, page, length); +	mutex_unlock(&q->sysfs_lock); +	return res; +} + +static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) +{ +	return sprintf(page, 
"%lu %lu\n", ctx->rq_dispatched[1], +				ctx->rq_dispatched[0]); +} + +static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) +{ +	return sprintf(page, "%lu\n", ctx->rq_merged); +} + +static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) +{ +	return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], +				ctx->rq_completed[0]); +} + +static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) +{ +	char *start_page = page; +	struct request *rq; + +	page += sprintf(page, "%s:\n", msg); + +	list_for_each_entry(rq, list, queuelist) +		page += sprintf(page, "\t%p\n", rq); + +	return page - start_page; +} + +static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) +{ +	ssize_t ret; + +	spin_lock(&ctx->lock); +	ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); +	spin_unlock(&ctx->lock); + +	return ret; +} + +static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, +					   char *page) +{ +	return sprintf(page, "%lu\n", hctx->queued); +} + +static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) +{ +	return sprintf(page, "%lu\n", hctx->run); +} + +static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, +					       char *page) +{ +	char *start_page = page; +	int i; + +	page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + +	for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { +		unsigned long d = 1U << (i - 1); + +		page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); +	} + +	return page - start_page; +} + +static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, +					    char *page) +{ +	ssize_t ret; + +	spin_lock(&hctx->lock); +	ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); +	spin_unlock(&hctx->lock); + +	return ret; +} + +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +{ +	return blk_mq_tag_sysfs_show(hctx->tags, page); +} + +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) +{ +	return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); +} + +static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) +{ +	unsigned int i, first = 1; +	ssize_t ret = 0; + +	blk_mq_disable_hotplug(); + +	for_each_cpu(i, hctx->cpumask) { +		if (first) +			ret += sprintf(ret + page, "%u", i); +		else +			ret += sprintf(ret + page, ", %u", i); + +		first = 0; +	} + +	blk_mq_enable_hotplug(); + +	ret += sprintf(ret + page, "\n"); +	return ret; +} + +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { +	.attr = {.name = "dispatched", .mode = S_IRUGO }, +	.show = blk_mq_sysfs_dispatched_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { +	.attr = {.name = "merged", .mode = S_IRUGO }, +	.show = blk_mq_sysfs_merged_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { +	.attr = {.name = "completed", .mode = S_IRUGO }, +	.show = blk_mq_sysfs_completed_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { +	.attr = {.name = "rq_list", .mode = S_IRUGO }, +	.show = blk_mq_sysfs_rq_list_show, +}; + +static struct attribute *default_ctx_attrs[] = { +	&blk_mq_sysfs_dispatched.attr, +	&blk_mq_sysfs_merged.attr, +	&blk_mq_sysfs_completed.attr, +	&blk_mq_sysfs_rq_list.attr, +	NULL, +}; + +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { +	.attr = {.name = "queued", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_queued_show, +}; +static struct 
blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { +	.attr = {.name = "run", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_run_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { +	.attr = {.name = "dispatched", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_dispatched_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { +	.attr = {.name = "active", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_active_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { +	.attr = {.name = "pending", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_rq_list_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { +	.attr = {.name = "tags", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_tags_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { +	.attr = {.name = "cpu_list", .mode = S_IRUGO }, +	.show = blk_mq_hw_sysfs_cpus_show, +}; + +static struct attribute *default_hw_ctx_attrs[] = { +	&blk_mq_hw_sysfs_queued.attr, +	&blk_mq_hw_sysfs_run.attr, +	&blk_mq_hw_sysfs_dispatched.attr, +	&blk_mq_hw_sysfs_pending.attr, +	&blk_mq_hw_sysfs_tags.attr, +	&blk_mq_hw_sysfs_cpus.attr, +	&blk_mq_hw_sysfs_active.attr, +	NULL, +}; + +static const struct sysfs_ops blk_mq_sysfs_ops = { +	.show	= blk_mq_sysfs_show, +	.store	= blk_mq_sysfs_store, +}; + +static const struct sysfs_ops blk_mq_hw_sysfs_ops = { +	.show	= blk_mq_hw_sysfs_show, +	.store	= blk_mq_hw_sysfs_store, +}; + +static struct kobj_type blk_mq_ktype = { +	.sysfs_ops	= &blk_mq_sysfs_ops, +	.release	= blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_ctx_ktype = { +	.sysfs_ops	= &blk_mq_sysfs_ops, +	.default_attrs	= default_ctx_attrs, +	.release	= blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_hw_ktype = { +	.sysfs_ops	= &blk_mq_hw_sysfs_ops, +	.default_attrs	= default_hw_ctx_attrs, +	.release	= blk_mq_sysfs_release, +}; + +static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ +	struct blk_mq_ctx *ctx; +	int i; + +	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) +		return; + +	hctx_for_each_ctx(hctx, ctx, i) +		kobject_del(&ctx->kobj); + +	kobject_del(&hctx->kobj); +} + +static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) +{ +	struct request_queue *q = hctx->queue; +	struct blk_mq_ctx *ctx; +	int i, ret; + +	if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) +		return 0; + +	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); +	if (ret) +		return ret; + +	hctx_for_each_ctx(hctx, ctx, i) { +		ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); +		if (ret) +			break; +	} + +	return ret; +} + +void blk_mq_unregister_disk(struct gendisk *disk) +{ +	struct request_queue *q = disk->queue; +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx; +	int i, j; + +	queue_for_each_hw_ctx(q, hctx, i) { +		blk_mq_unregister_hctx(hctx); + +		hctx_for_each_ctx(hctx, ctx, j) +			kobject_put(&ctx->kobj); + +		kobject_put(&hctx->kobj); +	} + +	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); +	kobject_del(&q->mq_kobj); +	kobject_put(&q->mq_kobj); + +	kobject_put(&disk_to_dev(disk)->kobj); +} + +static void blk_mq_sysfs_init(struct request_queue *q) +{ +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx; +	int i, j; + +	kobject_init(&q->mq_kobj, &blk_mq_ktype); + +	queue_for_each_hw_ctx(q, hctx, i) { +		kobject_init(&hctx->kobj, &blk_mq_hw_ktype); + +		hctx_for_each_ctx(hctx, ctx, j) +			kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); +	} +} + +int blk_mq_register_disk(struct gendisk *disk) +{ +	
struct device *dev = disk_to_dev(disk); +	struct request_queue *q = disk->queue; +	struct blk_mq_hw_ctx *hctx; +	int ret, i; + +	blk_mq_sysfs_init(q); + +	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); +	if (ret < 0) +		return ret; + +	kobject_uevent(&q->mq_kobj, KOBJ_ADD); + +	queue_for_each_hw_ctx(q, hctx, i) { +		hctx->flags |= BLK_MQ_F_SYSFS_UP; +		ret = blk_mq_register_hctx(hctx); +		if (ret) +			break; +	} + +	if (ret) { +		blk_mq_unregister_disk(disk); +		return ret; +	} + +	return 0; +} + +void blk_mq_sysfs_unregister(struct request_queue *q) +{ +	struct blk_mq_hw_ctx *hctx; +	int i; + +	queue_for_each_hw_ctx(q, hctx, i) +		blk_mq_unregister_hctx(hctx); +} + +int blk_mq_sysfs_register(struct request_queue *q) +{ +	struct blk_mq_hw_ctx *hctx; +	int i, ret = 0; + +	queue_for_each_hw_ctx(q, hctx, i) { +		ret = blk_mq_register_hctx(hctx); +		if (ret) +			break; +	} + +	return ret; +} diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c new file mode 100644 index 00000000000..c1b92426c95 --- /dev/null +++ b/block/blk-mq-tag.c @@ -0,0 +1,618 @@ +/* + * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread + * over multiple cachelines to avoid ping-pong between multiple submitters + * or submitter and completer. Uses rolling wakeups to avoid falling of + * the scaling cliff when we run out of tags and have to start putting + * submitters to sleep. + * + * Uses active queue tracking to support fairer distribution of tags + * between multiple submitters when a shared tag map is used. + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/random.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) +{ +	int i; + +	for (i = 0; i < bt->map_nr; i++) { +		struct blk_align_bitmap *bm = &bt->map[i]; +		int ret; + +		ret = find_first_zero_bit(&bm->word, bm->depth); +		if (ret < bm->depth) +			return true; +	} + +	return false; +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ +	if (!tags) +		return true; + +	return bt_has_free_tags(&tags->bitmap_tags); +} + +static inline int bt_index_inc(int index) +{ +	return (index + 1) & (BT_WAIT_QUEUES - 1); +} + +static inline void bt_index_atomic_inc(atomic_t *index) +{ +	int old = atomic_read(index); +	int new = bt_index_inc(old); +	atomic_cmpxchg(index, old, new); +} + +/* + * If a previously inactive queue goes active, bump the active user count. + */ +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ +	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && +	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) +		atomic_inc(&hctx->tags->active_queues); + +	return true; +} + +/* + * Wakeup all potentially sleeping on normal (non-reserved) tags + */ +static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) +{ +	struct blk_mq_bitmap_tags *bt; +	int i, wake_index; + +	bt = &tags->bitmap_tags; +	wake_index = atomic_read(&bt->wake_index); +	for (i = 0; i < BT_WAIT_QUEUES; i++) { +		struct bt_wait_state *bs = &bt->bs[wake_index]; + +		if (waitqueue_active(&bs->wait)) +			wake_up(&bs->wait); + +		wake_index = bt_index_inc(wake_index); +	} +} + +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. 
+ */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ +	struct blk_mq_tags *tags = hctx->tags; + +	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) +		return; + +	atomic_dec(&tags->active_queues); + +	blk_mq_tag_wakeup_all(tags); +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, +				  struct blk_mq_bitmap_tags *bt) +{ +	unsigned int depth, users; + +	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) +		return true; +	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) +		return true; + +	/* +	 * Don't try dividing an ant +	 */ +	if (bt->depth == 1) +		return true; + +	users = atomic_read(&hctx->tags->active_queues); +	if (!users) +		return true; + +	/* +	 * Allow at least some tags +	 */ +	depth = max((bt->depth + users - 1) / users, 4U); +	return atomic_read(&hctx->nr_active) < depth; +} + +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) +{ +	int tag, org_last_tag, end; + +	org_last_tag = last_tag; +	end = bm->depth; +	do { +restart: +		tag = find_next_zero_bit(&bm->word, end, last_tag); +		if (unlikely(tag >= end)) { +			/* +			 * We started with an offset, start from 0 to +			 * exhaust the map. +			 */ +			if (org_last_tag && last_tag) { +				end = last_tag; +				last_tag = 0; +				goto restart; +			} +			return -1; +		} +		last_tag = tag + 1; +	} while (test_and_set_bit_lock(tag, &bm->word)); + +	return tag; +} + +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, +		    unsigned int *tag_cache) +{ +	unsigned int last_tag, org_last_tag; +	int index, i, tag; + +	if (!hctx_may_queue(hctx, bt)) +		return -1; + +	last_tag = org_last_tag = *tag_cache; +	index = TAG_TO_INDEX(bt, last_tag); + +	for (i = 0; i < bt->map_nr; i++) { +		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); +		if (tag != -1) { +			tag += (index << bt->bits_per_word); +			goto done; +		} + +		last_tag = 0; +		if (++index >= bt->map_nr) +			index = 0; +	} + +	*tag_cache = 0; +	return -1; + +	/* +	 * Only update the cache from the allocation path, if we ended +	 * up using the specific cached tag. 
+	 */ +done: +	if (tag == org_last_tag) { +		last_tag = tag + 1; +		if (last_tag >= bt->depth - 1) +			last_tag = 0; + +		*tag_cache = last_tag; +	} + +	return tag; +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, +					 struct blk_mq_hw_ctx *hctx) +{ +	struct bt_wait_state *bs; +	int wait_index; + +	if (!hctx) +		return &bt->bs[0]; + +	wait_index = atomic_read(&hctx->wait_index); +	bs = &bt->bs[wait_index]; +	bt_index_atomic_inc(&hctx->wait_index); +	return bs; +} + +static int bt_get(struct blk_mq_alloc_data *data, +		struct blk_mq_bitmap_tags *bt, +		struct blk_mq_hw_ctx *hctx, +		unsigned int *last_tag) +{ +	struct bt_wait_state *bs; +	DEFINE_WAIT(wait); +	int tag; + +	tag = __bt_get(hctx, bt, last_tag); +	if (tag != -1) +		return tag; + +	if (!(data->gfp & __GFP_WAIT)) +		return -1; + +	bs = bt_wait_ptr(bt, hctx); +	do { +		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + +		tag = __bt_get(hctx, bt, last_tag); +		if (tag != -1) +			break; + +		blk_mq_put_ctx(data->ctx); + +		io_schedule(); + +		data->ctx = blk_mq_get_ctx(data->q); +		data->hctx = data->q->mq_ops->map_queue(data->q, +				data->ctx->cpu); +		if (data->reserved) { +			bt = &data->hctx->tags->breserved_tags; +		} else { +			last_tag = &data->ctx->last_tag; +			hctx = data->hctx; +			bt = &hctx->tags->bitmap_tags; +		} +		finish_wait(&bs->wait, &wait); +		bs = bt_wait_ptr(bt, hctx); +	} while (1); + +	finish_wait(&bs->wait, &wait); +	return tag; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) +{ +	int tag; + +	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, +			&data->ctx->last_tag); +	if (tag >= 0) +		return tag + data->hctx->tags->nr_reserved_tags; + +	return BLK_MQ_TAG_FAIL; +} + +static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) +{ +	int tag, zero = 0; + +	if (unlikely(!data->hctx->tags->nr_reserved_tags)) { +		WARN_ON_ONCE(1); +		return BLK_MQ_TAG_FAIL; +	} + +	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero); +	if (tag < 0) +		return BLK_MQ_TAG_FAIL; + +	return tag; +} + +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) +{ +	if (!data->reserved) +		return __blk_mq_get_tag(data); + +	return __blk_mq_get_reserved_tag(data); +} + +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +{ +	int i, wake_index; + +	wake_index = atomic_read(&bt->wake_index); +	for (i = 0; i < BT_WAIT_QUEUES; i++) { +		struct bt_wait_state *bs = &bt->bs[wake_index]; + +		if (waitqueue_active(&bs->wait)) { +			int o = atomic_read(&bt->wake_index); +			if (wake_index != o) +				atomic_cmpxchg(&bt->wake_index, o, wake_index); + +			return bs; +		} + +		wake_index = bt_index_inc(wake_index); +	} + +	return NULL; +} + +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +{ +	const int index = TAG_TO_INDEX(bt, tag); +	struct bt_wait_state *bs; +	int wait_cnt; + +	/* +	 * The unlock memory barrier need to order access to req in free +	 * path and clearing tag bit +	 */ +	clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + +	bs = bt_wake_ptr(bt); +	if (!bs) +		return; + +	wait_cnt = atomic_dec_return(&bs->wait_cnt); +	if (wait_cnt == 0) { +wake: +		atomic_add(bt->wake_cnt, &bs->wait_cnt); +		bt_index_atomic_inc(&bt->wake_index); +		wake_up(&bs->wait); +	} else if (wait_cnt < 0) { +		wait_cnt = atomic_inc_return(&bs->wait_cnt); +		if (!wait_cnt) +			goto wake; +	} +} + +static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +{ +	BUG_ON(tag >= 
tags->nr_tags); + +	bt_clear_tag(&tags->bitmap_tags, tag); +} + +static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, +				      unsigned int tag) +{ +	BUG_ON(tag >= tags->nr_reserved_tags); + +	bt_clear_tag(&tags->breserved_tags, tag); +} + +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, +		    unsigned int *last_tag) +{ +	struct blk_mq_tags *tags = hctx->tags; + +	if (tag >= tags->nr_reserved_tags) { +		const int real_tag = tag - tags->nr_reserved_tags; + +		__blk_mq_put_tag(tags, real_tag); +		*last_tag = real_tag; +	} else +		__blk_mq_put_reserved_tag(tags, tag); +} + +static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, +			     unsigned long *free_map, unsigned int off) +{ +	int i; + +	for (i = 0; i < bt->map_nr; i++) { +		struct blk_align_bitmap *bm = &bt->map[i]; +		int bit = 0; + +		do { +			bit = find_next_zero_bit(&bm->word, bm->depth, bit); +			if (bit >= bm->depth) +				break; + +			__set_bit(bit + off, free_map); +			bit++; +		} while (1); + +		off += (1 << bt->bits_per_word); +	} +} + +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, +			  void (*fn)(void *, unsigned long *), void *data) +{ +	unsigned long *tag_map; +	size_t map_size; + +	map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; +	tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); +	if (!tag_map) +		return; + +	bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); +	if (tags->nr_reserved_tags) +		bt_for_each_free(&tags->breserved_tags, tag_map, 0); + +	fn(data, tag_map); +	kfree(tag_map); +} +EXPORT_SYMBOL(blk_mq_tag_busy_iter); + +static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +{ +	unsigned int i, used; + +	for (i = 0, used = 0; i < bt->map_nr; i++) { +		struct blk_align_bitmap *bm = &bt->map[i]; + +		used += bitmap_weight(&bm->word, bm->depth); +	} + +	return bt->depth - used; +} + +static void bt_update_count(struct blk_mq_bitmap_tags *bt, +			    unsigned int depth) +{ +	unsigned int tags_per_word = 1U << bt->bits_per_word; +	unsigned int map_depth = depth; + +	if (depth) { +		int i; + +		for (i = 0; i < bt->map_nr; i++) { +			bt->map[i].depth = min(map_depth, tags_per_word); +			map_depth -= bt->map[i].depth; +		} +	} + +	bt->wake_cnt = BT_WAIT_BATCH; +	if (bt->wake_cnt > depth / 4) +		bt->wake_cnt = max(1U, depth / 4); + +	bt->depth = depth; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, +			int node, bool reserved) +{ +	int i; + +	bt->bits_per_word = ilog2(BITS_PER_LONG); + +	/* +	 * Depth can be zero for reserved tags, that's not a failure +	 * condition. +	 */ +	if (depth) { +		unsigned int nr, tags_per_word; + +		tags_per_word = (1 << bt->bits_per_word); + +		/* +		 * If the tag space is small, shrink the number of tags +		 * per word so we spread over a few cachelines, at least. +		 * If less than 4 tags, just forget about it, it's not +		 * going to work optimally anyway. 
+		 */ +		if (depth >= 4) { +			while (tags_per_word * 4 > depth) { +				bt->bits_per_word--; +				tags_per_word = (1 << bt->bits_per_word); +			} +		} + +		nr = ALIGN(depth, tags_per_word) / tags_per_word; +		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), +						GFP_KERNEL, node); +		if (!bt->map) +			return -ENOMEM; + +		bt->map_nr = nr; +	} + +	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); +	if (!bt->bs) { +		kfree(bt->map); +		return -ENOMEM; +	} + +	bt_update_count(bt, depth); + +	for (i = 0; i < BT_WAIT_QUEUES; i++) { +		init_waitqueue_head(&bt->bs[i].wait); +		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); +	} + +	return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ +	kfree(bt->map); +	kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, +						   int node) +{ +	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + +	if (bt_alloc(&tags->bitmap_tags, depth, node, false)) +		goto enomem; +	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) +		goto enomem; + +	return tags; +enomem: +	bt_free(&tags->bitmap_tags); +	kfree(tags); +	return NULL; +} + +struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, +				     unsigned int reserved_tags, int node) +{ +	struct blk_mq_tags *tags; + +	if (total_tags > BLK_MQ_TAG_MAX) { +		pr_err("blk-mq: tag depth too large\n"); +		return NULL; +	} + +	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); +	if (!tags) +		return NULL; + +	tags->nr_tags = total_tags; +	tags->nr_reserved_tags = reserved_tags; + +	return blk_mq_init_bitmap_tags(tags, node); +} + +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ +	bt_free(&tags->bitmap_tags); +	bt_free(&tags->breserved_tags); +	kfree(tags); +} + +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ +	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + +	*tag = prandom_u32() % depth; +} + +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) +{ +	tdepth -= tags->nr_reserved_tags; +	if (tdepth > tags->nr_tags) +		return -EINVAL; + +	/* +	 * Don't need (or can't) update reserved tags here, they remain +	 * static and should never need resizing. 
+	 */ +	bt_update_count(&tags->bitmap_tags, tdepth); +	blk_mq_tag_wakeup_all(tags); +	return 0; +} + +ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) +{ +	char *orig_page = page; +	unsigned int free, res; + +	if (!tags) +		return 0; + +	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " +			"bits_per_word=%u\n", +			tags->nr_tags, tags->nr_reserved_tags, +			tags->bitmap_tags.bits_per_word); + +	free = bt_unused_tags(&tags->bitmap_tags); +	res = bt_unused_tags(&tags->breserved_tags); + +	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); +	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); + +	return page - orig_page; +} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h new file mode 100644 index 00000000000..6206ed17ef7 --- /dev/null +++ b/block/blk-mq-tag.h @@ -0,0 +1,88 @@ +#ifndef INT_BLK_MQ_TAG_H +#define INT_BLK_MQ_TAG_H + +#include "blk-mq.h" + +enum { +	BT_WAIT_QUEUES	= 8, +	BT_WAIT_BATCH	= 8, +}; + +struct bt_wait_state { +	atomic_t wait_cnt; +	wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(bt, tag)	((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag)	((tag) & ((1 << (bt)->bits_per_word) - 1)) + +struct blk_mq_bitmap_tags { +	unsigned int depth; +	unsigned int wake_cnt; +	unsigned int bits_per_word; + +	unsigned int map_nr; +	struct blk_align_bitmap *map; + +	atomic_t wake_index; +	struct bt_wait_state *bs; +}; + +/* + * Tag address space map. + */ +struct blk_mq_tags { +	unsigned int nr_tags; +	unsigned int nr_reserved_tags; + +	atomic_t active_queues; + +	struct blk_mq_bitmap_tags bitmap_tags; +	struct blk_mq_bitmap_tags breserved_tags; + +	struct request **rqs; +	struct list_head page_list; +}; + + +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); + +extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); +extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); + +enum { +	BLK_MQ_TAG_CACHE_MIN	= 1, +	BLK_MQ_TAG_CACHE_MAX	= 64, +}; + +enum { +	BLK_MQ_TAG_FAIL		= -1U, +	BLK_MQ_TAG_MIN		= BLK_MQ_TAG_CACHE_MIN, +	BLK_MQ_TAG_MAX		= BLK_MQ_TAG_FAIL - 1, +}; + +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ +	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) +		return false; + +	return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ +	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) +		return; + +	__blk_mq_tag_idle(hctx); +} + +#endif diff --git a/block/blk-mq.c b/block/blk-mq.c new file mode 100644 index 00000000000..ad69ef657e8 --- /dev/null +++ b/block/blk-mq.c @@ -0,0 +1,2058 @@ +/* + * Block multiqueue core code + * + * Copyright (C) 2013-2014 Jens Axboe + * Copyright (C) 2013-2014 Christoph Hellwig + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include 
<linux/workqueue.h> +#include <linux/smp.h> +#include <linux/llist.h> +#include <linux/list_sort.h> +#include <linux/cpu.h> +#include <linux/cache.h> +#include <linux/sched/sysctl.h> +#include <linux/delay.h> + +#include <trace/events/block.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static DEFINE_MUTEX(all_q_mutex); +static LIST_HEAD(all_q_list); + +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); + +/* + * Check if any of the ctx's have pending work in this hardware queue + */ +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +{ +	unsigned int i; + +	for (i = 0; i < hctx->ctx_map.map_size; i++) +		if (hctx->ctx_map.map[i].word) +			return true; + +	return false; +} + +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, +					      struct blk_mq_ctx *ctx) +{ +	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx)	\ +	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + +/* + * Mark this ctx as having pending work in this hardware queue + */ +static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, +				     struct blk_mq_ctx *ctx) +{ +	struct blk_align_bitmap *bm = get_bm(hctx, ctx); + +	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) +		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); +} + +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, +				      struct blk_mq_ctx *ctx) +{ +	struct blk_align_bitmap *bm = get_bm(hctx, ctx); + +	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); +} + +static int blk_mq_queue_enter(struct request_queue *q) +{ +	int ret; + +	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000); +	smp_wmb(); + +	/* we have problems freezing the queue if it's initializing */ +	if (!blk_queue_dying(q) && +	    (!blk_queue_bypass(q) || !blk_queue_init_done(q))) +		return 0; + +	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000); + +	spin_lock_irq(q->queue_lock); +	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, +		!blk_queue_bypass(q) || blk_queue_dying(q), +		*q->queue_lock); +	/* inc usage with lock hold to avoid freeze_queue runs here */ +	if (!ret && !blk_queue_dying(q)) +		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000); +	else if (blk_queue_dying(q)) +		ret = -ENODEV; +	spin_unlock_irq(q->queue_lock); + +	return ret; +} + +static void blk_mq_queue_exit(struct request_queue *q) +{ +	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000); +} + +void blk_mq_drain_queue(struct request_queue *q) +{ +	while (true) { +		s64 count; + +		spin_lock_irq(q->queue_lock); +		count = percpu_counter_sum(&q->mq_usage_counter); +		spin_unlock_irq(q->queue_lock); + +		if (count == 0) +			break; +		blk_mq_start_hw_queues(q); +		msleep(10); +	} +} + +/* + * Guarantee no request is in use, so we can change any data structure of + * the queue afterward. 
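blk_mq_drain_queue() above is the building block for the freeze/unfreeze pair defined next. A minimal sketch of the intended usage pattern, mirroring what blk_mq_update_tag_set_depth() does later in this patch (the modification step is a placeholder):

	blk_mq_freeze_queue(q);		/* bump bypass_depth, drain in-flight requests */
	/* ... safely modify per-queue data structures here ... */
	blk_mq_unfreeze_queue(q);	/* drop bypass, wake mq_freeze_wq waiters */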
+ */ +static void blk_mq_freeze_queue(struct request_queue *q) +{ +	bool drain; + +	spin_lock_irq(q->queue_lock); +	drain = !q->bypass_depth++; +	queue_flag_set(QUEUE_FLAG_BYPASS, q); +	spin_unlock_irq(q->queue_lock); + +	if (drain) +		blk_mq_drain_queue(q); +} + +static void blk_mq_unfreeze_queue(struct request_queue *q) +{ +	bool wake = false; + +	spin_lock_irq(q->queue_lock); +	if (!--q->bypass_depth) { +		queue_flag_clear(QUEUE_FLAG_BYPASS, q); +		wake = true; +	} +	WARN_ON_ONCE(q->bypass_depth < 0); +	spin_unlock_irq(q->queue_lock); +	if (wake) +		wake_up_all(&q->mq_freeze_wq); +} + +bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) +{ +	return blk_mq_has_free_tags(hctx->tags); +} +EXPORT_SYMBOL(blk_mq_can_queue); + +static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, +			       struct request *rq, unsigned int rw_flags) +{ +	if (blk_queue_io_stat(q)) +		rw_flags |= REQ_IO_STAT; + +	INIT_LIST_HEAD(&rq->queuelist); +	/* csd/requeue_work/fifo_time is initialized before use */ +	rq->q = q; +	rq->mq_ctx = ctx; +	rq->cmd_flags |= rw_flags; +	/* do not touch atomic flags, it needs atomic ops against the timer */ +	rq->cpu = -1; +	INIT_HLIST_NODE(&rq->hash); +	RB_CLEAR_NODE(&rq->rb_node); +	rq->rq_disk = NULL; +	rq->part = NULL; +	rq->start_time = jiffies; +#ifdef CONFIG_BLK_CGROUP +	rq->rl = NULL; +	set_start_time_ns(rq); +	rq->io_start_time_ns = 0; +#endif +	rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) +	rq->nr_integrity_segments = 0; +#endif +	rq->special = NULL; +	/* tag was already set */ +	rq->errors = 0; + +	rq->extra_len = 0; +	rq->sense_len = 0; +	rq->resid_len = 0; +	rq->sense = NULL; + +	INIT_LIST_HEAD(&rq->timeout_list); +	rq->timeout = 0; + +	rq->end_io = NULL; +	rq->end_io_data = NULL; +	rq->next_rq = NULL; + +	ctx->rq_dispatched[rw_is_sync(rw_flags)]++; +} + +static struct request * +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) +{ +	struct request *rq; +	unsigned int tag; + +	tag = blk_mq_get_tag(data); +	if (tag != BLK_MQ_TAG_FAIL) { +		rq = data->hctx->tags->rqs[tag]; + +		rq->cmd_flags = 0; +		if (blk_mq_tag_busy(data->hctx)) { +			rq->cmd_flags = REQ_MQ_INFLIGHT; +			atomic_inc(&data->hctx->nr_active); +		} + +		rq->tag = tag; +		blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); +		return rq; +	} + +	return NULL; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, +		bool reserved) +{ +	struct blk_mq_ctx *ctx; +	struct blk_mq_hw_ctx *hctx; +	struct request *rq; +	struct blk_mq_alloc_data alloc_data; + +	if (blk_mq_queue_enter(q)) +		return NULL; + +	ctx = blk_mq_get_ctx(q); +	hctx = q->mq_ops->map_queue(q, ctx->cpu); +	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, +			reserved, ctx, hctx); + +	rq = __blk_mq_alloc_request(&alloc_data, rw); +	if (!rq && (gfp & __GFP_WAIT)) { +		__blk_mq_run_hw_queue(hctx); +		blk_mq_put_ctx(ctx); + +		ctx = blk_mq_get_ctx(q); +		hctx = q->mq_ops->map_queue(q, ctx->cpu); +		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, +				hctx); +		rq =  __blk_mq_alloc_request(&alloc_data, rw); +		ctx = alloc_data.ctx; +	} +	blk_mq_put_ctx(ctx); +	return rq; +} +EXPORT_SYMBOL(blk_mq_alloc_request); + +static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, +				  struct blk_mq_ctx *ctx, struct request *rq) +{ +	const int tag = rq->tag; +	struct request_queue *q = rq->q; + +	if (rq->cmd_flags & REQ_MQ_INFLIGHT) +		atomic_dec(&hctx->nr_active); + +	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +	blk_mq_put_tag(hctx, tag, 
&ctx->last_tag); +	blk_mq_queue_exit(q); +} + +void blk_mq_free_request(struct request *rq) +{ +	struct blk_mq_ctx *ctx = rq->mq_ctx; +	struct blk_mq_hw_ctx *hctx; +	struct request_queue *q = rq->q; + +	ctx->rq_completed[rq_is_sync(rq)]++; + +	hctx = q->mq_ops->map_queue(q, ctx->cpu); +	__blk_mq_free_request(hctx, ctx, rq); +} + +/* + * Clone all relevant state from a request that has been put on hold in + * the flush state machine into the preallocated flush request that hangs + * off the request queue. + * + * For a driver the flush request should be invisible, that's why we are + * impersonating the original request here. + */ +void blk_mq_clone_flush_request(struct request *flush_rq, +		struct request *orig_rq) +{ +	struct blk_mq_hw_ctx *hctx = +		orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); + +	flush_rq->mq_ctx = orig_rq->mq_ctx; +	flush_rq->tag = orig_rq->tag; +	memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), +		hctx->cmd_size); +} + +inline void __blk_mq_end_io(struct request *rq, int error) +{ +	blk_account_io_done(rq); + +	if (rq->end_io) { +		rq->end_io(rq, error); +	} else { +		if (unlikely(blk_bidi_rq(rq))) +			blk_mq_free_request(rq->next_rq); +		blk_mq_free_request(rq); +	} +} +EXPORT_SYMBOL(__blk_mq_end_io); + +void blk_mq_end_io(struct request *rq, int error) +{ +	if (blk_update_request(rq, error, blk_rq_bytes(rq))) +		BUG(); +	__blk_mq_end_io(rq, error); +} +EXPORT_SYMBOL(blk_mq_end_io); + +static void __blk_mq_complete_request_remote(void *data) +{ +	struct request *rq = data; + +	rq->q->softirq_done_fn(rq); +} + +static void blk_mq_ipi_complete_request(struct request *rq) +{ +	struct blk_mq_ctx *ctx = rq->mq_ctx; +	bool shared = false; +	int cpu; + +	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { +		rq->q->softirq_done_fn(rq); +		return; +	} + +	cpu = get_cpu(); +	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) +		shared = cpus_share_cache(cpu, ctx->cpu); + +	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { +		rq->csd.func = __blk_mq_complete_request_remote; +		rq->csd.info = rq; +		rq->csd.flags = 0; +		smp_call_function_single_async(ctx->cpu, &rq->csd); +	} else { +		rq->q->softirq_done_fn(rq); +	} +	put_cpu(); +} + +void __blk_mq_complete_request(struct request *rq) +{ +	struct request_queue *q = rq->q; + +	if (!q->softirq_done_fn) +		blk_mq_end_io(rq, rq->errors); +	else +		blk_mq_ipi_complete_request(rq); +} + +/** + * blk_mq_complete_request - end I/O on a request + * @rq:		the request being processed + * + * Description: + *	Ends all I/O on a request. It does not handle partial completions. + *	The actual completion happens out-of-order, through a IPI handler. + **/ +void blk_mq_complete_request(struct request *rq) +{ +	struct request_queue *q = rq->q; + +	if (unlikely(blk_should_fake_timeout(q))) +		return; +	if (!blk_mark_rq_complete(rq)) +		__blk_mq_complete_request(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); + +static void blk_mq_start_request(struct request *rq, bool last) +{ +	struct request_queue *q = rq->q; + +	trace_block_rq_issue(q, rq); + +	rq->resid_len = blk_rq_bytes(rq); +	if (unlikely(blk_bidi_rq(rq))) +		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + +	blk_add_timer(rq); + +	/* +	 * Mark us as started and clear complete. Complete might have been +	 * set if requeue raced with timeout, which then marked it as +	 * complete. So be sure to clear complete again when we start +	 * the request, otherwise we'll ignore the completion event. 
+	 */ +	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) +		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) +		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + +	if (q->dma_drain_size && blk_rq_bytes(rq)) { +		/* +		 * Make sure space for the drain appears.  We know we can do +		 * this because max_hw_segments has been adjusted to be one +		 * fewer than the device can handle. +		 */ +		rq->nr_phys_segments++; +	} + +	/* +	 * Flag the last request in the series so that drivers know when IO +	 * should be kicked off, if they don't do it on a per-request basis. +	 * +	 * Note: the flag isn't the only condition drivers should do kick off. +	 * If drive is busy, the last request might not have the bit set. +	 */ +	if (last) +		rq->cmd_flags |= REQ_END; +} + +static void __blk_mq_requeue_request(struct request *rq) +{ +	struct request_queue *q = rq->q; + +	trace_block_rq_requeue(q, rq); +	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + +	rq->cmd_flags &= ~REQ_END; + +	if (q->dma_drain_size && blk_rq_bytes(rq)) +		rq->nr_phys_segments--; +} + +void blk_mq_requeue_request(struct request *rq) +{ +	__blk_mq_requeue_request(rq); +	blk_clear_rq_complete(rq); + +	BUG_ON(blk_queued_rq(rq)); +	blk_mq_add_to_requeue_list(rq, true); +} +EXPORT_SYMBOL(blk_mq_requeue_request); + +static void blk_mq_requeue_work(struct work_struct *work) +{ +	struct request_queue *q = +		container_of(work, struct request_queue, requeue_work); +	LIST_HEAD(rq_list); +	struct request *rq, *next; +	unsigned long flags; + +	spin_lock_irqsave(&q->requeue_lock, flags); +	list_splice_init(&q->requeue_list, &rq_list); +	spin_unlock_irqrestore(&q->requeue_lock, flags); + +	list_for_each_entry_safe(rq, next, &rq_list, queuelist) { +		if (!(rq->cmd_flags & REQ_SOFTBARRIER)) +			continue; + +		rq->cmd_flags &= ~REQ_SOFTBARRIER; +		list_del_init(&rq->queuelist); +		blk_mq_insert_request(rq, true, false, false); +	} + +	while (!list_empty(&rq_list)) { +		rq = list_entry(rq_list.next, struct request, queuelist); +		list_del_init(&rq->queuelist); +		blk_mq_insert_request(rq, false, false, false); +	} + +	blk_mq_run_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +{ +	struct request_queue *q = rq->q; +	unsigned long flags; + +	/* +	 * We abuse this flag that is otherwise used by the I/O scheduler to +	 * request head insertation from the workqueue. 
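The requeue machinery above is driven from two driver-visible calls. A minimal driver-side sketch, assuming the request could not be issued right now (error handling omitted):

	/* Hand the request back to the block layer ... */
	blk_mq_requeue_request(rq);
	/* ... and ask the core to re-dispatch the requeue list from its workqueue. */
	blk_mq_kick_requeue_list(rq->q);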
+	 */ +	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + +	spin_lock_irqsave(&q->requeue_lock, flags); +	if (at_head) { +		rq->cmd_flags |= REQ_SOFTBARRIER; +		list_add(&rq->queuelist, &q->requeue_list); +	} else { +		list_add_tail(&rq->queuelist, &q->requeue_list); +	} +	spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ +	kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + +static inline bool is_flush_request(struct request *rq, unsigned int tag) +{ +	return ((rq->cmd_flags & REQ_FLUSH_SEQ) && +			rq->q->flush_rq->tag == tag); +} + +struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) +{ +	struct request *rq = tags->rqs[tag]; + +	if (!is_flush_request(rq, tag)) +		return rq; + +	return rq->q->flush_rq; +} +EXPORT_SYMBOL(blk_mq_tag_to_rq); + +struct blk_mq_timeout_data { +	struct blk_mq_hw_ctx *hctx; +	unsigned long *next; +	unsigned int *next_set; +}; + +static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) +{ +	struct blk_mq_timeout_data *data = __data; +	struct blk_mq_hw_ctx *hctx = data->hctx; +	unsigned int tag; + +	 /* It may not be in flight yet (this is where +	 * the REQ_ATOMIC_STARTED flag comes in). The requests are +	 * statically allocated, so we know it's always safe to access the +	 * memory associated with a bit offset into ->rqs[]. +	 */ +	tag = 0; +	do { +		struct request *rq; + +		tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); +		if (tag >= hctx->tags->nr_tags) +			break; + +		rq = blk_mq_tag_to_rq(hctx->tags, tag++); +		if (rq->q != hctx->queue) +			continue; +		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) +			continue; + +		blk_rq_check_expired(rq, data->next, data->next_set); +	} while (1); +} + +static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, +					unsigned long *next, +					unsigned int *next_set) +{ +	struct blk_mq_timeout_data data = { +		.hctx		= hctx, +		.next		= next, +		.next_set	= next_set, +	}; + +	/* +	 * Ask the tagging code to iterate busy requests, so we can +	 * check them for timeout. +	 */ +	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); +} + +static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +{ +	struct request_queue *q = rq->q; + +	/* +	 * We know that complete is set at this point. If STARTED isn't set +	 * anymore, then the request isn't active and the "timeout" should +	 * just be ignored. This can happen due to the bitflag ordering. +	 * Timeout first checks if STARTED is set, and if it is, assumes +	 * the request is active. But if we race with completion, then +	 * we both flags will get cleared. So check here again, and ignore +	 * a timeout event with a request that isn't active. 
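blk_mq_rq_timed_out() below falls back to the driver's ->timeout hook when one is set. A sketch of such a hook, with the in-flight check standing in for whatever state the driver actually keeps (the helper name is hypothetical, not from this patch):

	static enum blk_eh_timer_return example_timeout(struct request *rq)
	{
		/* Hypothetical check: the controller still owns this command. */
		if (example_cmd_still_in_flight(rq))
			return BLK_EH_RESET_TIMER;	/* re-arm the timer */

		return BLK_EH_NOT_HANDLED;		/* let the core decide */
	}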
+	 */ +	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) +		return BLK_EH_NOT_HANDLED; + +	if (!q->mq_ops->timeout) +		return BLK_EH_RESET_TIMER; + +	return q->mq_ops->timeout(rq); +} + +static void blk_mq_rq_timer(unsigned long data) +{ +	struct request_queue *q = (struct request_queue *) data; +	struct blk_mq_hw_ctx *hctx; +	unsigned long next = 0; +	int i, next_set = 0; + +	queue_for_each_hw_ctx(q, hctx, i) { +		/* +		 * If not software queues are currently mapped to this +		 * hardware queue, there's nothing to check +		 */ +		if (!hctx->nr_ctx || !hctx->tags) +			continue; + +		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); +	} + +	if (next_set) { +		next = blk_rq_timeout(round_jiffies_up(next)); +		mod_timer(&q->timeout, next); +	} else { +		queue_for_each_hw_ctx(q, hctx, i) +			blk_mq_tag_idle(hctx); +	} +} + +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. + */ +static bool blk_mq_attempt_merge(struct request_queue *q, +				 struct blk_mq_ctx *ctx, struct bio *bio) +{ +	struct request *rq; +	int checked = 8; + +	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { +		int el_ret; + +		if (!checked--) +			break; + +		if (!blk_rq_merge_ok(rq, bio)) +			continue; + +		el_ret = blk_try_merge(rq, bio); +		if (el_ret == ELEVATOR_BACK_MERGE) { +			if (bio_attempt_back_merge(q, rq, bio)) { +				ctx->rq_merged++; +				return true; +			} +			break; +		} else if (el_ret == ELEVATOR_FRONT_MERGE) { +			if (bio_attempt_front_merge(q, rq, bio)) { +				ctx->rq_merged++; +				return true; +			} +			break; +		} +	} + +	return false; +} + +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +{ +	struct blk_mq_ctx *ctx; +	int i; + +	for (i = 0; i < hctx->ctx_map.map_size; i++) { +		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; +		unsigned int off, bit; + +		if (!bm->word) +			continue; + +		bit = 0; +		off = i * hctx->ctx_map.bits_per_word; +		do { +			bit = find_next_bit(&bm->word, bm->depth, bit); +			if (bit >= bm->depth) +				break; + +			ctx = hctx->ctxs[bit + off]; +			clear_bit(bit, &bm->word); +			spin_lock(&ctx->lock); +			list_splice_tail_init(&ctx->rq_list, list); +			spin_unlock(&ctx->lock); + +			bit++; +		} while (1); +	} +} + +/* + * Run this hardware queue, pulling any software queues mapped to it in. + * Note that this function currently has various problems around ordering + * of IO. In particular, we'd like FIFO behaviour on handling existing + * items on the hctx->dispatch list. Ignore that for now. + */ +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ +	struct request_queue *q = hctx->queue; +	struct request *rq; +	LIST_HEAD(rq_list); +	int queued; + +	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); + +	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) +		return; + +	hctx->run++; + +	/* +	 * Touch any software queue that has pending entries. +	 */ +	flush_busy_ctxs(hctx, &rq_list); + +	/* +	 * If we have previous entries on our dispatch list, grab them +	 * and stuff them at the front for more fair dispatch. 
+	 */ +	if (!list_empty_careful(&hctx->dispatch)) { +		spin_lock(&hctx->lock); +		if (!list_empty(&hctx->dispatch)) +			list_splice_init(&hctx->dispatch, &rq_list); +		spin_unlock(&hctx->lock); +	} + +	/* +	 * Now process all the entries, sending them to the driver. +	 */ +	queued = 0; +	while (!list_empty(&rq_list)) { +		int ret; + +		rq = list_first_entry(&rq_list, struct request, queuelist); +		list_del_init(&rq->queuelist); + +		blk_mq_start_request(rq, list_empty(&rq_list)); + +		ret = q->mq_ops->queue_rq(hctx, rq); +		switch (ret) { +		case BLK_MQ_RQ_QUEUE_OK: +			queued++; +			continue; +		case BLK_MQ_RQ_QUEUE_BUSY: +			list_add(&rq->queuelist, &rq_list); +			__blk_mq_requeue_request(rq); +			break; +		default: +			pr_err("blk-mq: bad return on queue: %d\n", ret); +		case BLK_MQ_RQ_QUEUE_ERROR: +			rq->errors = -EIO; +			blk_mq_end_io(rq, rq->errors); +			break; +		} + +		if (ret == BLK_MQ_RQ_QUEUE_BUSY) +			break; +	} + +	if (!queued) +		hctx->dispatched[0]++; +	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) +		hctx->dispatched[ilog2(queued) + 1]++; + +	/* +	 * Any items that need requeuing? Stuff them into hctx->dispatch, +	 * that is where we will continue on next queue run. +	 */ +	if (!list_empty(&rq_list)) { +		spin_lock(&hctx->lock); +		list_splice(&rq_list, &hctx->dispatch); +		spin_unlock(&hctx->lock); +	} +} + +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ +	int cpu = hctx->next_cpu; + +	if (--hctx->next_cpu_batch <= 0) { +		int next_cpu; + +		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); +		if (next_cpu >= nr_cpu_ids) +			next_cpu = cpumask_first(hctx->cpumask); + +		hctx->next_cpu = next_cpu; +		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; +	} + +	return cpu; +} + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ +	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) +		return; + +	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) +		__blk_mq_run_hw_queue(hctx); +	else if (hctx->queue->nr_hw_queues == 1) +		kblockd_schedule_delayed_work(&hctx->run_work, 0); +	else { +		unsigned int cpu; + +		cpu = blk_mq_hctx_next_cpu(hctx); +		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); +	} +} + +void blk_mq_run_queues(struct request_queue *q, bool async) +{ +	struct blk_mq_hw_ctx *hctx; +	int i; + +	queue_for_each_hw_ctx(q, hctx, i) { +		if ((!blk_mq_hctx_has_pending(hctx) && +		    list_empty_careful(&hctx->dispatch)) || +		    test_bit(BLK_MQ_S_STOPPED, &hctx->state)) +			continue; + +		preempt_disable(); +		blk_mq_run_hw_queue(hctx, async); +		preempt_enable(); +	} +} +EXPORT_SYMBOL(blk_mq_run_queues); + +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) +{ +	cancel_delayed_work(&hctx->run_work); +	cancel_delayed_work(&hctx->delay_work); +	set_bit(BLK_MQ_S_STOPPED, &hctx->state); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queue); + +void blk_mq_stop_hw_queues(struct request_queue *q) +{ +	struct blk_mq_hw_ctx *hctx; +	int i; + +	queue_for_each_hw_ctx(q, hctx, i) +		blk_mq_stop_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queues); + +void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) +{ +	clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + +	preempt_disable(); +	blk_mq_run_hw_queue(hctx, false); +	preempt_enable(); +} +EXPORT_SYMBOL(blk_mq_start_hw_queue); + +void 
blk_mq_start_hw_queues(struct request_queue *q) +{ +	struct blk_mq_hw_ctx *hctx; +	int i; + +	queue_for_each_hw_ctx(q, hctx, i) +		blk_mq_start_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queues); + + +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) +{ +	struct blk_mq_hw_ctx *hctx; +	int i; + +	queue_for_each_hw_ctx(q, hctx, i) { +		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) +			continue; + +		clear_bit(BLK_MQ_S_STOPPED, &hctx->state); +		preempt_disable(); +		blk_mq_run_hw_queue(hctx, async); +		preempt_enable(); +	} +} +EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); + +static void blk_mq_run_work_fn(struct work_struct *work) +{ +	struct blk_mq_hw_ctx *hctx; + +	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + +	__blk_mq_run_hw_queue(hctx); +} + +static void blk_mq_delay_work_fn(struct work_struct *work) +{ +	struct blk_mq_hw_ctx *hctx; + +	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); + +	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) +		__blk_mq_run_hw_queue(hctx); +} + +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ +	unsigned long tmo = msecs_to_jiffies(msecs); + +	if (hctx->queue->nr_hw_queues == 1) +		kblockd_schedule_delayed_work(&hctx->delay_work, tmo); +	else { +		unsigned int cpu; + +		cpu = blk_mq_hctx_next_cpu(hctx); +		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); +	} +} +EXPORT_SYMBOL(blk_mq_delay_queue); + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, +				    struct request *rq, bool at_head) +{ +	struct blk_mq_ctx *ctx = rq->mq_ctx; + +	trace_block_rq_insert(hctx->queue, rq); + +	if (at_head) +		list_add(&rq->queuelist, &ctx->rq_list); +	else +		list_add_tail(&rq->queuelist, &ctx->rq_list); + +	blk_mq_hctx_mark_pending(hctx, ctx); +} + +void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, +		bool async) +{ +	struct request_queue *q = rq->q; +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; + +	current_ctx = blk_mq_get_ctx(q); +	if (!cpu_online(ctx->cpu)) +		rq->mq_ctx = ctx = current_ctx; + +	hctx = q->mq_ops->map_queue(q, ctx->cpu); + +	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) && +	    !(rq->cmd_flags & (REQ_FLUSH_SEQ))) { +		blk_insert_flush(rq); +	} else { +		spin_lock(&ctx->lock); +		__blk_mq_insert_request(hctx, rq, at_head); +		spin_unlock(&ctx->lock); +	} + +	if (run_queue) +		blk_mq_run_hw_queue(hctx, async); + +	blk_mq_put_ctx(current_ctx); +} + +static void blk_mq_insert_requests(struct request_queue *q, +				     struct blk_mq_ctx *ctx, +				     struct list_head *list, +				     int depth, +				     bool from_schedule) + +{ +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *current_ctx; + +	trace_block_unplug(q, depth, !from_schedule); + +	current_ctx = blk_mq_get_ctx(q); + +	if (!cpu_online(ctx->cpu)) +		ctx = current_ctx; +	hctx = q->mq_ops->map_queue(q, ctx->cpu); + +	/* +	 * preemption doesn't flush plug list, so it's possible ctx->cpu is +	 * offline now +	 */ +	spin_lock(&ctx->lock); +	while (!list_empty(list)) { +		struct request *rq; + +		rq = list_first_entry(list, struct request, queuelist); +		list_del_init(&rq->queuelist); +		rq->mq_ctx = ctx; +		__blk_mq_insert_request(hctx, rq, false); +	} +	spin_unlock(&ctx->lock); + +	blk_mq_run_hw_queue(hctx, from_schedule); +	blk_mq_put_ctx(current_ctx); +} + +static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +{ +	struct request *rqa = container_of(a, struct request, queuelist); +	
struct request *rqb = container_of(b, struct request, queuelist); + +	return !(rqa->mq_ctx < rqb->mq_ctx || +		 (rqa->mq_ctx == rqb->mq_ctx && +		  blk_rq_pos(rqa) < blk_rq_pos(rqb))); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ +	struct blk_mq_ctx *this_ctx; +	struct request_queue *this_q; +	struct request *rq; +	LIST_HEAD(list); +	LIST_HEAD(ctx_list); +	unsigned int depth; + +	list_splice_init(&plug->mq_list, &list); + +	list_sort(NULL, &list, plug_ctx_cmp); + +	this_q = NULL; +	this_ctx = NULL; +	depth = 0; + +	while (!list_empty(&list)) { +		rq = list_entry_rq(list.next); +		list_del_init(&rq->queuelist); +		BUG_ON(!rq->q); +		if (rq->mq_ctx != this_ctx) { +			if (this_ctx) { +				blk_mq_insert_requests(this_q, this_ctx, +							&ctx_list, depth, +							from_schedule); +			} + +			this_ctx = rq->mq_ctx; +			this_q = rq->q; +			depth = 0; +		} + +		depth++; +		list_add_tail(&rq->queuelist, &ctx_list); +	} + +	/* +	 * If 'this_ctx' is set, we know we have entries to complete +	 * on 'ctx_list'. Do those. +	 */ +	if (this_ctx) { +		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, +				       from_schedule); +	} +} + +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) +{ +	init_request_from_bio(rq, bio); + +	if (blk_do_io_stat(rq)) +		blk_account_io_start(rq, 1); +} + +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, +					 struct blk_mq_ctx *ctx, +					 struct request *rq, struct bio *bio) +{ +	struct request_queue *q = hctx->queue; + +	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { +		blk_mq_bio_to_request(rq, bio); +		spin_lock(&ctx->lock); +insert_rq: +		__blk_mq_insert_request(hctx, rq, false); +		spin_unlock(&ctx->lock); +		return false; +	} else { +		spin_lock(&ctx->lock); +		if (!blk_mq_attempt_merge(q, ctx, bio)) { +			blk_mq_bio_to_request(rq, bio); +			goto insert_rq; +		} + +		spin_unlock(&ctx->lock); +		__blk_mq_free_request(hctx, ctx, rq); +		return true; +	} +} + +struct blk_map_ctx { +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx; +}; + +static struct request *blk_mq_map_request(struct request_queue *q, +					  struct bio *bio, +					  struct blk_map_ctx *data) +{ +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx; +	struct request *rq; +	int rw = bio_data_dir(bio); +	struct blk_mq_alloc_data alloc_data; + +	if (unlikely(blk_mq_queue_enter(q))) { +		bio_endio(bio, -EIO); +		return NULL; +	} + +	ctx = blk_mq_get_ctx(q); +	hctx = q->mq_ops->map_queue(q, ctx->cpu); + +	if (rw_is_sync(bio->bi_rw)) +		rw |= REQ_SYNC; + +	trace_block_getrq(q, bio, rw); +	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, +			hctx); +	rq = __blk_mq_alloc_request(&alloc_data, rw); +	if (unlikely(!rq)) { +		__blk_mq_run_hw_queue(hctx); +		blk_mq_put_ctx(ctx); +		trace_block_sleeprq(q, bio, rw); + +		ctx = blk_mq_get_ctx(q); +		hctx = q->mq_ops->map_queue(q, ctx->cpu); +		blk_mq_set_alloc_data(&alloc_data, q, +				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); +		rq = __blk_mq_alloc_request(&alloc_data, rw); +		ctx = alloc_data.ctx; +		hctx = alloc_data.hctx; +	} + +	hctx->queued++; +	data->hctx = hctx; +	data->ctx = ctx; +	return rq; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. 
+ */ +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ +	const int is_sync = rw_is_sync(bio->bi_rw); +	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); +	struct blk_map_ctx data; +	struct request *rq; + +	blk_queue_bounce(q, &bio); + +	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { +		bio_endio(bio, -EIO); +		return; +	} + +	rq = blk_mq_map_request(q, bio, &data); +	if (unlikely(!rq)) +		return; + +	if (unlikely(is_flush_fua)) { +		blk_mq_bio_to_request(rq, bio); +		blk_insert_flush(rq); +		goto run_queue; +	} + +	if (is_sync) { +		int ret; + +		blk_mq_bio_to_request(rq, bio); +		blk_mq_start_request(rq, true); + +		/* +		 * For OK queue, we are done. For error, kill it. Any other +		 * error (busy), just add it to our list as we previously +		 * would have done +		 */ +		ret = q->mq_ops->queue_rq(data.hctx, rq); +		if (ret == BLK_MQ_RQ_QUEUE_OK) +			goto done; +		else { +			__blk_mq_requeue_request(rq); + +			if (ret == BLK_MQ_RQ_QUEUE_ERROR) { +				rq->errors = -EIO; +				blk_mq_end_io(rq, rq->errors); +				goto done; +			} +		} +	} + +	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { +		/* +		 * For a SYNC request, send it to the hardware immediately. For +		 * an ASYNC request, just ensure that we run it later on. The +		 * latter allows for merging opportunities and more efficient +		 * dispatching. +		 */ +run_queue: +		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); +	} +done: +	blk_mq_put_ctx(data.ctx); +} + +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ +	const int is_sync = rw_is_sync(bio->bi_rw); +	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); +	unsigned int use_plug, request_count = 0; +	struct blk_map_ctx data; +	struct request *rq; + +	/* +	 * If we have multiple hardware queues, just go directly to +	 * one of those for sync IO. +	 */ +	use_plug = !is_flush_fua && !is_sync; + +	blk_queue_bounce(q, &bio); + +	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { +		bio_endio(bio, -EIO); +		return; +	} + +	if (use_plug && !blk_queue_nomerges(q) && +	    blk_attempt_plug_merge(q, bio, &request_count)) +		return; + +	rq = blk_mq_map_request(q, bio, &data); +	if (unlikely(!rq)) +		return; + +	if (unlikely(is_flush_fua)) { +		blk_mq_bio_to_request(rq, bio); +		blk_insert_flush(rq); +		goto run_queue; +	} + +	/* +	 * A task plug currently exists. Since this is completely lockless, +	 * utilize that to temporarily store requests until the task is +	 * either done or scheduled away. +	 */ +	if (use_plug) { +		struct blk_plug *plug = current->plug; + +		if (plug) { +			blk_mq_bio_to_request(rq, bio); +			if (list_empty(&plug->mq_list)) +				trace_block_plug(q); +			else if (request_count >= BLK_MAX_REQUEST_COUNT) { +				blk_flush_plug_list(plug, false); +				trace_block_plug(q); +			} +			list_add_tail(&rq->queuelist, &plug->mq_list); +			blk_mq_put_ctx(data.ctx); +			return; +		} +	} + +	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { +		/* +		 * For a SYNC request, send it to the hardware immediately. For +		 * an ASYNC request, just ensure that we run it later on. The +		 * latter allows for merging opportunities and more efficient +		 * dispatching. 
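The plug handling above only matters when the submitter actually opened a plug. A minimal submitter-side sketch of that pattern (bio setup omitted):

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* bios submitted here are parked on plug->mq_list by blk_sq_make_request() */
	submit_bio(WRITE, bio);
	blk_finish_plug(&plug);		/* flushed through blk_mq_flush_plug_list() */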
+		 */ +run_queue: +		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); +	} + +	blk_mq_put_ctx(data.ctx); +} + +/* + * Default mapping to a software queue, since we use one per CPU. + */ +struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) +{ +	return q->queue_hw_ctx[q->mq_map[cpu]]; +} +EXPORT_SYMBOL(blk_mq_map_queue); + +static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, +		struct blk_mq_tags *tags, unsigned int hctx_idx) +{ +	struct page *page; + +	if (tags->rqs && set->ops->exit_request) { +		int i; + +		for (i = 0; i < tags->nr_tags; i++) { +			if (!tags->rqs[i]) +				continue; +			set->ops->exit_request(set->driver_data, tags->rqs[i], +						hctx_idx, i); +		} +	} + +	while (!list_empty(&tags->page_list)) { +		page = list_first_entry(&tags->page_list, struct page, lru); +		list_del_init(&page->lru); +		__free_pages(page, page->private); +	} + +	kfree(tags->rqs); + +	blk_mq_free_tags(tags); +} + +static size_t order_to_size(unsigned int order) +{ +	return (size_t)PAGE_SIZE << order; +} + +static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, +		unsigned int hctx_idx) +{ +	struct blk_mq_tags *tags; +	unsigned int i, j, entries_per_page, max_order = 4; +	size_t rq_size, left; + +	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, +				set->numa_node); +	if (!tags) +		return NULL; + +	INIT_LIST_HEAD(&tags->page_list); + +	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *), +					GFP_KERNEL, set->numa_node); +	if (!tags->rqs) { +		blk_mq_free_tags(tags); +		return NULL; +	} + +	/* +	 * rq_size is the size of the request plus driver payload, rounded +	 * to the cacheline size +	 */ +	rq_size = round_up(sizeof(struct request) + set->cmd_size, +				cache_line_size()); +	left = rq_size * set->queue_depth; + +	for (i = 0; i < set->queue_depth; ) { +		int this_order = max_order; +		struct page *page; +		int to_do; +		void *p; + +		while (left < order_to_size(this_order - 1) && this_order) +			this_order--; + +		do { +			page = alloc_pages_node(set->numa_node, GFP_KERNEL, +						this_order); +			if (page) +				break; +			if (!this_order--) +				break; +			if (order_to_size(this_order) < rq_size) +				break; +		} while (1); + +		if (!page) +			goto fail; + +		page->private = this_order; +		list_add_tail(&page->lru, &tags->page_list); + +		p = page_address(page); +		entries_per_page = order_to_size(this_order) / rq_size; +		to_do = min(entries_per_page, set->queue_depth - i); +		left -= to_do * rq_size; +		for (j = 0; j < to_do; j++) { +			tags->rqs[i] = p; +			if (set->ops->init_request) { +				if (set->ops->init_request(set->driver_data, +						tags->rqs[i], hctx_idx, i, +						set->numa_node)) +					goto fail; +			} + +			p += rq_size; +			i++; +		} +	} + +	return tags; + +fail: +	pr_warn("%s: failed to allocate requests\n", __func__); +	blk_mq_free_rq_map(set, tags, hctx_idx); +	return NULL; +} + +static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) +{ +	kfree(bitmap->map); +} + +static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) +{ +	unsigned int bpw = 8, total, num_maps, i; + +	bitmap->bits_per_word = bpw; + +	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; +	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), +					GFP_KERNEL, node); +	if (!bitmap->map) +		return -ENOMEM; + +	bitmap->map_size = num_maps; + +	total = nr_cpu_ids; +	for (i = 0; i < num_maps; i++) { +		bitmap->map[i].depth = min(total, bitmap->bits_per_word); +		total -= bitmap->map[i].depth; +	} + 
+	return 0; +} + +static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +{ +	struct request_queue *q = hctx->queue; +	struct blk_mq_ctx *ctx; +	LIST_HEAD(tmp); + +	/* +	 * Move ctx entries to new CPU, if this one is going away. +	 */ +	ctx = __blk_mq_get_ctx(q, cpu); + +	spin_lock(&ctx->lock); +	if (!list_empty(&ctx->rq_list)) { +		list_splice_init(&ctx->rq_list, &tmp); +		blk_mq_hctx_clear_pending(hctx, ctx); +	} +	spin_unlock(&ctx->lock); + +	if (list_empty(&tmp)) +		return NOTIFY_OK; + +	ctx = blk_mq_get_ctx(q); +	spin_lock(&ctx->lock); + +	while (!list_empty(&tmp)) { +		struct request *rq; + +		rq = list_first_entry(&tmp, struct request, queuelist); +		rq->mq_ctx = ctx; +		list_move_tail(&rq->queuelist, &ctx->rq_list); +	} + +	hctx = q->mq_ops->map_queue(q, ctx->cpu); +	blk_mq_hctx_mark_pending(hctx, ctx); + +	spin_unlock(&ctx->lock); + +	blk_mq_run_hw_queue(hctx, true); +	blk_mq_put_ctx(ctx); +	return NOTIFY_OK; +} + +static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) +{ +	struct request_queue *q = hctx->queue; +	struct blk_mq_tag_set *set = q->tag_set; + +	if (set->tags[hctx->queue_num]) +		return NOTIFY_OK; + +	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); +	if (!set->tags[hctx->queue_num]) +		return NOTIFY_STOP; + +	hctx->tags = set->tags[hctx->queue_num]; +	return NOTIFY_OK; +} + +static int blk_mq_hctx_notify(void *data, unsigned long action, +			      unsigned int cpu) +{ +	struct blk_mq_hw_ctx *hctx = data; + +	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) +		return blk_mq_hctx_cpu_offline(hctx, cpu); +	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) +		return blk_mq_hctx_cpu_online(hctx, cpu); + +	return NOTIFY_OK; +} + +static void blk_mq_exit_hw_queues(struct request_queue *q, +		struct blk_mq_tag_set *set, int nr_queue) +{ +	struct blk_mq_hw_ctx *hctx; +	unsigned int i; + +	queue_for_each_hw_ctx(q, hctx, i) { +		if (i == nr_queue) +			break; + +		blk_mq_tag_idle(hctx); + +		if (set->ops->exit_hctx) +			set->ops->exit_hctx(hctx, i); + +		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); +		kfree(hctx->ctxs); +		blk_mq_free_bitmap(&hctx->ctx_map); +	} + +} + +static void blk_mq_free_hw_queues(struct request_queue *q, +		struct blk_mq_tag_set *set) +{ +	struct blk_mq_hw_ctx *hctx; +	unsigned int i; + +	queue_for_each_hw_ctx(q, hctx, i) { +		free_cpumask_var(hctx->cpumask); +		kfree(hctx); +	} +} + +static int blk_mq_init_hw_queues(struct request_queue *q, +		struct blk_mq_tag_set *set) +{ +	struct blk_mq_hw_ctx *hctx; +	unsigned int i; + +	/* +	 * Initialize hardware queues +	 */ +	queue_for_each_hw_ctx(q, hctx, i) { +		int node; + +		node = hctx->numa_node; +		if (node == NUMA_NO_NODE) +			node = hctx->numa_node = set->numa_node; + +		INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); +		INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); +		spin_lock_init(&hctx->lock); +		INIT_LIST_HEAD(&hctx->dispatch); +		hctx->queue = q; +		hctx->queue_num = i; +		hctx->flags = set->flags; +		hctx->cmd_size = set->cmd_size; + +		blk_mq_init_cpu_notifier(&hctx->cpu_notifier, +						blk_mq_hctx_notify, hctx); +		blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + +		hctx->tags = set->tags[i]; + +		/* +		 * Allocate space for all possible cpus to avoid allocation in +		 * runtime +		 */ +		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), +						GFP_KERNEL, node); +		if (!hctx->ctxs) +			break; + +		if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) +			break; + +		hctx->nr_ctx = 0; + +		if 
(set->ops->init_hctx && +		    set->ops->init_hctx(hctx, set->driver_data, i)) +			break; +	} + +	if (i == q->nr_hw_queues) +		return 0; + +	/* +	 * Init failed +	 */ +	blk_mq_exit_hw_queues(q, set, i); + +	return 1; +} + +static void blk_mq_init_cpu_queues(struct request_queue *q, +				   unsigned int nr_hw_queues) +{ +	unsigned int i; + +	for_each_possible_cpu(i) { +		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); +		struct blk_mq_hw_ctx *hctx; + +		memset(__ctx, 0, sizeof(*__ctx)); +		__ctx->cpu = i; +		spin_lock_init(&__ctx->lock); +		INIT_LIST_HEAD(&__ctx->rq_list); +		__ctx->queue = q; + +		/* If the cpu isn't online, the cpu is mapped to first hctx */ +		if (!cpu_online(i)) +			continue; + +		hctx = q->mq_ops->map_queue(q, i); +		cpumask_set_cpu(i, hctx->cpumask); +		hctx->nr_ctx++; + +		/* +		 * Set local node, IFF we have more than one hw queue. If +		 * not, we remain on the home node of the device +		 */ +		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) +			hctx->numa_node = cpu_to_node(i); +	} +} + +static void blk_mq_map_swqueue(struct request_queue *q) +{ +	unsigned int i; +	struct blk_mq_hw_ctx *hctx; +	struct blk_mq_ctx *ctx; + +	queue_for_each_hw_ctx(q, hctx, i) { +		cpumask_clear(hctx->cpumask); +		hctx->nr_ctx = 0; +	} + +	/* +	 * Map software to hardware queues +	 */ +	queue_for_each_ctx(q, ctx, i) { +		/* If the cpu isn't online, the cpu is mapped to first hctx */ +		if (!cpu_online(i)) +			continue; + +		hctx = q->mq_ops->map_queue(q, i); +		cpumask_set_cpu(i, hctx->cpumask); +		ctx->index_hw = hctx->nr_ctx; +		hctx->ctxs[hctx->nr_ctx++] = ctx; +	} + +	queue_for_each_hw_ctx(q, hctx, i) { +		/* +		 * If not software queues are mapped to this hardware queue, +		 * disable it and free the request entries +		 */ +		if (!hctx->nr_ctx) { +			struct blk_mq_tag_set *set = q->tag_set; + +			if (set->tags[i]) { +				blk_mq_free_rq_map(set, set->tags[i], i); +				set->tags[i] = NULL; +				hctx->tags = NULL; +			} +			continue; +		} + +		/* +		 * Initialize batch roundrobin counts +		 */ +		hctx->next_cpu = cpumask_first(hctx->cpumask); +		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; +	} +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +{ +	struct blk_mq_hw_ctx *hctx; +	struct request_queue *q; +	bool shared; +	int i; + +	if (set->tag_list.next == set->tag_list.prev) +		shared = false; +	else +		shared = true; + +	list_for_each_entry(q, &set->tag_list, tag_set_list) { +		blk_mq_freeze_queue(q); + +		queue_for_each_hw_ctx(q, hctx, i) { +			if (shared) +				hctx->flags |= BLK_MQ_F_TAG_SHARED; +			else +				hctx->flags &= ~BLK_MQ_F_TAG_SHARED; +		} +		blk_mq_unfreeze_queue(q); +	} +} + +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ +	struct blk_mq_tag_set *set = q->tag_set; + +	blk_mq_freeze_queue(q); + +	mutex_lock(&set->tag_list_lock); +	list_del_init(&q->tag_set_list); +	blk_mq_update_tag_set_depth(set); +	mutex_unlock(&set->tag_list_lock); + +	blk_mq_unfreeze_queue(q); +} + +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, +				     struct request_queue *q) +{ +	q->tag_set = set; + +	mutex_lock(&set->tag_list_lock); +	list_add_tail(&q->tag_set_list, &set->tag_list); +	blk_mq_update_tag_set_depth(set); +	mutex_unlock(&set->tag_list_lock); +} + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ +	struct blk_mq_hw_ctx **hctxs; +	struct blk_mq_ctx __percpu *ctx; +	struct request_queue *q; +	unsigned int *map; +	int i; + +	ctx = alloc_percpu(struct blk_mq_ctx); +	if (!ctx) +		return 
ERR_PTR(-ENOMEM); + +	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, +			set->numa_node); + +	if (!hctxs) +		goto err_percpu; + +	map = blk_mq_make_queue_map(set); +	if (!map) +		goto err_map; + +	for (i = 0; i < set->nr_hw_queues; i++) { +		int node = blk_mq_hw_queue_to_node(map, i); + +		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), +					GFP_KERNEL, node); +		if (!hctxs[i]) +			goto err_hctxs; + +		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) +			goto err_hctxs; + +		atomic_set(&hctxs[i]->nr_active, 0); +		hctxs[i]->numa_node = node; +		hctxs[i]->queue_num = i; +	} + +	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); +	if (!q) +		goto err_hctxs; + +	if (percpu_counter_init(&q->mq_usage_counter, 0)) +		goto err_map; + +	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); +	blk_queue_rq_timeout(q, 30000); + +	q->nr_queues = nr_cpu_ids; +	q->nr_hw_queues = set->nr_hw_queues; +	q->mq_map = map; + +	q->queue_ctx = ctx; +	q->queue_hw_ctx = hctxs; + +	q->mq_ops = set->ops; +	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + +	if (!(set->flags & BLK_MQ_F_SG_MERGE)) +		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; + +	q->sg_reserved_size = INT_MAX; + +	INIT_WORK(&q->requeue_work, blk_mq_requeue_work); +	INIT_LIST_HEAD(&q->requeue_list); +	spin_lock_init(&q->requeue_lock); + +	if (q->nr_hw_queues > 1) +		blk_queue_make_request(q, blk_mq_make_request); +	else +		blk_queue_make_request(q, blk_sq_make_request); + +	blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); +	if (set->timeout) +		blk_queue_rq_timeout(q, set->timeout); + +	/* +	 * Do this after blk_queue_make_request() overrides it... +	 */ +	q->nr_requests = set->queue_depth; + +	if (set->ops->complete) +		blk_queue_softirq_done(q, set->ops->complete); + +	blk_mq_init_flush(q); +	blk_mq_init_cpu_queues(q, set->nr_hw_queues); + +	q->flush_rq = kzalloc(round_up(sizeof(struct request) + +				set->cmd_size, cache_line_size()), +				GFP_KERNEL); +	if (!q->flush_rq) +		goto err_hw; + +	if (blk_mq_init_hw_queues(q, set)) +		goto err_flush_rq; + +	mutex_lock(&all_q_mutex); +	list_add_tail(&q->all_q_node, &all_q_list); +	mutex_unlock(&all_q_mutex); + +	blk_mq_add_queue_tag_set(set, q); + +	blk_mq_map_swqueue(q); + +	return q; + +err_flush_rq: +	kfree(q->flush_rq); +err_hw: +	blk_cleanup_queue(q); +err_hctxs: +	kfree(map); +	for (i = 0; i < set->nr_hw_queues; i++) { +		if (!hctxs[i]) +			break; +		free_cpumask_var(hctxs[i]->cpumask); +		kfree(hctxs[i]); +	} +err_map: +	kfree(hctxs); +err_percpu: +	free_percpu(ctx); +	return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(blk_mq_init_queue); + +void blk_mq_free_queue(struct request_queue *q) +{ +	struct blk_mq_tag_set	*set = q->tag_set; + +	blk_mq_del_queue_tag_set(q); + +	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); +	blk_mq_free_hw_queues(q, set); + +	percpu_counter_destroy(&q->mq_usage_counter); + +	free_percpu(q->queue_ctx); +	kfree(q->queue_hw_ctx); +	kfree(q->mq_map); + +	q->queue_ctx = NULL; +	q->queue_hw_ctx = NULL; +	q->mq_map = NULL; + +	mutex_lock(&all_q_mutex); +	list_del_init(&q->all_q_node); +	mutex_unlock(&all_q_mutex); +} + +/* Basically redo blk_mq_init_queue with queue frozen */ +static void blk_mq_queue_reinit(struct request_queue *q) +{ +	blk_mq_freeze_queue(q); + +	blk_mq_sysfs_unregister(q); + +	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + +	/* +	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. 
FIXME: maybe +	 * we should change hctx numa_node according to new topology (this +	 * involves free and re-allocate memory, worthy doing?) +	 */ + +	blk_mq_map_swqueue(q); + +	blk_mq_sysfs_register(q); + +	blk_mq_unfreeze_queue(q); +} + +static int blk_mq_queue_reinit_notify(struct notifier_block *nb, +				      unsigned long action, void *hcpu) +{ +	struct request_queue *q; + +	/* +	 * Before new mappings are established, hotadded cpu might already +	 * start handling requests. This doesn't break anything as we map +	 * offline CPUs to first hardware queue. We will re-init the queue +	 * below to get optimal settings. +	 */ +	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && +	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) +		return NOTIFY_OK; + +	mutex_lock(&all_q_mutex); +	list_for_each_entry(q, &all_q_list, all_q_node) +		blk_mq_queue_reinit(q); +	mutex_unlock(&all_q_mutex); +	return NOTIFY_OK; +} + +/* + * Alloc a tag set to be associated with one or more request queues. + * May fail with EINVAL for various error conditions. May adjust the + * requested depth down, if if it too large. In that case, the set + * value will be stored in set->queue_depth. + */ +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) +{ +	int i; + +	if (!set->nr_hw_queues) +		return -EINVAL; +	if (!set->queue_depth) +		return -EINVAL; +	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) +		return -EINVAL; + +	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) +		return -EINVAL; + +	if (set->queue_depth > BLK_MQ_MAX_DEPTH) { +		pr_info("blk-mq: reduced tag depth to %u\n", +			BLK_MQ_MAX_DEPTH); +		set->queue_depth = BLK_MQ_MAX_DEPTH; +	} + +	set->tags = kmalloc_node(set->nr_hw_queues * +				 sizeof(struct blk_mq_tags *), +				 GFP_KERNEL, set->numa_node); +	if (!set->tags) +		goto out; + +	for (i = 0; i < set->nr_hw_queues; i++) { +		set->tags[i] = blk_mq_init_rq_map(set, i); +		if (!set->tags[i]) +			goto out_unwind; +	} + +	mutex_init(&set->tag_list_lock); +	INIT_LIST_HEAD(&set->tag_list); + +	return 0; + +out_unwind: +	while (--i >= 0) +		blk_mq_free_rq_map(set, set->tags[i], i); +out: +	return -ENOMEM; +} +EXPORT_SYMBOL(blk_mq_alloc_tag_set); + +void blk_mq_free_tag_set(struct blk_mq_tag_set *set) +{ +	int i; + +	for (i = 0; i < set->nr_hw_queues; i++) { +		if (set->tags[i]) +			blk_mq_free_rq_map(set, set->tags[i], i); +	} + +	kfree(set->tags); +} +EXPORT_SYMBOL(blk_mq_free_tag_set); + +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ +	struct blk_mq_tag_set *set = q->tag_set; +	struct blk_mq_hw_ctx *hctx; +	int i, ret; + +	if (!set || nr > set->queue_depth) +		return -EINVAL; + +	ret = 0; +	queue_for_each_hw_ctx(q, hctx, i) { +		ret = blk_mq_tag_update_depth(hctx->tags, nr); +		if (ret) +			break; +	} + +	if (!ret) +		q->nr_requests = nr; + +	return ret; +} + +void blk_mq_disable_hotplug(void) +{ +	mutex_lock(&all_q_mutex); +} + +void blk_mq_enable_hotplug(void) +{ +	mutex_unlock(&all_q_mutex); +} + +static int __init blk_mq_init(void) +{ +	blk_mq_cpu_init(); + +	/* Must be called after percpu_counter_hotcpu_callback() */ +	hotcpu_notifier(blk_mq_queue_reinit_notify, -10); + +	return 0; +} +subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h new file mode 100644 index 00000000000..26460884c6c --- /dev/null +++ b/block/blk-mq.h @@ -0,0 +1,117 @@ +#ifndef INT_BLK_MQ_H +#define INT_BLK_MQ_H + +struct blk_mq_tag_set; + +struct blk_mq_ctx { +	struct { +		spinlock_t		lock; +		struct list_head	rq_list; +	}  
____cacheline_aligned_in_smp; + +	unsigned int		cpu; +	unsigned int		index_hw; + +	unsigned int		last_tag ____cacheline_aligned_in_smp; + +	/* incremented at dispatch time */ +	unsigned long		rq_dispatched[2]; +	unsigned long		rq_merged; + +	/* incremented at completion time */ +	unsigned long		____cacheline_aligned_in_smp rq_completed[2]; + +	struct request_queue	*queue; +	struct kobject		kobj; +} ____cacheline_aligned_in_smp; + +void __blk_mq_complete_request(struct request *rq); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_init_flush(struct request_queue *q); +void blk_mq_drain_queue(struct request_queue *q); +void blk_mq_free_queue(struct request_queue *q); +void blk_mq_clone_flush_request(struct request *flush_rq, +		struct request *orig_rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); + +/* + * CPU hotplug helpers + */ +struct blk_mq_cpu_notifier; +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, +			      int (*fn)(void *, unsigned long, unsigned int), +			      void *data); +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_cpu_init(void); +void blk_mq_enable_hotplug(void); +void blk_mq_disable_hotplug(void); + +/* + * CPU -> queue mappings + */ +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); + +/* + * sysfs helpers + */ +extern int blk_mq_sysfs_register(struct request_queue *q); +extern void blk_mq_sysfs_unregister(struct request_queue *q); + +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. + */ +struct blk_align_bitmap { +	unsigned long word; +	unsigned long depth; +} ____cacheline_aligned_in_smp; + +static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, +					   unsigned int cpu) +{ +	return per_cpu_ptr(q->queue_ctx, cpu); +} + +/* + * This assumes per-cpu software queueing queues. They could be per-node + * as well, for instance. For now this is hardcoded as-is. Note that we don't + * care about preemption, since we know the ctx's are persistent. This does + * mean that we can't rely on ctx always matching the currently running CPU. 
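As the comment above notes, blk_mq_get_ctx() pins the caller to a CPU, so every call must be paired with blk_mq_put_ctx(). A sketch of the pairing, following the pattern blk_mq_insert_request() uses earlier in this patch:

	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;

	ctx = blk_mq_get_ctx(q);			/* get_cpu(): preemption off */
	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	/* ... mark work pending on ctx/hctx ... */
	blk_mq_put_ctx(ctx);				/* put_cpu(): preemption back on */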
+ */ +static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) +{ +	return __blk_mq_get_ctx(q, get_cpu()); +} + +static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) +{ +	put_cpu(); +} + +struct blk_mq_alloc_data { +	/* input parameter */ +	struct request_queue *q; +	gfp_t gfp; +	bool reserved; + +	/* input & output parameter */ +	struct blk_mq_ctx *ctx; +	struct blk_mq_hw_ctx *hctx; +}; + +static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, +		struct request_queue *q, gfp_t gfp, bool reserved, +		struct blk_mq_ctx *ctx, +		struct blk_mq_hw_ctx *hctx) +{ +	data->q = q; +	data->gfp = gfp; +	data->reserved = reserved; +	data->ctx = ctx; +	data->hctx = hctx; +} + +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index c50ecf0ea3b..f1a1795a568 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim)  	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;  	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;  	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; +	lim->chunk_sectors = 0;  	lim->max_write_same_sectors = 0;  	lim->max_discard_sectors = 0;  	lim->discard_granularity = 0; @@ -144,6 +145,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)  	lim->discard_zeroes_data = 1;  	lim->max_segments = USHRT_MAX;  	lim->max_hw_sectors = UINT_MAX; +	lim->max_segment_size = UINT_MAX;  	lim->max_sectors = UINT_MAX;  	lim->max_write_same_sectors = UINT_MAX;  } @@ -195,17 +197,17 @@ EXPORT_SYMBOL(blk_queue_make_request);  /**   * blk_queue_bounce_limit - set bounce buffer limit for queue   * @q: the request queue for the device - * @dma_mask: the maximum address the device can handle + * @max_addr: the maximum address the device can handle   *   * Description:   *    Different hardware can have different requirements as to what pages   *    it can do I/O directly to. A low level driver can call   *    blk_queue_bounce_limit to have lower memory pages allocated as bounce - *    buffers for doing I/O to pages residing above @dma_mask. + *    buffers for doing I/O to pages residing above @max_addr.   **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) +void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)  { -	unsigned long b_pfn = dma_mask >> PAGE_SHIFT; +	unsigned long b_pfn = max_addr >> PAGE_SHIFT;  	int dma = 0;  	q->bounce_gfp = GFP_NOIO; @@ -276,6 +278,26 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto  EXPORT_SYMBOL(blk_queue_max_hw_sectors);  /** + * blk_queue_chunk_sectors - set size of the chunk for this queue + * @q:  the request queue for the device + * @chunk_sectors:  chunk sectors in the usual 512b unit + * + * Description: + *    If a driver doesn't want IOs to cross a given chunk size, it can set + *    this limit and prevent merging across chunks. Note that the chunk size + *    must currently be a power-of-2 in sectors. Also note that the block + *    layer must accept a page worth of data at any offset. So if the + *    crossing of chunks is a hard limitation in the driver, it must still be + *    prepared to split single page bios. 
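A minimal usage sketch for the new limit documented above, assuming a hypothetical driver whose device cannot service I/O that crosses 128K boundaries (the function and its caller are illustrative, not from this patch):

	static void example_set_limits(struct request_queue *q)
	{
		/* 128K boundary = 256 sectors of 512b; must be a power of two */
		blk_queue_chunk_sectors(q, 256);
	}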
+ **/ +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +{ +	BUG_ON(!is_power_of_2(chunk_sectors)); +	q->limits.chunk_sectors = chunk_sectors; +} +EXPORT_SYMBOL(blk_queue_chunk_sectors); + +/**   * blk_queue_max_discard_sectors - set max sectors for a single discard   * @q:  the request queue for the device   * @max_discard_sectors: maximum number of sectors to discard @@ -591,6 +613,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,  		ret = -1;  	} +	t->raid_partial_stripes_expensive = +		max(t->raid_partial_stripes_expensive, +		    b->raid_partial_stripes_expensive); +  	/* Find lowest common alignment_offset */  	t->alignment_offset = lcm(t->alignment_offset, alignment)  		& (max(t->physical_block_size, t->io_min) - 1); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ec9e60636f4..53b1737e978 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -23,20 +23,20 @@ static void blk_done_softirq(struct softirq_action *h)  	struct list_head *cpu_list, local_list;  	local_irq_disable(); -	cpu_list = &__get_cpu_var(blk_cpu_done); +	cpu_list = this_cpu_ptr(&blk_cpu_done);  	list_replace_init(cpu_list, &local_list);  	local_irq_enable();  	while (!list_empty(&local_list)) {  		struct request *rq; -		rq = list_entry(local_list.next, struct request, csd.list); -		list_del_init(&rq->csd.list); +		rq = list_entry(local_list.next, struct request, ipi_list); +		list_del_init(&rq->ipi_list);  		rq->q->softirq_done_fn(rq);  	}  } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP  static void trigger_softirq(void *data)  {  	struct request *rq = data; @@ -44,10 +44,10 @@ static void trigger_softirq(void *data)  	struct list_head *list;  	local_irq_save(flags); -	list = &__get_cpu_var(blk_cpu_done); -	list_add_tail(&rq->csd.list, list); +	list = this_cpu_ptr(&blk_cpu_done); +	list_add_tail(&rq->ipi_list, list); -	if (list->next == &rq->csd.list) +	if (list->next == &rq->ipi_list)  		raise_softirq_irqoff(BLOCK_SOFTIRQ);  	local_irq_restore(flags); @@ -65,13 +65,13 @@ static int raise_blk_irq(int cpu, struct request *rq)  		data->info = rq;  		data->flags = 0; -		__smp_call_function_single(cpu, data, 0); +		smp_call_function_single_async(cpu, data);  		return 0;  	}  	return 1;  } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */  static int raise_blk_irq(int cpu, struct request *rq)  {  	return 1; @@ -90,7 +90,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,  		local_irq_disable();  		list_splice_init(&per_cpu(blk_cpu_done, cpu), -				 &__get_cpu_var(blk_cpu_done)); +				 this_cpu_ptr(&blk_cpu_done));  		raise_softirq_irqoff(BLOCK_SOFTIRQ);  		local_irq_enable();  	} @@ -135,8 +135,8 @@ void __blk_complete_request(struct request *req)  	if (ccpu == cpu || shared) {  		struct list_head *list;  do_local: -		list = &__get_cpu_var(blk_cpu_done); -		list_add_tail(&req->csd.list, list); +		list = this_cpu_ptr(&blk_cpu_done); +		list_add_tail(&req->ipi_list, list);  		/*  		 * if the list only contains our just added request, @@ -144,7 +144,7 @@ do_local:  		 * entries there, someone already raised the irq but it  		 * hasn't run yet.  		 
*/ -		if (list->next == &req->csd.list) +		if (list->next == &req->ipi_list)  			raise_softirq_irqoff(BLOCK_SOFTIRQ);  	} else if (raise_blk_irq(ccpu, req))  		goto do_local; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3aa5b195f4d..23321fbab29 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -7,9 +7,11 @@  #include <linux/bio.h>  #include <linux/blkdev.h>  #include <linux/blktrace_api.h> +#include <linux/blk-mq.h>  #include "blk.h"  #include "blk-cgroup.h" +#include "blk-mq.h"  struct queue_sysfs_entry {  	struct attribute attr; @@ -46,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)  static ssize_t  queue_requests_store(struct request_queue *q, const char *page, size_t count)  { -	struct request_list *rl;  	unsigned long nr; -	int ret; +	int ret, err; -	if (!q->request_fn) +	if (!q->request_fn && !q->mq_ops)  		return -EINVAL;  	ret = queue_var_store(&nr, page, count); @@ -60,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)  	if (nr < BLKDEV_MIN_RQ)  		nr = BLKDEV_MIN_RQ; -	spin_lock_irq(q->queue_lock); -	q->nr_requests = nr; -	blk_queue_congestion_threshold(q); - -	/* congestion isn't cgroup aware and follows root blkcg for now */ -	rl = &q->root_rl; - -	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) -		blk_set_queue_congested(q, BLK_RW_SYNC); -	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) -		blk_clear_queue_congested(q, BLK_RW_SYNC); - -	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) -		blk_set_queue_congested(q, BLK_RW_ASYNC); -	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) -		blk_clear_queue_congested(q, BLK_RW_ASYNC); - -	blk_queue_for_each_rl(rl, q) { -		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { -			blk_set_rl_full(rl, BLK_RW_SYNC); -		} else { -			blk_clear_rl_full(rl, BLK_RW_SYNC); -			wake_up(&rl->wait[BLK_RW_SYNC]); -		} - -		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { -			blk_set_rl_full(rl, BLK_RW_ASYNC); -		} else { -			blk_clear_rl_full(rl, BLK_RW_ASYNC); -			wake_up(&rl->wait[BLK_RW_ASYNC]); -		} -	} +	if (q->request_fn) +		err = blk_update_nr_requests(q, nr); +	else +		err = blk_mq_update_nr_requests(q, nr); + +	if (err) +		return err; -	spin_unlock_irq(q->queue_lock);  	return ret;  } @@ -287,7 +262,7 @@ static ssize_t  queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)  {  	ssize_t ret = -EINVAL; -#if defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP  	unsigned long val;  	ret = queue_var_store(&val, page, count); @@ -542,6 +517,11 @@ static void blk_release_queue(struct kobject *kobj)  	if (q->queue_tags)  		__blk_queue_free_tags(q); +	if (q->mq_ops) +		blk_mq_free_queue(q); + +	kfree(q->flush_rq); +  	blk_trace_shutdown(q);  	bdi_destroy(&q->backing_dev_info); @@ -575,6 +555,7 @@ int blk_register_queue(struct gendisk *disk)  	 * bypass from queue allocation.  	 
*/  	blk_queue_bypass_end(q); +	queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);  	ret = blk_trace_init_sysfs(dev);  	if (ret) @@ -588,6 +569,9 @@ int blk_register_queue(struct gendisk *disk)  	kobject_uevent(&q->kobj, KOBJ_ADD); +	if (q->mq_ops) +		blk_mq_register_disk(disk); +  	if (!q->request_fn)  		return 0; @@ -610,6 +594,9 @@ void blk_unregister_queue(struct gendisk *disk)  	if (WARN_ON(!q))  		return; +	if (q->mq_ops) +		blk_mq_unregister_disk(disk); +  	if (q->request_fn)  		elv_unregister_queue(q); diff --git a/block/blk-tag.c b/block/blk-tag.c index 3f33d867226..a185b86741e 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -27,18 +27,15 @@ struct request *blk_queue_find_tag(struct request_queue *q, int tag)  EXPORT_SYMBOL(blk_queue_find_tag);  /** - * __blk_free_tags - release a given set of tag maintenance info + * blk_free_tags - release a given set of tag maintenance info   * @bqt:	the tag map to free   * - * Tries to free the specified @bqt.  Returns true if it was - * actually freed and false if there are still references using it + * Drop the reference count on @bqt and frees it when the last reference + * is dropped.   */ -static int __blk_free_tags(struct blk_queue_tag *bqt) +void blk_free_tags(struct blk_queue_tag *bqt)  { -	int retval; - -	retval = atomic_dec_and_test(&bqt->refcnt); -	if (retval) { +	if (atomic_dec_and_test(&bqt->refcnt)) {  		BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <  							bqt->max_depth); @@ -50,9 +47,8 @@ static int __blk_free_tags(struct blk_queue_tag *bqt)  		kfree(bqt);  	} - -	return retval;  } +EXPORT_SYMBOL(blk_free_tags);  /**   * __blk_queue_free_tags - release tag maintenance info @@ -69,28 +65,13 @@ void __blk_queue_free_tags(struct request_queue *q)  	if (!bqt)  		return; -	__blk_free_tags(bqt); +	blk_free_tags(bqt);  	q->queue_tags = NULL;  	queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);  }  /** - * blk_free_tags - release a given set of tag maintenance info - * @bqt:	the tag map to free - * - * For externally managed @bqt frees the map.  Callers of this - * function must guarantee to have released all the queues that - * might have been using this tag map. - */ -void blk_free_tags(struct blk_queue_tag *bqt) -{ -	if (unlikely(!__blk_free_tags(bqt))) -		BUG(); -} -EXPORT_SYMBOL(blk_free_tags); - -/**   * blk_queue_free_tags - release tag maintenance info   * @q:  the request queue for the device   * diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8331aba9426..3fdb21a390c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -256,6 +256,12 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)  	}								\  } while (0) +static void tg_stats_init(struct tg_stats_cpu *tg_stats) +{ +	blkg_rwstat_init(&tg_stats->service_bytes); +	blkg_rwstat_init(&tg_stats->serviced); +} +  /*   * Worker for allocating per cpu stat for tgs. 
This is scheduled on the   * system_wq once there are some groups on the alloc_list waiting for @@ -269,12 +275,16 @@ static void tg_stats_alloc_fn(struct work_struct *work)  alloc_stats:  	if (!stats_cpu) { +		int cpu; +  		stats_cpu = alloc_percpu(struct tg_stats_cpu);  		if (!stats_cpu) {  			/* allocation failed, try again after some time */  			schedule_delayed_work(dwork, msecs_to_jiffies(10));  			return;  		} +		for_each_possible_cpu(cpu) +			tg_stats_init(per_cpu_ptr(stats_cpu, cpu));  	}  	spin_lock_irq(&tg_stats_alloc_lock); @@ -734,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,  static bool throtl_slice_used(struct throtl_grp *tg, bool rw)  {  	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) -		return 0; +		return false;  	return 1;  } @@ -832,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,  	if (tg->io_disp[rw] + 1 <= io_allowed) {  		if (wait)  			*wait = 0; -		return 1; +		return true;  	}  	/* Calc approx time to dispatch */ @@ -867,14 +877,14 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,  	do_div(tmp, HZ);  	bytes_allowed = tmp; -	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { +	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {  		if (wait)  			*wait = 0; -		return 1; +		return true;  	}  	/* Calc approx time to dispatch */ -	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; +	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;  	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);  	if (!jiffy_wait) @@ -913,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,  	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {  		if (wait)  			*wait = 0; -		return 1; +		return true;  	}  	/* @@ -977,7 +987,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)  	bool rw = bio_data_dir(bio);  	/* Charge the bio to the group */ -	tg->bytes_disp[rw] += bio->bi_size; +	tg->bytes_disp[rw] += bio->bi_iter.bi_size;  	tg->io_disp[rw]++;  	/* @@ -993,8 +1003,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)  	 */  	if (!(bio->bi_rw & REQ_THROTTLED)) {  		bio->bi_rw |= REQ_THROTTLED; -		throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, -					     bio->bi_rw); +		throtl_update_dispatch_stats(tg_to_blkg(tg), +					     bio->bi_iter.bi_size, bio->bi_rw);  	}  } @@ -1248,7 +1258,7 @@ out_unlock:   * of throtl_data->service_queue.  Those bio's are ready and issued by this   * function.   
*/ -void blk_throtl_dispatch_work_fn(struct work_struct *work) +static void blk_throtl_dispatch_work_fn(struct work_struct *work)  {  	struct throtl_data *td = container_of(work, struct throtl_data,  					      dispatch_work); @@ -1293,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,  	return __blkg_prfill_rwstat(sf, pd, &rwstat);  } -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, -			       struct cftype *cft, struct seq_file *sf) +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)  { -	struct blkcg *blkcg = css_to_blkcg(css); - -	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, -			  cft->private, true); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, +			  &blkcg_policy_throtl, seq_cft(sf)->private, true);  	return 0;  } @@ -1325,26 +1332,24 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,  	return __blkg_prfill_u64(sf, pd, v);  } -static int tg_print_conf_u64(struct cgroup_subsys_state *css, -			     struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_u64(struct seq_file *sf, void *v)  { -	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, -			  &blkcg_policy_throtl, cft->private, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, +			  &blkcg_policy_throtl, seq_cft(sf)->private, false);  	return 0;  } -static int tg_print_conf_uint(struct cgroup_subsys_state *css, -			      struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_uint(struct seq_file *sf, void *v)  { -	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, -			  &blkcg_policy_throtl, cft->private, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, +			  &blkcg_policy_throtl, seq_cft(sf)->private, false);  	return 0;  } -static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, -		       const char *buf, bool is_u64) +static ssize_t tg_set_conf(struct kernfs_open_file *of, +			   char *buf, size_t nbytes, loff_t off, bool is_u64)  { -	struct blkcg *blkcg = css_to_blkcg(css); +	struct blkcg *blkcg = css_to_blkcg(of_css(of));  	struct blkg_conf_ctx ctx;  	struct throtl_grp *tg;  	struct throtl_service_queue *sq; @@ -1363,9 +1368,9 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,  		ctx.v = -1;  	if (is_u64) -		*(u64 *)((void *)tg + cft->private) = ctx.v; +		*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;  	else -		*(unsigned int *)((void *)tg + cft->private) = ctx.v; +		*(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;  	throtl_log(&tg->service_queue,  		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1399,59 +1404,55 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,  	}  	blkg_conf_finish(&ctx); -	return 0; +	return nbytes;  } -static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, -			   const char *buf) +static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, +			       char *buf, size_t nbytes, loff_t off)  { -	return tg_set_conf(css, cft, buf, true); +	return tg_set_conf(of, buf, nbytes, off, true);  } -static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, -			    const char *buf) +static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off)  { -	return tg_set_conf(css, cft, buf, false); +	return tg_set_conf(of, buf, nbytes, off, false);  }  static struct cftype throtl_files[] = {  	{  		
.name = "throttle.read_bps_device",  		.private = offsetof(struct throtl_grp, bps[READ]), -		.read_seq_string = tg_print_conf_u64, -		.write_string = tg_set_conf_u64, -		.max_write_len = 256, +		.seq_show = tg_print_conf_u64, +		.write = tg_set_conf_u64,  	},  	{  		.name = "throttle.write_bps_device",  		.private = offsetof(struct throtl_grp, bps[WRITE]), -		.read_seq_string = tg_print_conf_u64, -		.write_string = tg_set_conf_u64, -		.max_write_len = 256, +		.seq_show = tg_print_conf_u64, +		.write = tg_set_conf_u64,  	},  	{  		.name = "throttle.read_iops_device",  		.private = offsetof(struct throtl_grp, iops[READ]), -		.read_seq_string = tg_print_conf_uint, -		.write_string = tg_set_conf_uint, -		.max_write_len = 256, +		.seq_show = tg_print_conf_uint, +		.write = tg_set_conf_uint,  	},  	{  		.name = "throttle.write_iops_device",  		.private = offsetof(struct throtl_grp, iops[WRITE]), -		.read_seq_string = tg_print_conf_uint, -		.write_string = tg_set_conf_uint, -		.max_write_len = 256, +		.seq_show = tg_print_conf_uint, +		.write = tg_set_conf_uint,  	},  	{  		.name = "throttle.io_service_bytes",  		.private = offsetof(struct tg_stats_cpu, service_bytes), -		.read_seq_string = tg_print_cpu_rwstat, +		.seq_show = tg_print_cpu_rwstat,  	},  	{  		.name = "throttle.io_serviced",  		.private = offsetof(struct tg_stats_cpu, serviced), -		.read_seq_string = tg_print_cpu_rwstat, +		.seq_show = tg_print_cpu_rwstat,  	},  	{ }	/* terminate */  }; @@ -1498,7 +1499,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)  	if (tg) {  		if (!tg->has_rules[rw]) {  			throtl_update_dispatch_stats(tg_to_blkg(tg), -						     bio->bi_size, bio->bi_rw); +					bio->bi_iter.bi_size, bio->bi_rw);  			goto out_unlock_rcu;  		}  	} @@ -1554,7 +1555,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)  	/* out-of-limit, queue to @tg */  	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",  		   rw == READ ? 'R' : 'W', -		   tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], +		   tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],  		   tg->io_disp[rw], tg->iops[rw],  		   sq->nr_queued[READ], sq->nr_queued[WRITE]); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 65f10356396..95a09590ccf 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,6 +7,7 @@  #include <linux/fault-inject.h>  #include "blk.h" +#include "blk-mq.h"  #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -31,7 +32,7 @@ static int __init fail_io_timeout_debugfs(void)  	struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",  						NULL, &fail_io_timeout); -	return IS_ERR(dir) ? PTR_ERR(dir) : 0; +	return PTR_ERR_OR_ZERO(dir);  }  late_initcall(fail_io_timeout_debugfs); @@ -88,11 +89,15 @@ static void blk_rq_timed_out(struct request *req)  		ret = q->rq_timed_out_fn(req);  	switch (ret) {  	case BLK_EH_HANDLED: -		__blk_complete_request(req); +		/* Can we use req->errors here? 
*/ +		if (q->mq_ops) +			__blk_mq_complete_request(req); +		else +			__blk_complete_request(req);  		break;  	case BLK_EH_RESET_TIMER: -		blk_clear_rq_complete(req);  		blk_add_timer(req); +		blk_clear_rq_complete(req);  		break;  	case BLK_EH_NOT_HANDLED:  		/* @@ -108,6 +113,23 @@ static void blk_rq_timed_out(struct request *req)  	}  } +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, +			  unsigned int *next_set) +{ +	if (time_after_eq(jiffies, rq->deadline)) { +		list_del_init(&rq->timeout_list); + +		/* +		 * Check if we raced with end io completion +		 */ +		if (!blk_mark_rq_complete(rq)) +			blk_rq_timed_out(rq); +	} else if (!*next_set || time_after(*next_timeout, rq->deadline)) { +		*next_timeout = rq->deadline; +		*next_set = 1; +	} +} +  void blk_rq_timed_out_timer(unsigned long data)  {  	struct request_queue *q = (struct request_queue *) data; @@ -117,21 +139,8 @@ void blk_rq_timed_out_timer(unsigned long data)  	spin_lock_irqsave(q->queue_lock, flags); -	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { -		if (time_after_eq(jiffies, rq->deadline)) { -			list_del_init(&rq->timeout_list); - -			/* -			 * Check if we raced with end io completion -			 */ -			if (blk_mark_rq_complete(rq)) -				continue; -			blk_rq_timed_out(rq); -		} else if (!next_set || time_after(next, rq->deadline)) { -			next = rq->deadline; -			next_set = 1; -		} -	} +	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) +		blk_rq_check_expired(rq, &next, &next_set);  	if (next_set)  		mod_timer(&q->timeout, round_jiffies_up(next)); @@ -157,6 +166,17 @@ void blk_abort_request(struct request *req)  }  EXPORT_SYMBOL_GPL(blk_abort_request); +unsigned long blk_rq_timeout(unsigned long timeout) +{ +	unsigned long maxt; + +	maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); +	if (time_after(timeout, maxt)) +		timeout = maxt; + +	return timeout; +} +  /**   * blk_add_timer - Start timeout timer for a single request   * @req:	request that is about to start running. @@ -174,7 +194,6 @@ void blk_add_timer(struct request *req)  		return;  	BUG_ON(!list_empty(&req->timeout_list)); -	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));  	/*  	 * Some LLDs, like scsi, peek at the timeout to prevent a @@ -184,17 +203,29 @@ void blk_add_timer(struct request *req)  		req->timeout = q->rq_timeout;  	req->deadline = jiffies + req->timeout; -	list_add_tail(&req->timeout_list, &q->timeout_list); +	if (!q->mq_ops) +		list_add_tail(&req->timeout_list, &req->q->timeout_list);  	/*  	 * If the timer isn't already pending or this timeout is earlier  	 * than an existing one, modify the timer. Round up to next nearest  	 * second.  	 */ -	expiry = round_jiffies_up(req->deadline); +	expiry = blk_rq_timeout(round_jiffies_up(req->deadline));  	if (!timer_pending(&q->timeout) || -	    time_before(expiry, q->timeout.expires)) -		mod_timer(&q->timeout, expiry); -} +	    time_before(expiry, q->timeout.expires)) { +		unsigned long diff = q->timeout.expires - expiry; + +		/* +		 * Due to added timer slack to group timers, the timer +		 * will often be a little in front of what we asked for. +		 * So apply some tolerance here too, otherwise we keep +		 * modifying the timer because expires for value X +		 * will be X + something. 
+		 */ +		if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) +			mod_timer(&q->timeout, expiry); +	} +} diff --git a/block/blk.h b/block/blk.h index e837b8f619b..6748c4f8d7a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,7 +9,11 @@  /* Number of requests a "batching" process may submit */  #define BLK_BATCH_REQ	32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT		(5 * HZ) +  extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *request_cachep;  extern struct kobj_type blk_queue_ktype;  extern struct ida blk_queue_ida; @@ -34,14 +38,30 @@ bool __blk_end_bidi_request(struct request *rq, int error,  			    unsigned int nr_bytes, unsigned int bidi_bytes);  void blk_rq_timed_out_timer(unsigned long data); +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, +			  unsigned int *next_set); +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req);  void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); + + +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, +			     struct bio *bio); +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, +			    struct bio *bio); +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, +			    unsigned int *request_count); + +void blk_account_io_start(struct request *req, bool new_io); +void blk_account_io_completion(struct request *req, unsigned int bytes); +void blk_account_io_done(struct request *req);  /*   * Internal atomic flags for request handling   */  enum rq_atomic_flags {  	REQ_ATOM_COMPLETE = 0, +	REQ_ATOM_STARTED,  };  /* @@ -61,10 +81,9 @@ static inline void blk_clear_rq_complete(struct request *rq)  /*   * Internal elevator interface   */ -#define ELV_ON_HASH(rq) hash_hashed(&(rq)->hash) +#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)  void blk_insert_flush(struct request *rq); -void blk_abort_flushes(struct request_queue *q);  static inline struct request *__elv_next_request(struct request_queue *q)  { @@ -96,7 +115,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)  			q->flush_queue_delayed = 1;  			return NULL;  		} -		if (unlikely(blk_queue_dying(q)) || +		if (unlikely(blk_queue_bypass(q)) ||  		    !q->elevator->type->ops.elevator_dispatch_fn(q, 0))  			return NULL;  	} @@ -168,6 +187,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)  	return q->nr_congestion_off;  } +extern int blk_update_nr_requests(struct request_queue *, unsigned int); +  /*   * Contribute to IO statistics IFF:   * diff --git a/block/bounce.c b/block/bounce.c new file mode 100644 index 00000000000..ab21ba203d5 --- /dev/null +++ b/block/bounce.c @@ -0,0 +1,290 @@ +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/export.h> +#include <linux/swap.h> +#include <linux/gfp.h> +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/printk.h> +#include <asm/tlbflush.h> + +#include <trace/events/block.h> + +#define POOL_SIZE	64 +#define ISA_POOL_SIZE	16 + +static mempool_t *page_pool, *isa_page_pool; + +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) +static __init int init_emergency_pool(void) +{ +#if defined(CONFIG_HIGHMEM) && 
!defined(CONFIG_MEMORY_HOTPLUG) +	if (max_pfn <= max_low_pfn) +		return 0; +#endif + +	page_pool = mempool_create_page_pool(POOL_SIZE, 0); +	BUG_ON(!page_pool); +	pr_info("pool size: %d pages\n", POOL_SIZE); + +	return 0; +} + +__initcall(init_emergency_pool); +#endif + +#ifdef CONFIG_HIGHMEM +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ +	unsigned long flags; +	unsigned char *vto; + +	local_irq_save(flags); +	vto = kmap_atomic(to->bv_page); +	memcpy(vto + to->bv_offset, vfrom, to->bv_len); +	kunmap_atomic(vto); +	local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom)	\ +	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ +	return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. + */ +int init_emergency_isa_pool(void) +{ +	if (isa_page_pool) +		return 0; + +	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, +				       mempool_free_pages, (void *) 0); +	BUG_ON(!isa_page_pool); + +	pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); +	return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ +	unsigned char *vfrom; +	struct bio_vec tovec, *fromvec = from->bi_io_vec; +	struct bvec_iter iter; + +	bio_for_each_segment(tovec, to, iter) { +		if (tovec.bv_page != fromvec->bv_page) { +			/* +			 * fromvec->bv_offset and fromvec->bv_len might have +			 * been modified by the block layer, so use the original +			 * copy, bounce_copy_vec already uses tovec->bv_len +			 */ +			vfrom = page_address(fromvec->bv_page) + +				tovec.bv_offset; + +			bounce_copy_vec(&tovec, vfrom); +			flush_dcache_page(tovec.bv_page); +		} + +		fromvec++; +	} +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +{ +	struct bio *bio_orig = bio->bi_private; +	struct bio_vec *bvec, *org_vec; +	int i; + +	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) +		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + +	/* +	 * free up bounce indirect pages used +	 */ +	bio_for_each_segment_all(bvec, bio, i) { +		org_vec = bio_orig->bi_io_vec + i; +		if (bvec->bv_page == org_vec->bv_page) +			continue; + +		dec_zone_page_state(bvec->bv_page, NR_BOUNCE); +		mempool_free(bvec->bv_page, pool); +	} + +	bio_endio(bio_orig, err); +	bio_put(bio); +} + +static void bounce_end_io_write(struct bio *bio, int err) +{ +	bounce_end_io(bio, page_pool, err); +} + +static void bounce_end_io_write_isa(struct bio *bio, int err) +{ + +	bounce_end_io(bio, isa_page_pool, err); +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +{ +	struct bio *bio_orig = bio->bi_private; + +	if (test_bit(BIO_UPTODATE, &bio->bi_flags)) +		copy_to_high_bio_irq(bio_orig, bio); + +	bounce_end_io(bio, pool, err); +} + +static void bounce_end_io_read(struct bio *bio, int err) +{ +	__bounce_end_io_read(bio, page_pool, err); +} + +static void bounce_end_io_read_isa(struct bio *bio, int err) +{ +	__bounce_end_io_read(bio, isa_page_pool, err); +} + +#ifdef 
CONFIG_NEED_BOUNCE_POOL +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ +	if (bio_data_dir(bio) != WRITE) +		return 0; + +	if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) +		return 0; + +	return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); +} +#else +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ +	return 0; +} +#endif /* CONFIG_NEED_BOUNCE_POOL */ + +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, +			       mempool_t *pool, int force) +{ +	struct bio *bio; +	int rw = bio_data_dir(*bio_orig); +	struct bio_vec *to, from; +	struct bvec_iter iter; +	unsigned i; + +	if (force) +		goto bounce; +	bio_for_each_segment(from, *bio_orig, iter) +		if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) +			goto bounce; + +	return; +bounce: +	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); + +	bio_for_each_segment_all(to, bio, i) { +		struct page *page = to->bv_page; + +		if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) +			continue; + +		inc_zone_page_state(to->bv_page, NR_BOUNCE); +		to->bv_page = mempool_alloc(pool, q->bounce_gfp); + +		if (rw == WRITE) { +			char *vto, *vfrom; + +			flush_dcache_page(page); + +			vto = page_address(to->bv_page) + to->bv_offset; +			vfrom = kmap_atomic(page) + to->bv_offset; +			memcpy(vto, vfrom, to->bv_len); +			kunmap_atomic(vfrom); +		} +	} + +	trace_block_bio_bounce(q, *bio_orig); + +	bio->bi_flags |= (1 << BIO_BOUNCED); + +	if (pool == page_pool) { +		bio->bi_end_io = bounce_end_io_write; +		if (rw == READ) +			bio->bi_end_io = bounce_end_io_read; +	} else { +		bio->bi_end_io = bounce_end_io_write_isa; +		if (rw == READ) +			bio->bi_end_io = bounce_end_io_read_isa; +	} + +	bio->bi_private = *bio_orig; +	*bio_orig = bio; +} + +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) +{ +	int must_bounce; +	mempool_t *pool; + +	/* +	 * Data-less bio, nothing to bounce +	 */ +	if (!bio_has_data(*bio_orig)) +		return; + +	must_bounce = must_snapshot_stable_pages(q, *bio_orig); + +	/* +	 * for non-isa bounce case, just check if the bounce pfn is equal +	 * to or bigger than the highest pfn in the system -- in that case, +	 * don't waste time iterating over bio segments +	 */ +	if (!(q->bounce_gfp & GFP_DMA)) { +		if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) +			return; +		pool = page_pool; +	} else { +		BUG_ON(!isa_page_pool); +		pool = isa_page_pool; +	} + +	/* +	 * slow path +	 */ +	__blk_queue_bounce(q, bio_orig, pool, must_bounce); +} + +EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/bsg.c b/block/bsg.c index 420a5a9f1b2..ff46addde5d 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -196,7 +196,6 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,  	 * fill in request structure  	 */  	rq->cmd_len = hdr->request_len; -	rq->cmd_type = REQ_TYPE_BLOCK_PC;  	rq->timeout = msecs_to_jiffies(hdr->timeout);  	if (!rq->timeout) @@ -273,6 +272,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,  	rq = blk_get_request(q, rw, GFP_KERNEL);  	if (!rq)  		return ERR_PTR(-ENOMEM); +	blk_rq_set_block_pc(rq); +  	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);  	if (ret)  		goto out; @@ -1008,7 +1009,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,  	/*  	 * we need a proper transport to send commands, not a stacked device  	 */ -	if (!q->request_fn) +	if (!queue_is_rq_based(q))  		return 0;  	bcd = &q->bsg_dev; diff --git 
a/block/cfq-iosched.c b/block/cfq-iosched.c index dabb9d02cf9..cadc3784174 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)  {  	if (cfqd->busy_queues) {  		cfq_log(cfqd, "schedule dispatch"); -		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); +		kblockd_schedule_work(&cfqd->unplug_work);  	}  } @@ -1508,6 +1508,29 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)  }  #ifdef CONFIG_CFQ_GROUP_IOSCHED +static void cfqg_stats_init(struct cfqg_stats *stats) +{ +	blkg_rwstat_init(&stats->service_bytes); +	blkg_rwstat_init(&stats->serviced); +	blkg_rwstat_init(&stats->merged); +	blkg_rwstat_init(&stats->service_time); +	blkg_rwstat_init(&stats->wait_time); +	blkg_rwstat_init(&stats->queued); + +	blkg_stat_init(&stats->sectors); +	blkg_stat_init(&stats->time); + +#ifdef CONFIG_DEBUG_BLK_CGROUP +	blkg_stat_init(&stats->unaccounted_time); +	blkg_stat_init(&stats->avg_queue_size_sum); +	blkg_stat_init(&stats->avg_queue_size_samples); +	blkg_stat_init(&stats->dequeue); +	blkg_stat_init(&stats->group_wait_time); +	blkg_stat_init(&stats->idle_time); +	blkg_stat_init(&stats->empty_time); +#endif +} +  static void cfq_pd_init(struct blkcg_gq *blkg)  {  	struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1515,6 +1538,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg)  	cfq_init_cfqg_base(cfqg);  	cfqg->weight = blkg->blkcg->cfq_weight;  	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; +	cfqg_stats_init(&cfqg->stats); +	cfqg_stats_init(&cfqg->dead_stats);  }  static void cfq_pd_offline(struct blkcg_gq *blkg) @@ -1607,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,  	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);  } -static int cfqg_print_weight_device(struct cgroup_subsys_state *css, -				    struct cftype *cft, struct seq_file *sf) +static int cfqg_print_weight_device(struct seq_file *sf, void *v)  { -	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, -			  &blkcg_policy_cfq, 0, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), +			  cfqg_prfill_weight_device, &blkcg_policy_cfq, +			  0, false);  	return 0;  } @@ -1625,34 +1650,31 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,  	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);  } -static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, -					 struct cftype *cft, -					 struct seq_file *sf) +static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)  { -	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, -			  &blkcg_policy_cfq, 0, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), +			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, +			  0, false);  	return 0;  } -static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, -			    struct seq_file *sf) +static int cfq_print_weight(struct seq_file *sf, void *v)  { -	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); +	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);  	return 0;  } -static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, -				 struct cftype *cft, struct seq_file *sf) +static int cfq_print_leaf_weight(struct seq_file *sf, void *v)  { -	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); +	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);  	return 0;  } -static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, -				    struct cftype *cft, const char 
*buf, -				    bool is_leaf_weight) +static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, +					char *buf, size_t nbytes, loff_t off, +					bool is_leaf_weight)  { -	struct blkcg *blkcg = css_to_blkcg(css); +	struct blkcg *blkcg = css_to_blkcg(of_css(of));  	struct blkg_conf_ctx ctx;  	struct cfq_group *cfqg;  	int ret; @@ -1675,19 +1697,19 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,  	}  	blkg_conf_finish(&ctx); -	return ret; +	return ret ?: nbytes;  } -static int cfqg_set_weight_device(struct cgroup_subsys_state *css, -				  struct cftype *cft, const char *buf) +static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, +				      char *buf, size_t nbytes, loff_t off)  { -	return __cfqg_set_weight_device(css, cft, buf, false); +	return __cfqg_set_weight_device(of, buf, nbytes, off, false);  } -static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, -				       struct cftype *cft, const char *buf) +static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, +					   char *buf, size_t nbytes, loff_t off)  { -	return __cfqg_set_weight_device(css, cft, buf, true); +	return __cfqg_set_weight_device(of, buf, nbytes, off, true);  }  static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, @@ -1737,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,  	return __cfq_set_weight(css, cft, val, true);  } -static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, -			   struct seq_file *sf) +static int cfqg_print_stat(struct seq_file *sf, void *v)  { -	struct blkcg *blkcg = css_to_blkcg(css); - -	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, -			  cft->private, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, +			  &blkcg_policy_cfq, seq_cft(sf)->private, false);  	return 0;  } -static int cfqg_print_rwstat(struct cgroup_subsys_state *css, -			     struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat(struct seq_file *sf, void *v)  { -	struct blkcg *blkcg = css_to_blkcg(css); - -	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, -			  cft->private, true); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, +			  &blkcg_policy_cfq, seq_cft(sf)->private, true);  	return 0;  } @@ -1773,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,  	return __blkg_prfill_rwstat(sf, pd, &sum);  } -static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, -				     struct cftype *cft, struct seq_file *sf) +static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)  { -	struct blkcg *blkcg = css_to_blkcg(css); - -	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, -			  &blkcg_policy_cfq, cft->private, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), +			  cfqg_prfill_stat_recursive, &blkcg_policy_cfq, +			  seq_cft(sf)->private, false);  	return 0;  } -static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, -				       struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)  { -	struct blkcg *blkcg = css_to_blkcg(css); - -	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, -			  &blkcg_policy_cfq, cft->private, true); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), +			  cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, +			  seq_cft(sf)->private, true);  	return 0;  } @@ -1803,20 +1815,18 @@ static u64 
cfqg_prfill_avg_queue_size(struct seq_file *sf,  	if (samples) {  		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); -		do_div(v, samples); +		v = div64_u64(v, samples);  	}  	__blkg_prfill_u64(sf, pd, v);  	return 0;  }  /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, -				     struct cftype *cft, struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)  { -	struct blkcg *blkcg = css_to_blkcg(css); - -	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, -			  &blkcg_policy_cfq, 0, false); +	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), +			  cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, +			  0, false);  	return 0;  }  #endif	/* CONFIG_DEBUG_BLK_CGROUP */ @@ -1826,14 +1836,13 @@ static struct cftype cfq_blkcg_files[] = {  	{  		.name = "weight_device",  		.flags = CFTYPE_ONLY_ON_ROOT, -		.read_seq_string = cfqg_print_leaf_weight_device, -		.write_string = cfqg_set_leaf_weight_device, -		.max_write_len = 256, +		.seq_show = cfqg_print_leaf_weight_device, +		.write = cfqg_set_leaf_weight_device,  	},  	{  		.name = "weight",  		.flags = CFTYPE_ONLY_ON_ROOT, -		.read_seq_string = cfq_print_leaf_weight, +		.seq_show = cfq_print_leaf_weight,  		.write_u64 = cfq_set_leaf_weight,  	}, @@ -1841,26 +1850,24 @@ static struct cftype cfq_blkcg_files[] = {  	{  		.name = "weight_device",  		.flags = CFTYPE_NOT_ON_ROOT, -		.read_seq_string = cfqg_print_weight_device, -		.write_string = cfqg_set_weight_device, -		.max_write_len = 256, +		.seq_show = cfqg_print_weight_device, +		.write = cfqg_set_weight_device,  	},  	{  		.name = "weight",  		.flags = CFTYPE_NOT_ON_ROOT, -		.read_seq_string = cfq_print_weight, +		.seq_show = cfq_print_weight,  		.write_u64 = cfq_set_weight,  	},  	{  		.name = "leaf_weight_device", -		.read_seq_string = cfqg_print_leaf_weight_device, -		.write_string = cfqg_set_leaf_weight_device, -		.max_write_len = 256, +		.seq_show = cfqg_print_leaf_weight_device, +		.write = cfqg_set_leaf_weight_device,  	},  	{  		.name = "leaf_weight", -		.read_seq_string = cfq_print_leaf_weight, +		.seq_show = cfq_print_leaf_weight,  		.write_u64 = cfq_set_leaf_weight,  	}, @@ -1868,114 +1875,114 @@ static struct cftype cfq_blkcg_files[] = {  	{  		.name = "time",  		.private = offsetof(struct cfq_group, stats.time), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  	{  		.name = "sectors",  		.private = offsetof(struct cfq_group, stats.sectors), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  	{  		.name = "io_service_bytes",  		.private = offsetof(struct cfq_group, stats.service_bytes), -		.read_seq_string = cfqg_print_rwstat, +		.seq_show = cfqg_print_rwstat,  	},  	{  		.name = "io_serviced",  		.private = offsetof(struct cfq_group, stats.serviced), -		.read_seq_string = cfqg_print_rwstat, +		.seq_show = cfqg_print_rwstat,  	},  	{  		.name = "io_service_time",  		.private = offsetof(struct cfq_group, stats.service_time), -		.read_seq_string = cfqg_print_rwstat, +		.seq_show = cfqg_print_rwstat,  	},  	{  		.name = "io_wait_time",  		.private = offsetof(struct cfq_group, stats.wait_time), -		.read_seq_string = cfqg_print_rwstat, +		.seq_show = cfqg_print_rwstat,  	},  	{  		.name = "io_merged",  		.private = offsetof(struct cfq_group, stats.merged), -		.read_seq_string = cfqg_print_rwstat, +		.seq_show = cfqg_print_rwstat,  	},  	{  		.name = "io_queued",  		.private = offsetof(struct cfq_group, stats.queued), -		.read_seq_string = 
cfqg_print_rwstat, +		.seq_show = cfqg_print_rwstat,  	},  	/* the same statictics which cover the cfqg and its descendants */  	{  		.name = "time_recursive",  		.private = offsetof(struct cfq_group, stats.time), -		.read_seq_string = cfqg_print_stat_recursive, +		.seq_show = cfqg_print_stat_recursive,  	},  	{  		.name = "sectors_recursive",  		.private = offsetof(struct cfq_group, stats.sectors), -		.read_seq_string = cfqg_print_stat_recursive, +		.seq_show = cfqg_print_stat_recursive,  	},  	{  		.name = "io_service_bytes_recursive",  		.private = offsetof(struct cfq_group, stats.service_bytes), -		.read_seq_string = cfqg_print_rwstat_recursive, +		.seq_show = cfqg_print_rwstat_recursive,  	},  	{  		.name = "io_serviced_recursive",  		.private = offsetof(struct cfq_group, stats.serviced), -		.read_seq_string = cfqg_print_rwstat_recursive, +		.seq_show = cfqg_print_rwstat_recursive,  	},  	{  		.name = "io_service_time_recursive",  		.private = offsetof(struct cfq_group, stats.service_time), -		.read_seq_string = cfqg_print_rwstat_recursive, +		.seq_show = cfqg_print_rwstat_recursive,  	},  	{  		.name = "io_wait_time_recursive",  		.private = offsetof(struct cfq_group, stats.wait_time), -		.read_seq_string = cfqg_print_rwstat_recursive, +		.seq_show = cfqg_print_rwstat_recursive,  	},  	{  		.name = "io_merged_recursive",  		.private = offsetof(struct cfq_group, stats.merged), -		.read_seq_string = cfqg_print_rwstat_recursive, +		.seq_show = cfqg_print_rwstat_recursive,  	},  	{  		.name = "io_queued_recursive",  		.private = offsetof(struct cfq_group, stats.queued), -		.read_seq_string = cfqg_print_rwstat_recursive, +		.seq_show = cfqg_print_rwstat_recursive,  	},  #ifdef CONFIG_DEBUG_BLK_CGROUP  	{  		.name = "avg_queue_size", -		.read_seq_string = cfqg_print_avg_queue_size, +		.seq_show = cfqg_print_avg_queue_size,  	},  	{  		.name = "group_wait_time",  		.private = offsetof(struct cfq_group, stats.group_wait_time), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  	{  		.name = "idle_time",  		.private = offsetof(struct cfq_group, stats.idle_time), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  	{  		.name = "empty_time",  		.private = offsetof(struct cfq_group, stats.empty_time), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  	{  		.name = "dequeue",  		.private = offsetof(struct cfq_group, stats.dequeue), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  	{  		.name = "unaccounted_time",  		.private = offsetof(struct cfq_group, stats.unaccounted_time), -		.read_seq_string = cfqg_print_stat, +		.seq_show = cfqg_print_stat,  	},  #endif	/* CONFIG_DEBUG_BLK_CGROUP */  	{ }	/* terminate */ @@ -2357,10 +2364,10 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,  	 * reposition in fifo if next is older than rq  	 */  	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -	    time_before(rq_fifo_time(next), rq_fifo_time(rq)) && +	    time_before(next->fifo_time, rq->fifo_time) &&  	    cfqq == RQ_CFQQ(next)) {  		list_move(&rq->queuelist, &next->queuelist); -		rq_set_fifo_time(rq, rq_fifo_time(next)); +		rq->fifo_time = next->fifo_time;  	}  	if (cfqq->next_rq == next) @@ -2804,7 +2811,7 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)  		return NULL;  	rq = rq_entry_fifo(cfqq->fifo.next); -	if (time_before(jiffies, rq_fifo_time(rq))) +	if (time_before(jiffies, rq->fifo_time))  		rq = NULL;  	cfq_log_cfqq(cfqq->cfqd, cfqq, 
"fifo=%p", rq); @@ -3917,7 +3924,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)  	cfq_log_cfqq(cfqd, cfqq, "insert_request");  	cfq_init_prio_data(cfqq, RQ_CIC(rq)); -	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); +	rq->fifo_time = jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)];  	list_add_tail(&rq->queuelist, &cfqq->fifo);  	cfq_add_rq_rb(rq);  	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, @@ -4358,7 +4365,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)  	if (!eq)  		return -ENOMEM; -	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); +	cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);  	if (!cfqd) {  		kobject_put(&eq->kobj);  		return -ENOMEM; @@ -4453,7 +4460,7 @@ out_free:  static ssize_t  cfq_var_show(unsigned int var, char *page)  { -	return sprintf(page, "%d\n", var); +	return sprintf(page, "%u\n", var);  }  static ssize_t diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c index cc2637f8674..9dbc67e42a9 100644 --- a/block/cmdline-parser.c +++ b/block/cmdline-parser.c @@ -4,8 +4,7 @@   * Written by Cai Zhiyong <caizhiyong@huawei.com>   *   */ -#include <linux/buffer_head.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/cmdline-parser.h>  static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) @@ -159,6 +158,7 @@ void cmdline_parts_free(struct cmdline_parts **parts)  		*parts = next_parts;  	}  } +EXPORT_SYMBOL(cmdline_parts_free);  int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)  { @@ -206,6 +206,7 @@ fail:  	cmdline_parts_free(parts);  	goto done;  } +EXPORT_SYMBOL(cmdline_parts_parse);  struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,  					 const char *bdev) @@ -214,17 +215,17 @@ struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,  		parts = parts->next_parts;  	return parts;  } +EXPORT_SYMBOL(cmdline_parts_find);  /*   *  add_part()   *    0 success.   *    1 can not add so many partitions.   
*/ -void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, -		       int slot, -		       int (*add_part)(int, struct cmdline_subpart *, void *), -		       void *param) - +int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, +		      int slot, +		      int (*add_part)(int, struct cmdline_subpart *, void *), +		      void *param)  {  	sector_t from = 0;  	struct cmdline_subpart *subpart; @@ -247,4 +248,7 @@ void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,  		if (add_part(slot, subpart, param))  			break;  	} + +	return slot;  } +EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index fbd5a67cb77..a0926a6094b 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -690,6 +690,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)  	case BLKROSET:  	case BLKDISCARD:  	case BLKSECDISCARD: +	case BLKZEROOUT:  	/*  	 * the ones below are implemented in blkdev_locked_ioctl,  	 * but we call blkdev_ioctl, which gets the lock for us diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 20614a33236..a753df2b3fc 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -106,7 +106,7 @@ deadline_add_request(struct request_queue *q, struct request *rq)  	/*  	 * set expire time and add to fifo list  	 */ -	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); +	rq->fifo_time = jiffies + dd->fifo_expire[data_dir];  	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);  } @@ -174,9 +174,9 @@ deadline_merged_requests(struct request_queue *q, struct request *req,  	 * and move into next position (next will be deleted) in fifo  	 */  	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { -		if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { +		if (time_before(next->fifo_time, req->fifo_time)) {  			list_move(&req->queuelist, &next->queuelist); -			rq_set_fifo_time(req, rq_fifo_time(next)); +			req->fifo_time = next->fifo_time;  		}  	} @@ -230,7 +230,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)  	/*  	 * rq is expired!  	 */ -	if (time_after_eq(jiffies, rq_fifo_time(rq))) +	if (time_after_eq(jiffies, rq->fifo_time))  		return 1;  	return 0; @@ -346,7 +346,7 @@ static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)  	if (!eq)  		return -ENOMEM; -	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); +	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);  	if (!dd) {  		kobject_put(&eq->kobj);  		return -ENOMEM; diff --git a/block/elevator.c b/block/elevator.c index 668394d1858..24c28b659bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -155,7 +155,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,  {  	struct elevator_queue *eq; -	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); +	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);  	if (unlikely(!eq))  		goto err; @@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)  	struct elevator_type *e = NULL;  	int err; +	/* +	 * q->sysfs_lock must be held to provide mutual exclusion between +	 * elevator_switch() and here. 
+	 */ +	lockdep_assert_held(&q->sysfs_lock); +  	if (unlikely(q->elevator))  		return 0; @@ -241,6 +247,7 @@ EXPORT_SYMBOL(elevator_exit);  static inline void __elv_rqhash_del(struct request *rq)  {  	hash_del(&rq->hash); +	rq->cmd_flags &= ~REQ_HASHED;  }  static void elv_rqhash_del(struct request_queue *q, struct request *rq) @@ -255,6 +262,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)  	BUG_ON(ELV_ON_HASH(rq));  	hash_add(e->hash, &rq->hash, rq_hash_key(rq)); +	rq->cmd_flags |= REQ_HASHED;  }  static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) @@ -434,7 +442,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)  	/*  	 * See if our hash lookup can find a potential backmerge.  	 */ -	__rq = elv_rqhash_find(q, bio->bi_sector); +	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);  	if (__rq && elv_rq_merge_ok(__rq, bio)) {  		*req = __rq;  		return ELEVATOR_BACK_MERGE; @@ -721,26 +729,6 @@ int elv_may_queue(struct request_queue *q, int rw)  	return ELV_MQUEUE_MAY;  } -void elv_abort_queue(struct request_queue *q) -{ -	struct request *rq; - -	blk_abort_flushes(q); - -	while (!list_empty(&q->queue_head)) { -		rq = list_entry_rq(q->queue_head.next); -		rq->cmd_flags |= REQ_QUIET; -		trace_block_rq_abort(q, rq); -		/* -		 * Mark this request as started so we don't trigger -		 * any debug logic in the end I/O path. -		 */ -		blk_start_request(rq); -		__blk_end_request_all(rq, -EIO); -	} -} -EXPORT_SYMBOL(elv_abort_queue); -  void elv_completed_request(struct request_queue *q, struct request *rq)  {  	struct elevator_queue *e = q->elevator; @@ -959,7 +947,7 @@ fail_init:  /*   * Switch this queue to the given IO scheduler.   */ -int elevator_change(struct request_queue *q, const char *name) +static int __elevator_change(struct request_queue *q, const char *name)  {  	char elevator_name[ELV_NAME_MAX];  	struct elevator_type *e; @@ -981,6 +969,18 @@ int elevator_change(struct request_queue *q, const char *name)  	return elevator_switch(q, e);  } + +int elevator_change(struct request_queue *q, const char *name) +{ +	int ret; + +	/* Protect q->elevator from elevator_init() */ +	mutex_lock(&q->sysfs_lock); +	ret = __elevator_change(q, name); +	mutex_unlock(&q->sysfs_lock); + +	return ret; +}  EXPORT_SYMBOL(elevator_change);  ssize_t elv_iosched_store(struct request_queue *q, const char *name, @@ -991,7 +991,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,  	if (!q->elevator)  		return count; -	ret = elevator_change(q, name); +	ret = __elevator_change(q, name);  	if (!ret)  		return count; diff --git a/block/genhd.c b/block/genhd.c index dadf42b454a..791f4194313 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1252,8 +1252,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)  {  	struct gendisk *disk; -	disk = kmalloc_node(sizeof(struct gendisk), -				GFP_KERNEL | __GFP_ZERO, node_id); +	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);  	if (disk) {  		if (!init_part_stats(&disk->part0)) {  			kfree(disk); diff --git a/block/ioctl.c b/block/ioctl.c index a31d91d9bc5..7d5c3b20af4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -64,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user  			part = add_partition(disk, partno, start, length,  					     ADDPART_FLAG_NONE, NULL);  			mutex_unlock(&bdev->bd_mutex); -			return IS_ERR(part) ? 
PTR_ERR(part) : 0; +			return PTR_ERR_OR_ZERO(part);  		case BLKPG_DEL_PARTITION:  			part = disk_get_part(disk, partno);  			if (!part) diff --git a/block/ioprio.c b/block/ioprio.c new file mode 100644 index 00000000000..e50170ca7c3 --- /dev/null +++ b/block/ioprio.c @@ -0,0 +1,241 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. + * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/ioprio.h> +#include <linux/blkdev.h> +#include <linux/capability.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/pid_namespace.h> + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ +	int err; +	struct io_context *ioc; +	const struct cred *cred = current_cred(), *tcred; + +	rcu_read_lock(); +	tcred = __task_cred(task); +	if (!uid_eq(tcred->uid, cred->euid) && +	    !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { +		rcu_read_unlock(); +		return -EPERM; +	} +	rcu_read_unlock(); + +	err = security_task_setioprio(task, ioprio); +	if (err) +		return err; + +	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); +	if (ioc) { +		ioc->ioprio = ioprio; +		put_io_context(ioc); +	} + +	return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +{ +	int class = IOPRIO_PRIO_CLASS(ioprio); +	int data = IOPRIO_PRIO_DATA(ioprio); +	struct task_struct *p, *g; +	struct user_struct *user; +	struct pid *pgrp; +	kuid_t uid; +	int ret; + +	switch (class) { +		case IOPRIO_CLASS_RT: +			if (!capable(CAP_SYS_ADMIN)) +				return -EPERM; +			/* fall through, rt has prio field too */ +		case IOPRIO_CLASS_BE: +			if (data >= IOPRIO_BE_NR || data < 0) +				return -EINVAL; + +			break; +		case IOPRIO_CLASS_IDLE: +			break; +		case IOPRIO_CLASS_NONE: +			if (data) +				return -EINVAL; +			break; +		default: +			return -EINVAL; +	} + +	ret = -ESRCH; +	rcu_read_lock(); +	switch (which) { +		case IOPRIO_WHO_PROCESS: +			if (!who) +				p = current; +			else +				p = find_task_by_vpid(who); +			if (p) +				ret = set_task_ioprio(p, ioprio); +			break; +		case IOPRIO_WHO_PGRP: +			if (!who) +				pgrp = task_pgrp(current); +			else +				pgrp = find_vpid(who); +			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { +				ret = set_task_ioprio(p, ioprio); +				if (ret) +					break; +			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p); +			break; +		case IOPRIO_WHO_USER: +			uid = make_kuid(current_user_ns(), who); +			if (!uid_valid(uid)) +				break; +			if (!who) +				user = current_user(); +			else +				user = find_user(uid); + +			if (!user) +				break; + +			do_each_thread(g, p) { +				if (!uid_eq(task_uid(p), uid)) +					continue; +				ret = set_task_ioprio(p, ioprio); +				if (ret) +					goto free_uid; +			} while_each_thread(g, p); +free_uid: +			if (who) +				free_uid(user); +			break; +		default: +			ret = -EINVAL; +	} + +	
rcu_read_unlock(); +	return ret; +} + +static int get_task_ioprio(struct task_struct *p) +{ +	int ret; + +	ret = security_task_getioprio(p); +	if (ret) +		goto out; +	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); +	if (p->io_context) +		ret = p->io_context->ioprio; +out: +	return ret; +} + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ +	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); +	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + +	if (aclass == IOPRIO_CLASS_NONE) +		aclass = IOPRIO_CLASS_BE; +	if (bclass == IOPRIO_CLASS_NONE) +		bclass = IOPRIO_CLASS_BE; + +	if (aclass == bclass) +		return min(aprio, bprio); +	if (aclass > bclass) +		return bprio; +	else +		return aprio; +} + +SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +{ +	struct task_struct *g, *p; +	struct user_struct *user; +	struct pid *pgrp; +	kuid_t uid; +	int ret = -ESRCH; +	int tmpio; + +	rcu_read_lock(); +	switch (which) { +		case IOPRIO_WHO_PROCESS: +			if (!who) +				p = current; +			else +				p = find_task_by_vpid(who); +			if (p) +				ret = get_task_ioprio(p); +			break; +		case IOPRIO_WHO_PGRP: +			if (!who) +				pgrp = task_pgrp(current); +			else +				pgrp = find_vpid(who); +			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { +				tmpio = get_task_ioprio(p); +				if (tmpio < 0) +					continue; +				if (ret == -ESRCH) +					ret = tmpio; +				else +					ret = ioprio_best(ret, tmpio); +			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p); +			break; +		case IOPRIO_WHO_USER: +			uid = make_kuid(current_user_ns(), who); +			if (!who) +				user = current_user(); +			else +				user = find_user(uid); + +			if (!user) +				break; + +			do_each_thread(g, p) { +				if (!uid_eq(task_uid(p), user->uid)) +					continue; +				tmpio = get_task_ioprio(p); +				if (tmpio < 0) +					continue; +				if (ret == -ESRCH) +					ret = tmpio; +				else +					ret = ioprio_best(ret, tmpio); +			} while_each_thread(g, p); + +			if (who) +				free_uid(user); +			break; +		default: +			ret = -EINVAL; +	} + +	rcu_read_unlock(); +	return ret; +} diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 87a32086535..9b29a996c31 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -263,7 +263,7 @@ config SYSV68_PARTITION  config CMDLINE_PARTITION  	bool "Command line partition support" if PARTITION_ADVANCED -	select CMDLINE_PARSER +	select BLK_CMDLINE_PARSER  	help -	  Say Y here if you would read the partitions table from bootargs. +	  Say Y here if you want to read the partition table from bootargs.  	  The format for the command line is just like mtdparts. 
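
As a rough illustration of how the command line partition pieces in this diff fit together, the sketch below strings the newly exported cmdline-parser helpers into a caller, using only the signatures visible in the block/cmdline-parser.c hunks (cmdline_parts_parse(), cmdline_parts_find(), the now int-returning cmdline_parts_set(), and cmdline_parts_free()). The add_one_part() callback, the "mmcblk0" name, and the mtdparts-style partition string are invented for the example and are not part of the patch.

#include <linux/cmdline-parser.h>
#include <linux/printk.h>

/* Illustrative add_part() callback: log one sub-partition per slot.
 * Returning non-zero would stop cmdline_parts_set() early. */
static int add_one_part(int slot, struct cmdline_subpart *subpart, void *param)
{
	pr_info("slot %d: from %llu, size %llu sectors\n", slot,
		(unsigned long long)subpart->from,
		(unsigned long long)subpart->size);
	return 0;
}

static void cmdline_parts_example(sector_t disk_size)
{
	/* mtdparts-like layout, e.g. what a blkdevparts= boot argument carries */
	const char *bootarg = "mmcblk0:1m(boot),512k(env),-(data)";
	struct cmdline_parts *parts, *bdev_parts;

	if (cmdline_parts_parse(&parts, bootarg))
		return;

	bdev_parts = cmdline_parts_find(parts, "mmcblk0");
	if (bdev_parts)
		/* with this patch, returns the next free slot instead of void */
		cmdline_parts_set(bdev_parts, disk_size, 1, add_one_part, NULL);

	cmdline_parts_free(&parts);
}

Treat this as a sketch of the API shape rather than a copy of block/partitions/cmdline.c; the in-tree user feeds the parsed sub-partitions into the partition-scanning state instead of printing them.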
diff --git a/block/partitions/atari.h b/block/partitions/atari.h index fe2d32a89f3..f2ec43bfeec 100644 --- a/block/partitions/atari.h +++ b/block/partitions/atari.h @@ -11,6 +11,8 @@   * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)   */ +#include <linux/compiler.h> +  struct partition_info  {    u8 flg;			/* bit 0: active; bit 7: bootable */ @@ -29,6 +31,6 @@ struct rootsector    u32 bsl_st;			/* start of bad sector list */    u32 bsl_cnt;			/* length of bad sector list */    u16 checksum;			/* checksum for bootable disks */ -} __attribute__((__packed__)); +} __packed;  int atari_partition(struct parsed_partitions *state); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 56cf4ffad51..5141b563adf 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -2,15 +2,15 @@   * Copyright (C) 2013 HUAWEI   * Author: Cai Zhiyong <caizhiyong@huawei.com>   * - * Read block device partition table from command line. - * The partition used for fixed block device (eMMC) embedded device. - * It is no MBR, save storage space. Bootloader can be easily accessed + * Read block device partition table from the command line. + * Typically used for fixed block (eMMC) embedded devices. + * It has no MBR, so saves storage space. Bootloader can be easily accessed   * by absolute address of data on the block device.   * Users can easily change the partition.   *   * The format for the command line is just like mtdparts.   * - * Verbose config please reference "Documentation/block/cmdline-partition.txt" + * For further information, see "Documentation/block/cmdline-partition.txt"   *   */ diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1eb09ee5311..dc51f467a56 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -96,6 +96,7 @@   * - Code works, detects all the partitions.   *   ************************************************************/ +#include <linux/kernel.h>  #include <linux/crc32.h>  #include <linux/ctype.h>  #include <linux/math64.h> @@ -222,11 +223,16 @@ check_hybrid:  	 * the disk size.  	 *  	 * Hybrid MBRs do not necessarily comply with this. +	 * +	 * Consider a bad value here to be a warning to support dd'ing +	 * an image from a smaller disk to a larger disk.  	 */  	if (ret == GPT_MBR_PROTECTIVE) {  		sz = le32_to_cpu(mbr->partition_record[part].size_in_lba);  		if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) -			ret = 0; +			pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", +				 sz, min_t(uint32_t, +					   total_sectors - 1, 0xFFFFFFFF));  	}  done:  	return ret; @@ -710,8 +716,8 @@ int efi_partition(struct parsed_partitions *state)  		efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);  		/* Naively convert UTF16-LE to 7 bits. 
*/ -		label_max = min(sizeof(info->volname) - 1, -				sizeof(ptes[i].partition_name)); +		label_max = min(ARRAY_SIZE(info->volname) - 1, +				ARRAY_SIZE(ptes[i].partition_name));  		info->volname[label_max] = 0;  		while (label_count < label_max) {  			u8 c = ptes[i].partition_name[label_count] & 0xff; diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 4efcafba7e6..abd0b19288a 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -32,6 +32,7 @@  #include <linux/major.h>  #include <linux/string.h>  #include <linux/efi.h> +#include <linux/compiler.h>  #define MSDOS_MBR_SIGNATURE 0xaa55  #define EFI_PMBR_OSTYPE_EFI 0xEF @@ -87,13 +88,13 @@ typedef struct _gpt_header {  	 *  	 * uint8_t		reserved2[ BlockSize - 92 ];  	 */ -} __attribute__ ((packed)) gpt_header; +} __packed gpt_header;  typedef struct _gpt_entry_attributes {  	u64 required_to_function:1;  	u64 reserved:47;          u64 type_guid_specific:16; -} __attribute__ ((packed)) gpt_entry_attributes; +} __packed gpt_entry_attributes;  typedef struct _gpt_entry {  	efi_guid_t partition_type_guid; @@ -102,7 +103,7 @@ typedef struct _gpt_entry {  	__le64 ending_lba;  	gpt_entry_attributes attributes;  	efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; -} __attribute__ ((packed)) gpt_entry; +} __packed gpt_entry;  typedef struct _gpt_mbr_record {  	u8	boot_indicator; /* unused by EFI, set to 0x80 for bootable */ @@ -124,7 +125,7 @@ typedef struct _legacy_mbr {  	__le16 unknown;  	gpt_mbr_record partition_record[4];  	__le16 signature; -} __attribute__ ((packed)) legacy_mbr; +} __packed legacy_mbr;  /* Functions */  extern int efi_partition(struct parsed_partitions *state); diff --git a/block/partitions/karma.c b/block/partitions/karma.c index 0ea19312706..9721fa589bb 100644 --- a/block/partitions/karma.c +++ b/block/partitions/karma.c @@ -8,6 +8,7 @@  #include "check.h"  #include "karma.h" +#include <linux/compiler.h>  int karma_partition(struct parsed_partitions *state)  { @@ -26,7 +27,7 @@ int karma_partition(struct parsed_partitions *state)  		} d_partitions[2];  		u8 d_blank[208];  		__le16 d_magic; -	} __attribute__((packed)) *label; +	} __packed *label;  	struct d_partition *p;  	data = read_part_sector(state, 0, &sect); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a5ffcc988f0..14695c6221c 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)  	if (capable(CAP_SYS_RAWIO))  		return 0; -	/* if there's no filter set, assume we're filtering everything out */ -	if (!filter) -		return -EPERM; -  	/* Anybody who can open the device can do a read-safe command */  	if (test_bit(cmd[0], filter->read_ok))  		return 0; @@ -233,7 +229,6 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,  	 * fill in request structure  	 */  	rq->cmd_len = hdr->cmd_len; -	rq->cmd_type = REQ_TYPE_BLOCK_PC;  	rq->timeout = msecs_to_jiffies(hdr->timeout);  	if (!rq->timeout) @@ -286,7 +281,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,  		struct sg_io_hdr *hdr, fmode_t mode)  {  	unsigned long start_time; -	int writing = 0, ret = 0; +	ssize_t ret = 0; +	int writing = 0;  	struct request *rq;  	char sense[SCSI_SENSE_BUFFERSIZE];  	struct bio *bio; @@ -314,6 +310,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,  	rq = blk_get_request(q, writing ? 
WRITE : READ, GFP_KERNEL);  	if (!rq)  		return -ENOMEM; +	blk_rq_set_block_pc(rq);  	if (blk_fill_sghdr_rq(q, rq, hdr, mode)) {  		blk_put_request(rq); @@ -321,37 +318,18 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,  	}  	if (hdr->iovec_count) { -		const int size = sizeof(struct sg_iovec) * hdr->iovec_count;  		size_t iov_data_len; -		struct sg_iovec *sg_iov; -		struct iovec *iov; -		int i; - -		sg_iov = kmalloc(size, GFP_KERNEL); -		if (!sg_iov) { -			ret = -ENOMEM; -			goto out; -		} +		struct iovec *iov = NULL; -		if (copy_from_user(sg_iov, hdr->dxferp, size)) { -			kfree(sg_iov); -			ret = -EFAULT; +		ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, +					    0, NULL, &iov); +		if (ret < 0) { +			kfree(iov);  			goto out;  		} -		/* -		 * Sum up the vecs, making sure they don't overflow -		 */ -		iov = (struct iovec *) sg_iov; -		iov_data_len = 0; -		for (i = 0; i < hdr->iovec_count; i++) { -			if (iov_data_len + iov[i].iov_len < iov_data_len) { -				kfree(sg_iov); -				ret = -EINVAL; -				goto out; -			} -			iov_data_len += iov[i].iov_len; -		} +		iov_data_len = ret; +		ret = 0;  		/* SG_IO howto says that the shorter of the two wins */  		if (hdr->dxfer_len < iov_data_len) { @@ -361,9 +339,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,  			iov_data_len = hdr->dxfer_len;  		} -		ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, +		ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, +					  hdr->iovec_count,  					  iov_data_len, GFP_KERNEL); -		kfree(sg_iov); +		kfree(iov);  	} else if (hdr->dxfer_len)  		ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,  				      GFP_KERNEL); @@ -512,7 +491,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,  	memset(sense, 0, sizeof(sense));  	rq->sense = sense;  	rq->sense_len = 0; -	rq->cmd_type = REQ_TYPE_BLOCK_PC; +	blk_rq_set_block_pc(rq);  	blk_execute_rq(q, disk, rq, 0); @@ -545,7 +524,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,  	int err;  	rq = blk_get_request(q, WRITE, __GFP_WAIT); -	rq->cmd_type = REQ_TYPE_BLOCK_PC; +	blk_rq_set_block_pc(rq);  	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;  	rq->cmd[0] = cmd;  	rq->cmd[4] = data;  | 
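The sg_io() changes above (blk_rq_set_block_pc() replacing the open-coded cmd_type assignment, and rw_copy_check_uvector() replacing the hand-rolled iovec copy and overflow check) sit behind the SG_IO ioctl. As a rough userspace sketch of what exercises this path, not part of the patch, the following issues a standard SCSI INQUIRY through SG_IO; the device node and the 36-byte buffer size are arbitrary choices for illustration.

#include <fcntl.h>
#include <scsi/sg.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 36, 0 };	/* INQUIRY, 36-byte allocation length */
	unsigned char buf[36], sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sda", O_RDONLY);	/* example device node */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmdp = cdb;
	hdr.cmd_len = sizeof(cdb);
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxferp = buf;
	hdr.dxfer_len = sizeof(buf);
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;	/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0) {
		perror("SG_IO");
		close(fd);
		return 1;
	}

	/* bytes 8-15 of standard INQUIRY data hold the vendor identification */
	printf("vendor: %.8s\n", (char *)buf + 8);
	close(fd);
	return 0;
}

INQUIRY is a read-safe command, so it passes the blk_verify_command() filter shown earlier for any user who can open the device. With iovec_count left at zero the data buffer goes through blk_rq_map_user(); filling dxferp with an array of iovecs and a non-zero iovec_count would instead hit the new rw_copy_check_uvector() branch.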
