diff options
Diffstat (limited to 'block')
81 files changed, 21889 insertions, 5367 deletions
diff --git a/block/Kconfig b/block/Kconfig index 60be1e0455d..2429515c05c 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -65,6 +65,16 @@ config BLK_DEV_BSG If unsure, say Y. +config BLK_DEV_BSGLIB + bool "Block layer SG support v4 helper lib" + default n + select BLK_DEV_BSG + help + Subsystems will normally enable this if needed. Users will not + normally need to manually enable this. + + If unsure, say N. + config BLK_DEV_INTEGRITY bool "Block layer data integrity support" ---help--- @@ -79,7 +89,7 @@ config BLK_DEV_INTEGRITY config BLK_DEV_THROTTLING bool "Block layer bio throttling support" - depends on BLK_CGROUP=y && EXPERIMENTAL + depends on BLK_CGROUP=y default n ---help--- Block layer bio throttling support. It can be used to limit @@ -89,6 +99,23 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config BLK_CMDLINE_PARSER + bool "Block device command line partition parser" + default n + ---help--- + Enabling this option allows you to specify the partition layout from + the kernel boot args. This is typically of use for embedded devices + which don't otherwise have any standardized method for listing the + partitions on a block device. + + See Documentation/block/cmdline-partition.txt for more information. + +menu "Partition Types" + +source "block/partitions/Kconfig" + +endmenu + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76f795..421bef9c4c4 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,8 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - # If BLK_CGROUP is a module, CFQ has to be built as module. - depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -34,8 +32,6 @@ config IOSCHED_CFQ This is the default I/O scheduler. - Note: If BLK_CGROUP=m, then CFQ can be built only as module. 
- config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/block/Makefile b/block/Makefile index 0fec4b3fab5..a2ce6ac935e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,12 +2,17 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + partitions/ +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o @@ -16,3 +21,5 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o diff --git a/block/bio-integrity.c b/block/bio-integrity.c new file mode 100644 index 00000000000..9e241063a61 --- /dev/null +++ b/block/bio-integrity.c @@ -0,0 +1,657 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/export.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/slab.h> + +#define BIP_INLINE_VECS 4 + +static struct kmem_cache *bip_slab; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. 
+ */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + struct bio_integrity_payload *bip; + struct bio_set *bs = bio->bi_pool; + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; + + if (!bs) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + inline_vecs = BIP_INLINE_VECS; + } + + if (unlikely(!bip)) + return NULL; + + memset(bip, 0, sizeof(*bip)); + + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + } else { + bip->bip_vec = bip->bip_inline_vecs; + } + + bip->bip_slab = idx; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return NULL; +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). 
+ */ +void bio_integrity_free(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_set *bs = bio->bi_pool; + + if (bip->bip_owns_buf) + kfree(bip->bip_buf); + + if (bs) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + + mempool_free(bip, bs->bio_integrity_pool); + } else { + kfree(bip); + } + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) +{ + if (bip->bip_slab == BIO_POOL_NONE) + return BIP_INLINE_VECS; + + return bvec_nr_vecs(bip->bip_slab); +} + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip->bip_vec + bip->bip_vcnt; + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return 1; + + return 0; +} + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + 
* on this bio or not. bio data direction and target device must be + * set prior to calling. The function honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + if (!bio_is_rw(bio)) + return 0; + + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O.
+ */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_iter.bi_size == 0); + + return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, + int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, + nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. 
The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio + * @bio: bio to generate/verify integrity metadata for + * @operate: operate number, 1 for generate, 0 for verify + */ +static int bio_integrity_generate_verify(struct bio *bio, int operate) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector; + unsigned int sectors, ret = 0, i; + void *prot_buf = bio->bi_integrity->bip_buf; + + if (operate) + sector = bio->bi_iter.bi_sector; + else + sector = bio->bi_integrity->bip_iter.bi_sector; + + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment_all(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + if (operate) + bi->generate_fn(&bix); + else { + ret = bi->verify_fn(&bix); + if (ret) { + kunmap_atomic(kaddr); + return ret; + } + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + + kunmap_atomic(kaddr); + } + return ret; +} + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. 
+ */ +static void bio_integrity_generate(struct bio *bio) +{ + bio_integrity_generate_verify(bio, 1); +} + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set prior to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -ENOMEM; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_owns_buf = 1; + bip->bip_buf = buf; + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; + + /* Map it */
+ offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + return bio_integrity_generate_verify(bio, 0); +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. 
+ */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error; + + error = bio_integrity_verify(bio); + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + bio_endio_nodec(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: I/O completion status; non-zero means the I/O failed + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. + */ + if (error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); + + return; + } + + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly.
+ */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); + + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'sectors' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + bio_integrity_advance(bio, offset << 9); + bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @gfp_mask: Memory allocation mask + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_iter = bip_src->bip_iter; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + if (bs->bio_integrity_pool) + return
0; + + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + if (!bs->bio_integrity_pool) + return -1; + + bs->bvec_integrity_pool = biovec_create_pool(pool_size); + if (!bs->bvec_integrity_pool) { + mempool_destroy(bs->bio_integrity_pool); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bvec_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init(void) +{ + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!bip_slab) + panic("Failed to create slab\n"); +} diff --git a/block/bio.c b/block/bio.c new file mode 100644 index 00000000000..0ec61c9e536 --- /dev/null +++ b/block/bio.c @@ -0,0 +1,2052 @@ +/* + * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/uio.h> +#include <linux/iocontext.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mempool.h> +#include <linux/workqueue.h> +#include <linux/cgroup.h> +#include <scsi/sg.h> /* for struct sg_iovec */ + +#include <trace/events/block.h> + +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 + +/* + * if you change this list, also change bvec_alloc or things will + * break badly! cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { + BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), +}; +#undef BV + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools.
+ */ +struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); + +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + new_bio_slab_max = bio_slab_max << 1; + new_bio_slabs = krealloc(bio_slabs, + new_bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!new_bio_slabs) + goto out_unlock; + bio_slab_max = new_bio_slab_max; + bio_slabs = new_bio_slabs; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + 
kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) +{ + BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); + + if (idx == BIOVEC_MAX_IDX) + mempool_free(bv, pool); + else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) +{ + struct bio_vec *bvl; + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BIOVEC_MAX_IDX) { +fallback: + bvl = mempool_alloc(pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + + /* + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. + */ + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + /* + * Try a slab allocation. 
If this fails and __GFP_WAIT + * is set, retry with the 1-entry mempool + */ + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + *idx = BIOVEC_MAX_IDX; + goto fallback; + } + } + + return bvl; +} + +static void __bio_free(struct bio *bio) +{ + bio_disassociate_task(bio); + + if (bio_integrity(bio)) + bio_integrity_free(bio); +} + +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; + void *p; + + __bio_free(bio); + + if (bs) { + if (bio_flagged(bio, BIO_OWNS_VEC)) + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; + p -= bs->front_pad; + + mempool_free(p, bs->bio_pool); + } else { + /* Bio was allocated by bio_kmalloc() */ + kfree(bio); + } +} + +void bio_init(struct bio *bio) +{ + memset(bio, 0, sizeof(*bio)); + bio->bi_flags = 1 << BIO_UPTODATE; + atomic_set(&bio->bi_remaining, 1); + atomic_set(&bio->bi_cnt, 1); +} +EXPORT_SYMBOL(bio_init); + +/** + * bio_reset - reinitialize a bio + * @bio: bio to reset + * + * Description: + * After calling bio_reset(), @bio will be in the same state as a freshly + * allocated bio returned by bio_alloc_bioset() - the only fields that are + * preserved are the ones that are initialized by bio_alloc_bioset(). See + * comment in struct bio.
+ */ +void bio_reset(struct bio *bio) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + __bio_free(bio); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags|(1 << BIO_UPTODATE); + atomic_set(&bio->bi_remaining, 1); +} +EXPORT_SYMBOL(bio_reset); + +static void bio_chain_endio(struct bio *bio, int error) +{ + bio_endio(bio->bi_private, error); + bio_put(bio); +} + +/** + * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. + */ +void bio_chain(struct bio *bio, struct bio *parent) +{ + BUG_ON(bio->bi_private || bio->bi_end_io); + + bio->bi_private = parent; + bio->bi_end_io = bio_chain_endio; + atomic_inc(&parent->bi_remaining); +} +EXPORT_SYMBOL(bio_chain); + +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. 
+ * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + +/** + * bio_alloc_bioset - allocate a bio for I/O + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * @bs: the bio_set to allocate from. + * + * Description: + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. + * + * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be + * able to allocate a bio. This is due to the mempool guarantees. To make this + * work, callers must never allocate more than 1 bio at a time from this pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. + * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. 
Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +{ + gfp_t saved_gfp = gfp_mask; + unsigned front_pad; + unsigned inline_vecs; + unsigned long idx = BIO_POOL_NONE; + struct bio_vec *bvl = NULL; + struct bio *bio; + void *p; + + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + /* + * kmalloc path: the bio_vecs are allocated together with the + * bio, so inline_vecs == nr_iovecs and the bvec_alloc() branch + * below is never taken with a NULL @bs. + */ + p = kmalloc(sizeof(struct bio) + + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. 
+ */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + /* gfp_mask != saved_gfp means __GFP_WAIT was stripped above */ + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + + if (unlikely(!p)) + return NULL; + + /* the bio itself starts front_pad bytes into the allocation */ + bio = p + front_pad; + bio_init(bio); + + if (nr_iovecs > inline_vecs) { + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + } + + if (unlikely(!bvl)) + goto err_free; + + bio->bi_flags |= 1 << BIO_OWNS_VEC; + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; + } + + bio->bi_pool = bs; + bio->bi_flags |= idx << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + return bio; + +err_free: + /* only reached from the bvec_alloc() failure above, i.e. with a non-NULL @bs */ + mempool_free(p, bs->bio_pool); + return NULL; +} +EXPORT_SYMBOL(bio_alloc_bioset); + +/* + * Overwrite the data of every segment of @bio with zeroes, mapping each + * segment with bvec_kmap_irq() and flushing the dcache for its page. + */ +void zero_fill_bio(struct bio *bio) +{ + unsigned long flags; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + flush_dcache_page(bv.bv_page); + bvec_kunmap_irq(data, &flags); + } +} +EXPORT_SYMBOL(zero_fill_bio); + +/** + * bio_put - release a reference to a bio + * @bio: bio to release reference to + * + * Description: + * Put a reference to a &struct bio, either one you have gotten with + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. 
+ **/ +void bio_put(struct bio *bio) +{ + BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->bi_cnt)) + bio_free(bio); +} +EXPORT_SYMBOL(bio_put); + +inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +{ + if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + blk_recount_segments(q, bio); + + return bio->bi_phys_segments; +} +EXPORT_SYMBOL(bio_phys_segments); + +/** + * __bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a &bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. + */ +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +{ + BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); + + /* + * most users will be overriding ->bi_bdev with a new target, + * so we don't set nor calculate new physical/hw segment counts here + */ + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_flags |= 1 << BIO_CLONED; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter = bio_src->bi_iter; + bio->bi_io_vec = bio_src->bi_io_vec; +} +EXPORT_SYMBOL(__bio_clone_fast); + +/** + * bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Like __bio_clone_fast, only also allocates the returned bio + */ +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio *b; + + b = bio_alloc_bioset(gfp_mask, 0, bs); + if (!b) + return NULL; + + __bio_clone_fast(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL(bio_clone_fast); + +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: 
allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. 
+ */ + + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (!bio) + return NULL; + + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + /* a discard clone copies no bio_vecs at all */ + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + /* for WRITE SAME only the source's first bio_vec is copied */ + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + +integrity_clone: + if (bio_integrity(bio_src)) { + int ret; + + ret = bio_integrity_clone(bio, bio_src, gfp_mask); + if (ret < 0) { + bio_put(bio); + return NULL; + } + } + + return bio; +} +EXPORT_SYMBOL(bio_clone_bioset); + +/** + * bio_get_nr_vecs - return approx number of vecs + * @bdev: I/O target + * + * Return the approximate number of pages we can send to this target. + * There's no guarantee that you will be able to fit this number of pages + * into a bio, it does not account for dynamic restrictions that vary + * on offset. + */ +int bio_get_nr_vecs(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + int nr_pages; + + nr_pages = min_t(unsigned, + queue_max_segments(q), + queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); + + return min_t(unsigned, nr_pages, BIO_MAX_PAGES); + +} +EXPORT_SYMBOL(bio_get_nr_vecs); + +/* + * Common helper behind bio_add_pc_page() and bio_add_page(): try to append + * @len bytes of @page starting at @offset to @bio without exceeding + * @max_sectors or the limits of queue @q. Returns @len when the data was + * added and 0 when it was refused. + */ +static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset, + unsigned int max_sectors) +{ + int retried_segments = 0; + struct bio_vec *bvec; + + /* + * cloned bio must not modify vec list + */ + if (unlikely(bio_flagged(bio, BIO_CLONED))) + return 0; + + /* refuse if the bio would grow beyond @max_sectors */ + if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + return 0; + + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. 
+ */ + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == prev->bv_page && + offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; + /* grow the previous bio_vec speculatively; undone below if refused */ + prev->bv_len += len; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. */ + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size - + prev_bv_len, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { + prev->bv_len -= len; + return 0; + } + } + + goto done; + } + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && + bvec_gap_to_prev(prev, offset)) + return 0; + } + + /* no free slot left in the bio_vec array */ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + /* + * we might lose a segment or two here, but rather that than + * make this too complex. 
+ */ + + /* at the segment limit: recount once, then give up if still at the limit */ + while (bio->bi_phys_segments >= queue_max_segments(q)) { + + if (retried_segments) + return 0; + + retried_segments = 1; + blk_recount_segments(q, bio); + } + + /* + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + + /* + * if queue has other restrictions (eg varying max sector size + * depending on offset), it can specify a merge_bvec_fn in the + * queue to get further control + */ + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = bio->bi_iter.bi_size, + .bi_rw = bio->bi_rw, + }; + + /* + * merge_bvec_fn() returns number of bytes it can accept + * at this offset + */ + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + return 0; + } + } + + /* If we may be able to merge these biovecs, force a recount */ + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + + bio->bi_vcnt++; + bio->bi_phys_segments++; + done: + /* reached both after a merge into the previous bio_vec and after appending a new one */ + bio->bi_iter.bi_size += len; + return len; +} + +/** + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. 
+ */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); +} +EXPORT_SYMBOL(bio_add_pc_page); + +/** + * bio_add_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + */ +int bio_add_page(struct bio *bio, struct page *page, unsigned int len, + unsigned int offset) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int max_sectors; + + max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) + max_sectors = len >> 9; + + return __bio_add_page(q, bio, page, len, offset, max_sectors); +} +EXPORT_SYMBOL(bio_add_page); + +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. 
+ */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio_advance_iter(bio, &bio->bi_iter, bytes); +} +EXPORT_SYMBOL(bio_advance); + +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
+ */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter, dst_iter; + struct bio_vec src_bv, dst_bv; + void *src_p, *dst_p; + unsigned bytes; + + src_iter = src->bi_iter; + dst_iter = dst->bi_iter; + + while (1) { + /* current source bio exhausted: step to the next one in the list */ + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; + + src_iter = src->bi_iter; + } + + /* same for the destination list */ + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; + + dst_iter = dst->bi_iter; + } + + src_bv = bio_iter_iovec(src, src_iter); + dst_bv = bio_iter_iovec(dst, dst_iter); + + /* copy as much as both current segments can take */ + bytes = min(src_bv.bv_len, dst_bv.bv_len); + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + bio_advance_iter(src, &src_iter, bytes); + bio_advance_iter(dst, &dst_iter, bytes); + } +} +EXPORT_SYMBOL(bio_copy_data); + +/* + * Bookkeeping hung off bi_private of a bounced bio by bio_set_map_data(): a + * copy of the caller's iovec plus a flag telling bio_uncopy_user() whether the + * bio's pages have to be freed on completion. + */ +struct bio_map_data { + int nr_sgvecs; + int is_our_pages; + struct sg_iovec sgvecs[]; +}; + +/* + * Fill @bmd with a copy of the first @iov_count entries of @iov and the + * @is_our_pages flag, and attach it to @bio through bi_private. + */ +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, + const struct sg_iovec *iov, int iov_count, + int is_our_pages) +{ + memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); + bmd->nr_sgvecs = iov_count; + bmd->is_our_pages = is_our_pages; + bio->bi_private = bmd; +} + +/* + * Allocate a bio_map_data with room for @iov_count iovec entries. Returns + * NULL when @iov_count exceeds UIO_MAXIOV or when kmalloc() fails. + */ +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, + gfp_t gfp_mask) +{ + if (iov_count > UIO_MAXIOV) + return NULL; + + return kmalloc(sizeof(struct bio_map_data) + + sizeof(struct sg_iovec) * iov_count, gfp_mask); +} + +/* + * Walk the segments of @bio and the entries of @iov in lockstep, copying + * bio -> user memory when @to_user is set and user memory -> bio when + * @from_user is set. After the first failed copy the result is latched to + * -EFAULT and no further copying is attempted, but the walk continues so that + * every page is still freed when @do_free_page is set. + * + * Returns 0 on success or -EFAULT. + */ +static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, + int to_user, int from_user, int do_free_page) +{ + int ret = 0, i; + struct bio_vec *bvec; + int iov_idx = 0; + unsigned int iov_off = 0; + + bio_for_each_segment_all(bvec, bio, i) { + char *bv_addr = page_address(bvec->bv_page); + unsigned int bv_len = bvec->bv_len; + + while (bv_len && iov_idx < iov_count) { + unsigned int bytes; + char __user 
*iov_addr; + + bytes = min_t(unsigned int, + iov[iov_idx].iov_len - iov_off, bv_len); + iov_addr = iov[iov_idx].iov_base + iov_off; + + if (!ret) { + if (to_user) + ret = copy_to_user(iov_addr, bv_addr, + bytes); + + if (from_user) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + + if (ret) + ret = -EFAULT; + } + + bv_len -= bytes; + bv_addr += bytes; + iov_addr += bytes; + iov_off += bytes; + + /* this iovec entry is used up: move on to the next one */ + if (iov[iov_idx].iov_len == iov_off) { + iov_idx++; + iov_off = 0; + } + } + + if (do_free_page) + __free_page(bvec->bv_page); + } + + return ret; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user() and write back data + * to user space in case of a read. + */ +int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + struct bio_vec *bvec; + int ret = 0, i; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free. + */ + if (current->mm) + ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, + bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); + else if (bmd->is_our_pages) + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + } + kfree(bmd); + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(bio_uncopy_user); + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iov: the iovec. + * @iov_count: number of elements in the iovec + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. 
+ */ +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + const struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int i, ret; + int nr_pages = 0; + unsigned int len = 0; + unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr; + unsigned long end; + unsigned long start; + + uaddr = (unsigned long)iov[i].iov_base; + end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + len += iov[i].iov_len; + } + + if (offset) + nr_pages++; + + bmd = bio_alloc_map_data(iov_count, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + break; + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + /* + * success + */ + if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); + if (ret) + goto cleanup; + } + + bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 
0 : 1); + return bio; +cleanup: + if (!map_data) + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + + bio_put(bio); +out_bmd: + kfree(bmd); + return ERR_PTR(ret); +} + +/** + * bio_copy_user - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, + unsigned long uaddr, unsigned int len, + int write_to_vm, gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_copy_user); + +static struct bio *__bio_map_user_iov(struct request_queue *q, + struct block_device *bdev, + const struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + int i, j; + int nr_pages = 0; + struct page **pages; + struct bio *bio; + int cur_page = 0; + int ret, offset; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + unsigned long len = iov[i].iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + /* + * buffer must be aligned to at least hardsector size for now + */ + if (uaddr & queue_dma_alignment(q)) + return ERR_PTR(-EINVAL); + } + + if (!nr_pages) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + pages = kcalloc(nr_pages, 
sizeof(struct page *), gfp_mask); + if (!pages) + goto out; + + for (i = 0; i < iov_count; i++) { + unsigned long uaddr = (unsigned long)iov[i].iov_base; + unsigned long len = iov[i].iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + const int local_nr_pages = end - start; + const int page_limit = cur_page + local_nr_pages; + + ret = get_user_pages_fast(uaddr, local_nr_pages, + write_to_vm, &pages[cur_page]); + if (ret < local_nr_pages) { + ret = -EFAULT; + goto out_unmap; + } + + offset = uaddr & ~PAGE_MASK; + for (j = cur_page; j < page_limit; j++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + /* + * sorry... + */ + if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < + bytes) + break; + + len -= bytes; + offset = 0; + } + + cur_page = j; + /* + * release the pages we didn't map into the bio, if any + */ + while (j < page_limit) + page_cache_release(pages[j++]); + } + + kfree(pages); + + /* + * set data direction, and check if mapped pages need bouncing + */ + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; + + bio->bi_bdev = bdev; + bio->bi_flags |= (1 << BIO_USER_MAPPED); + return bio; + + out_unmap: + for (i = 0; i < nr_pages; i++) { + if(!pages[i]) + break; + page_cache_release(pages[i]); + } + out: + kfree(pages); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_map_user - map user address into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. 
+ */ +struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, + unsigned long uaddr, unsigned int len, int write_to_vm, + gfp_t gfp_mask) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); +} +EXPORT_SYMBOL(bio_map_user); + +/** + * bio_map_user_iov - map user sg_iovec table into bio + * @q: the struct request_queue for the bio + * @bdev: destination block device + * @iov: the iovec. + * @iov_count: number of elements in the iovec + * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, + const struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, + gfp_mask); + if (IS_ERR(bio)) + return bio; + + /* + * subtle -- if __bio_map_user() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + + return bio; +} + +static void __bio_unmap_user(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + /* + * make sure we dirty pages we wrote to + */ + bio_for_each_segment_all(bvec, bio, i) { + if (bio_data_dir(bio) == READ) + set_page_dirty_lock(bvec->bv_page); + + page_cache_release(bvec->bv_page); + } + + bio_put(bio); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user(). Must be called with + * a process context. + * + * bio_unmap_user() may sleep. 
+ */ +void bio_unmap_user(struct bio *bio) +{ + __bio_unmap_user(bio); + /* second put: presumably pairs with the extra bio_get() in bio_map_user_iov() */ + bio_put(bio); +} +EXPORT_SYMBOL(bio_unmap_user); + +/* end_io callback for bios built by __bio_map_kern(): just drops a reference. */ +static void bio_map_kern_endio(struct bio *bio, int err) +{ + bio_put(bio); +} + +/* + * Build a bio covering the @len bytes of kernel memory at @data, adding one + * page at a time with bio_add_pc_page(). Stops early if a page is refused, so + * the returned bio may cover less than @len bytes. Returns ERR_PTR(-ENOMEM) + * if the bio cannot be allocated. + */ +static struct bio *__bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + /* only the first page can start at a non-zero offset */ + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + offset) < bytes) + break; + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_kern(q, data, len, gfp_mask); + if (IS_ERR(bio)) + return bio; + + /* the whole buffer was mapped */ + if (bio->bi_iter.bi_size == len) + return bio; + + /* + * Don't support partial mappings. 
+ */ + bio_put(bio); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL(bio_map_kern); + +/* + * end_io callback for bios built by bio_copy_kern(): for a read, copies each + * bounce page back into the original kernel buffer (whose address was saved + * in the first sgvec of the bio_map_data); in all cases frees the bounce + * pages, the bio_map_data and a reference on the bio. + */ +static void bio_copy_kern_endio(struct bio *bio, int err) +{ + struct bio_vec *bvec; + const int read = bio_data_dir(bio) == READ; + struct bio_map_data *bmd = bio->bi_private; + int i; + char *p = bmd->sgvecs[0].iov_base; + + bio_for_each_segment_all(bvec, bio, i) { + char *addr = page_address(bvec->bv_page); + + if (read) + memcpy(p, addr, bvec->bv_len); + + __free_page(bvec->bv_page); + p += bvec->bv_len; + } + + kfree(bmd); + bio_put(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask, int reading) +{ + struct bio *bio; + struct bio_vec *bvec; + int i; + + /* write_to_vm is passed as 1 here; the data for a write is copied in below */ + bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); + if (IS_ERR(bio)) + return bio; + + if (!reading) { + void *p = data; + + /* fill the bounce pages from the kernel buffer */ + bio_for_each_segment_all(bvec, bio, i) { + char *addr = page_address(bvec->bv_page); + + memcpy(addr, p, bvec->bv_len); + p += bvec->bv_len; + } + } + + bio->bi_end_io = bio_copy_kern_endio; + + return bio; +} +EXPORT_SYMBOL(bio_copy_kern); + +/* + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions + * for performing direct-IO in BIOs. + * + * The problem is that we cannot run set_page_dirty() from interrupt context + * because the required locks are not interrupt-safe. So what we can do is to + * mark the pages dirty _before_ performing IO. And in interrupt context, + * check that the pages are still dirty. If so, fine. If not, redirty them + * in process context. 
+ * + * We special-case compound pages here: normally this means reads into hugetlb + * pages. The logic in here doesn't really work right for compound pages + * because the VM does not uniformly chase down the head page in all cases. + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't + * handle them at all. So we skip compound pages here at an early stage. + * + * Note that this code is very hard to test under normal circumstances because + * direct-io pins the pages with get_user_pages(). This makes + * is_page_cache_freeable return false, and the VM will not clean the pages. + * But other code (eg, flusher threads) could clean the pages if they are mapped + * pagecache. + * + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the + * deferred bio dirtying paths. + */ + +/* + * bio_set_pages_dirty() will mark all the bio's pages as dirty. + */ +void bio_set_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (page && !PageCompound(page)) + set_page_dirty_lock(page); + } +} + +static void bio_release_pages(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (page) + put_page(page); + } +} + +/* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. + * If they are, then fine. If, however, some pages are clean then they must + * have been written out during the direct-IO read. So we take another ref on + * the BIO and the offending pages and re-dirty the pages in process context. + * + * It is expected that bio_check_pages_dirty() will wholly own the BIO from + * here on. It will run one page_cache_release() against each page and will + * run one bio_put() against the BIO. 
+ */ + +static void bio_dirty_fn(struct work_struct *work); + +static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); +static DEFINE_SPINLOCK(bio_dirty_lock); +/* singly linked list of bios awaiting redirtying, chained through bi_private */ +static struct bio *bio_dirty_list; + +/* + * This runs in process context + */ +static void bio_dirty_fn(struct work_struct *work) +{ + unsigned long flags; + struct bio *bio; + + /* detach the whole pending list under the lock, then process it unlocked */ + spin_lock_irqsave(&bio_dirty_lock, flags); + bio = bio_dirty_list; + bio_dirty_list = NULL; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + + while (bio) { + struct bio *next = bio->bi_private; + + bio_set_pages_dirty(bio); + bio_release_pages(bio); + bio_put(bio); + bio = next; + } +} + +void bio_check_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec; + int nr_clean_pages = 0; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + /* + * Nothing to redo for this page: release it now and clear the + * slot so that bio_dirty_fn()'s helpers skip it. + */ + if (PageDirty(page) || PageCompound(page)) { + page_cache_release(page); + bvec->bv_page = NULL; + } else { + nr_clean_pages++; + } + } + + if (nr_clean_pages) { + unsigned long flags; + + /* push the bio onto the pending list and let the work item redirty it */ + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } else { + bio_put(bio); + } +} + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +/* Call flush_dcache_page() on the page of every segment of @bi. */ +void bio_flush_dcache_pages(struct bio *bi) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + bio_for_each_segment(bvec, bi, iter) + flush_dcache_page(bvec.bv_page); +} +EXPORT_SYMBOL(bio_flush_dcache_pages); +#endif + +/** + * bio_endio - end I/O on a bio + * @bio: bio + * @error: error, if any + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the + * preferred way to end I/O on a bio, it takes care of clearing + * BIO_UPTODATE on error. @error is 0 on success, and one of the + * established -Exxxx (-EIO, for instance) error values in case + * something went wrong. 
No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io + * function. + **/ +void bio_endio(struct bio *bio, int error) +{ + while (bio) { + BUG_ON(atomic_read(&bio->bi_remaining) <= 0); + + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (!atomic_dec_and_test(&bio->bi_remaining)) + return; + + /* + * Need to have a real endio function for chained bios, + * otherwise various corner cases will break (like stacking + * block devices that save/restore bi_end_io) - however, we want + * to avoid unbounded recursion and blowing the stack. Tail call + * optimization would handle this, but compiling with frame + * pointers also disables gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + struct bio *parent = bio->bi_private; + bio_put(bio); + bio = parent; + } else { + if (bio->bi_end_io) + bio->bi_end_io(bio, error); + bio = NULL; + } + } +} +EXPORT_SYMBOL(bio_endio); + +/** + * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining + * @bio: bio + * @error: error, if any + * + * For code that has saved and restored bi_end_io; think hard before using this + * function, probably you should've cloned the entire bio. + **/ +void bio_endio_nodec(struct bio *bio, int error) +{ + atomic_inc(&bio->bi_remaining); + bio_endio(bio, error); +} +EXPORT_SYMBOL(bio_endio_nodec); + +/** + * bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's + * responsibility to ensure that @bio is not freed before the split.
+ */ +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + struct bio *split = NULL; + + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); + + split = bio_clone_fast(bio, gfp, bs); + if (!split) + return NULL; + + split->bi_iter.bi_size = sectors << 9; + + if (bio_integrity(split)) + bio_integrity_trim(split, 0, sectors); + + bio_advance(bio, split->bi_iter.bi_size); + + return split; +} +EXPORT_SYMBOL(bio_split); + +/** + * bio_trim - trim a bio + * @bio: bio to trim + * @offset: number of sectors to trim from the front of @bio + * @size: size we want to trim @bio to, in sectors + */ +void bio_trim(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + */ + + size <<= 9; + if (offset == 0 && size == bio->bi_iter.bi_size) + return; + + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + bio_advance(bio, offset << 9); + + bio->bi_iter.bi_size = size; +} +EXPORT_SYMBOL_GPL(bio_trim); + +/* + * create memory pools for biovec's in a bio_set. + * use the global biovec slabs created for general use. + */ +mempool_t *biovec_create_pool(int pool_entries) +{ + struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; + + return mempool_create_slab_pool(pool_entries, bp->slab); +} + +void bioset_free(struct bio_set *bs) +{ + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + + if (bs->bio_pool) + mempool_destroy(bs->bio_pool); + + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + + bioset_integrity_free(bs); + bio_put_slab(bs); + + kfree(bs); +} +EXPORT_SYMBOL(bioset_free); + +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. 
+ * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +{ + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + struct bio_set *bs; + + bs = kzalloc(sizeof(*bs), GFP_KERNEL); + if (!bs) + return NULL; + + bs->front_pad = front_pad; + + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + if (!bs->bio_slab) { + kfree(bs); + return NULL; + } + + bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); + if (!bs->bio_pool) + goto bad; + + bs->bvec_pool = biovec_create_pool(pool_size); + if (!bs->bvec_pool) + goto bad; + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + + return bs; +bad: + bioset_free(bs); + return NULL; +} +EXPORT_SYMBOL(bioset_create); + +#ifdef CONFIG_BLK_CGROUP +/** + * bio_associate_current - associate a bio with %current + * @bio: target bio + * + * Associate @bio with %current if it hasn't been associated yet. Block + * layer will treat @bio as if it were issued by %current no matter which + * task actually issues it. + * + * This function takes an extra reference of @task's io_context and blkcg + * which will be put when @bio is released. The caller must own @bio, + * ensure %current->io_context exists, and is responsible for synchronizing + * calls to this function. 
+ */ +int bio_associate_current(struct bio *bio) +{ + struct io_context *ioc; + struct cgroup_subsys_state *css; + + if (bio->bi_ioc) + return -EBUSY; + + ioc = current->io_context; + if (!ioc) + return -ENOENT; + + /* acquire active ref on @ioc and associate */ + get_io_context_active(ioc); + bio->bi_ioc = ioc; + + /* associate blkcg if exists */ + rcu_read_lock(); + css = task_css(current, blkio_cgrp_id); + if (css && css_tryget_online(css)) + bio->bi_css = css; + rcu_read_unlock(); + + return 0; +} + +/** + * bio_disassociate_task - undo bio_associate_current() + * @bio: target bio + */ +void bio_disassociate_task(struct bio *bio) +{ + if (bio->bi_ioc) { + put_io_context(bio->bi_ioc); + bio->bi_ioc = NULL; + } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } +} + +#endif /* CONFIG_BLK_CGROUP */ + +static void __init biovec_init_slabs(void) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + int size; + struct biovec_slab *bvs = bvec_slabs + i; + + if (bvs->nr_vecs <= BIO_INLINE_VECS) { + bvs->slab = NULL; + continue; + } + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} + +static int __init init_bio(void) +{ + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); + if (!bio_slabs) + panic("bio: can't allocate bios\n"); + + bio_integrity_init(); + biovec_init_slabs(); + + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!fs_bio_set) + panic("bio: can't allocate bios\n"); + + if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) + panic("bio: can't create integrity pool\n"); + + return 0; +} +subsys_initcall(init_bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 455768a3eb9..28d227c5ca7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -11,1522 +11,1143 @@ * Nauman Rafique <nauman@google.com> */ #include <linux/ioprio.h> -#include <linux/seq_file.h> 
#include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/slab.h> -#include "blk-cgroup.h" #include <linux/genhd.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include "blk-cgroup.h" +#include "blk.h" #define MAX_KEY_LEN 100 -static DEFINE_SPINLOCK(blkio_list_lock); -static LIST_HEAD(blkio_list); - -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; -EXPORT_SYMBOL_GPL(blkio_root_cgroup); - -static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, - struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct task_struct *, bool); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup *, struct task_struct *, bool); -static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); -static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); - -/* for encoding cft->private value on file */ -#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) -/* What policy owns the file, proportional or throttle */ -#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) -#define BLKIOFILE_ATTR(val) ((val) & 0xffff) - -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, - .populate = blkiocg_populate, -#ifdef CONFIG_BLK_CGROUP - /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ - .subsys_id = blkio_subsys_id, -#endif - .use_id = 1, - .module = THIS_MODULE, -}; -EXPORT_SYMBOL_GPL(blkio_subsys); - -static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) -{ - list_add(&pn->node, &blkcg->policy_list); -} +static DEFINE_MUTEX(blkcg_pol_mutex); -static inline bool cftype_blkg_same_policy(struct cftype *cft, - struct blkio_group *blkg) -{ - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); +struct 
blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, + .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +EXPORT_SYMBOL_GPL(blkcg_root); - if (blkg->plid == plid) - return 1; +static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; - return 0; -} - -/* Determines if policy node matches cgroup file being accessed */ -static inline bool pn_matches_cftype(struct cftype *cft, - struct blkio_policy_node *pn) +static bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) { - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - return (plid == pn->plid && fileid == pn->fileid); + return pol && test_bit(pol->plid, q->blkcg_pols); } -/* Must be called with blkcg->lock held */ -static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) -{ - list_del(&pn->node); -} - -/* Must be called with blkcg->lock held */ -static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, - enum blkio_policy_id plid, int fileid) +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. 
+ */ +static void blkg_free(struct blkcg_gq *blkg) { - struct blkio_policy_node *pn; + int i; - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) - return pn; - } + if (!blkg) + return; - return NULL; -} + for (i = 0; i < BLKCG_MAX_POLS; i++) + kfree(blkg->pd[i]); -struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkio_cgroup, css); + blk_exit_rl(&blkg->rl); + kfree(blkg); } -EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -static inline void -blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +/** + * blkg_alloc - allocate a blkg + * @blkcg: block cgroup the new blkg is associated with + * @q: request_queue the new blkg is associated with + * @gfp_mask: allocation mask to use + * + * Allocate a new blkg associating @blkcg and @q. + */ +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, + gfp_t gfp_mask) { - struct blkio_policy_type *blkiop; + struct blkcg_gq *blkg; + int i; - list_for_each_entry(blkiop, &blkio_list, list) { - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) - continue; - if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg->key, - blkg, weight); + /* alloc and init base part */ + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + if (!blkg) + return NULL; + + blkg->q = q; + INIT_LIST_HEAD(&blkg->q_node); + blkg->blkcg = blkcg; + atomic_set(&blkg->refcnt, 1); + + /* root blkg uses @q->root_rl, init rl only for !root blkgs */ + if (blkcg != &blkcg_root) { + if (blk_init_rl(&blkg->rl, q, gfp_mask)) + goto err_free; + blkg->rl.blkg = blkg; } -} - -static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, - int fileid) -{ - struct blkio_policy_type *blkiop; - list_for_each_entry(blkiop, &blkio_list, list) { + for (i = 0; i <
BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkg_policy_data *pd; - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (!blkcg_policy_enabled(q, pol)) continue; - if (fileid == BLKIO_THROTL_read_bps_device - && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, - blkg, bps); + /* alloc per-policy data and attach it to blkg */ + pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + if (!pd) + goto err_free; - if (fileid == BLKIO_THROTL_write_bps_device - && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, - blkg, bps); + blkg->pd[i] = pd; + pd->blkg = blkg; + pd->plid = i; } -} -static inline void blkio_update_group_iops(struct blkio_group *blkg, - unsigned int iops, int fileid) -{ - struct blkio_policy_type *blkiop; + return blkg; - list_for_each_entry(blkiop, &blkio_list, list) { +err_free: + blkg_free(blkg); + return NULL; +} - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) - continue; +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. 
+ */ +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; - if (fileid == BLKIO_THROTL_read_iops_device - && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, - blkg, iops); + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; - if (fileid == BLKIO_THROTL_write_iops_device - && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, - blkg,iops); + /* + * Hint didn't match. Look up from the radix tree. Note that the + * hint can only be updated under queue_lock as otherwise @blkg + * could have already been removed from blkg_tree. The caller is + * responsible for grabbing queue_lock if @update_hint. + */ + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q == q) { + if (update_hint) { + lockdep_assert_held(q->queue_lock); + rcu_assign_pointer(blkcg->blkg_hint, blkg); + } + return blkg; } + + return NULL; } -/* - * Add to the appropriate stat variable depending on the request type. - * This should be called with the blkg->stats_lock held. +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. 
*/ -static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, - bool sync) +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { - if (direction) - stat[BLKIO_STAT_WRITE] += add; - else - stat[BLKIO_STAT_READ] += add; - if (sync) - stat[BLKIO_STAT_SYNC] += add; - else - stat[BLKIO_STAT_ASYNC] += add; + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); } +EXPORT_SYMBOL_GPL(blkg_lookup); /* - * Decrements the appropriate stat variable if non-zero depending on the - * request type. Panics on value being zero. - * This should be called with the blkg->stats_lock held. + * If @new_blkg is %NULL, this function tries to allocate a new one as + * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. */ -static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) -{ - if (direction) { - BUG_ON(stat[BLKIO_STAT_WRITE] == 0); - stat[BLKIO_STAT_WRITE]--; - } else { - BUG_ON(stat[BLKIO_STAT_READ] == 0); - stat[BLKIO_STAT_READ]--; - } - if (sync) { - BUG_ON(stat[BLKIO_STAT_SYNC] == 0); - stat[BLKIO_STAT_SYNC]--; - } else { - BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); - stat[BLKIO_STAT_ASYNC]--; - } -} - -#ifdef CONFIG_DEBUG_BLK_CGROUP -/* This should be called with the blkg->stats_lock held. */ -static void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, + struct request_queue *q, + struct blkcg_gq *new_blkg) { - if (blkio_blkg_waiting(&blkg->stats)) - return; - if (blkg == curr_blkg) - return; - blkg->stats.start_group_wait_time = sched_clock(); - blkio_mark_blkg_waiting(&blkg->stats); -} + struct blkcg_gq *blkg; + int i, ret; -/* This should be called with the blkg->stats_lock held. 
*/ -static void blkio_update_group_wait_time(struct blkio_group_stats *stats) -{ - unsigned long long now; + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); - if (!blkio_blkg_waiting(stats)) - return; + /* blkg holds a reference to blkcg */ + if (!css_tryget_online(&blkcg->css)) { + ret = -EINVAL; + goto err_free_blkg; + } - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - stats->group_wait_time += now - stats->start_group_wait_time; - blkio_clear_blkg_waiting(stats); -} + /* allocate */ + if (!new_blkg) { + new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + if (unlikely(!new_blkg)) { + ret = -ENOMEM; + goto err_put_css; + } + } + blkg = new_blkg; + + /* link parent */ + if (blkcg_parent(blkcg)) { + blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + if (WARN_ON_ONCE(!blkg->parent)) { + ret = -EINVAL; + goto err_put_css; + } + blkg_get(blkg->parent); + } -/* This should be called with the blkg->stats_lock held. */ -static void blkio_end_empty_time(struct blkio_group_stats *stats) -{ - unsigned long long now; + /* invoke per-policy init */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; - if (!blkio_blkg_empty(stats)) - return; + if (blkg->pd[i] && pol->pd_init_fn) + pol->pd_init_fn(blkg); + } - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - stats->empty_time += now - stats->start_empty_time; - blkio_clear_blkg_empty(stats); -} + /* insert */ + spin_lock(&blkcg->lock); + ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + if (likely(!ret)) { + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &q->blkg_list); -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) -{ - unsigned long flags; + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; - spin_lock_irqsave(&blkg->stats_lock, flags); - BUG_ON(blkio_blkg_idling(&blkg->stats)); - 
blkg->stats.start_idle_time = sched_clock(); - blkio_mark_blkg_idling(&blkg->stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + if (blkg->pd[i] && pol->pd_online_fn) + pol->pd_online_fn(blkg); + } + } + blkg->online = true; + spin_unlock(&blkcg->lock); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ - unsigned long flags; - unsigned long long now; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - if (blkio_blkg_idling(stats)) { - now = sched_clock(); - if (time_after64(now, stats->start_idle_time)) - stats->idle_time += now - stats->start_idle_time; - blkio_clear_blkg_idling(stats); + if (!ret) { + if (blkcg == &blkcg_root) { + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + } + return blkg; } - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) -{ - unsigned long flags; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - stats->avg_queue_size_sum += - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; - stats->avg_queue_size_samples++; - blkio_update_group_wait_time(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* @blkg failed fully initialized, use the usual release path */ + blkg_put(blkg); + return ERR_PTR(ret); + +err_put_css: + css_put(&blkcg->css); +err_free_blkg: + blkg_free(new_blkg); + return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); -void blkiocg_set_start_empty_time(struct blkio_group *blkg) +/** + * blkg_lookup_create - lookup blkg, try to create one if not there + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * create one. 
blkg creation is performed recursively from blkcg_root such + * that all non-root blkg's have access to the parent blkg. This function + * should be called under RCU read lock and @q->queue_lock. + * + * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). + */ +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { - unsigned long flags; - struct blkio_group_stats *stats; + struct blkcg_gq *blkg; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); - if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; - } + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + return blkg; /* - * group is already marked empty. This can happen if cfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. 
*/ - if(blkio_blkg_empty(stats)) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; - } + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent = blkcg_parent(blkcg); - stats->start_empty_time = sched_clock(); - blkio_mark_blkg_empty(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } -void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) -{ - blkg->stats.dequeue += dequeue; -} -EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); -#else -static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) {} -static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} -#endif - -void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, - bool sync) -{ - unsigned long flags; - - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, - sync); - blkio_end_empty_time(&blkg->stats); - blkio_set_start_group_wait_time(blkg, curr_blkg); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + blkg = blkg_create(pos, q, NULL); + if (pos == blkcg || IS_ERR(blkg)) + return blkg; + } } -EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); +EXPORT_SYMBOL_GPL(blkg_lookup_create); -void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) +static void blkg_destroy(struct blkcg_gq *blkg) { - unsigned long flags; + struct blkcg *blkcg = blkg->blkcg; + int i; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + lockdep_assert_held(blkg->q->queue_lock); + 
lockdep_assert_held(&blkcg->lock); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) -{ - unsigned long flags; + /* Something wrong if we are trying to remove same group twice */ + WARN_ON_ONCE(list_empty(&blkg->q_node)); + WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); - spin_lock_irqsave(&blkg->stats_lock, flags); - blkg->stats.time += time; - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; -void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) -{ - struct blkio_group_stats *stats; - unsigned long flags; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - stats->sectors += bytes >> 9; - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, - sync); - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + if (blkg->pd[i] && pol->pd_offline_fn) + pol->pd_offline_fn(blkg); + } + blkg->online = false; -void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) -{ - struct blkio_group_stats *stats; - unsigned long flags; - unsigned long long now = sched_clock(); - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - if (time_after64(now, io_start_time)) - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], - now - io_start_time, direction, sync); - if (time_after64(io_start_time, start_time)) - blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], - io_start_time - start_time, direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); + 
list_del_init(&blkg->q_node); + hlist_del_init_rcu(&blkg->blkcg_node); -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync) -{ - unsigned long flags; + /* + * Both setting lookup hint to and clearing it from @blkg are done + * under queue_lock. If it's not pointing to @blkg now, it never + * will. Hint assignment itself can race safely. + */ + if (rcu_access_pointer(blkcg->blkg_hint) == blkg) + rcu_assign_pointer(blkcg->blkg_hint, NULL); - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, - sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); + /* + * If root blkg is destroyed, just clear the pointer since root_rl + * does not take a reference on root blkg. + */ + if (blkcg == &blkcg_root) { + blkg->q->root_blkg = NULL; + blkg->q->root_rl.blkg = NULL; + } -void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) -{ - unsigned long flags; - - spin_lock_irqsave(&blkcg->lock, flags); - spin_lock_init(&blkg->stats_lock); - rcu_assign_pointer(blkg->key, key); - blkg->blkcg_id = css_id(&blkcg->css); - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - blkg->plid = plid; - spin_unlock_irqrestore(&blkcg->lock, flags); - /* Need to take css reference ? */ - cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); - blkg->dev = dev; + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + blkg_put(blkg); } -EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); -static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +/** + * blkg_destroy_all - destroy all blkgs associated with a request_queue + * @q: request_queue of interest + * + * Destroy all blkgs associated with @q.
+ */ +static void blkg_destroy_all(struct request_queue *q) { - hlist_del_init_rcu(&blkg->blkcg_node); - blkg->blkcg_id = 0; + struct blkcg_gq *blkg, *n; + + lockdep_assert_held(q->queue_lock); + + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); + blkg_destroy(blkg); + spin_unlock(&blkcg->lock); + } } /* - * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 - * indicating that blk_group was unhashed by the time we got to it. + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. */ -int blkiocg_del_blkio_group(struct blkio_group *blkg) +void __blkg_release_rcu(struct rcu_head *rcu_head) { - struct blkio_cgroup *blkcg; - unsigned long flags; - struct cgroup_subsys_state *css; - int ret = 1; + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); + int i; - rcu_read_lock(); - css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (css) { - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; - } - spin_unlock_irqrestore(&blkcg->lock, flags); + /* tell policies that this one is being freed */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_exit_fn) + pol->pd_exit_fn(blkg); } - rcu_read_unlock(); - return ret; + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + blkg_free(blkg); } -EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); 
+EXPORT_SYMBOL_GPL(__blkg_release_rcu); -/* called under rcu_read_lock(). */ -struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +/* + * The next function used by blk_queue_for_each_rl(). It's a bit tricky + * because the root blkg uses @q->root_rl instead of its own rl. + */ +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q) { - struct blkio_group *blkg; - struct hlist_node *n; - void *__key; + struct list_head *ent; + struct blkcg_gq *blkg; - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - __key = blkg->key; - if (__key == key) - return blkg; + /* + * Determine the current blkg list_head. The first entry is + * root_rl which is off @q->blkg_list and mapped to the head. + */ + if (rl == &q->root_rl) { + ent = &q->blkg_list; + /* There are no more block groups, hence no request lists */ + if (list_empty(ent)) + return NULL; + } else { + blkg = container_of(rl, struct blkcg_gq, rl); + ent = &blkg->q_node; } - return NULL; + /* walk to the next list_head, skip root blkcg */ + ent = ent->next; + if (ent == &q->root_blkg->q_node) + ent = ent->next; + if (ent == &q->blkg_list) + return NULL; + + blkg = container_of(ent, struct blkcg_gq, q_node); + return &blkg->rl; } -EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -static int -blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct blkio_group_stats *stats; - struct hlist_node *n; - uint64_t queued[BLKIO_STAT_TOTAL]; + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; int i; -#ifdef CONFIG_DEBUG_BLK_CGROUP - bool idling, waiting, empty; - unsigned long long now = sched_clock(); -#endif - blkcg = cgroup_to_blkio_cgroup(cgroup); + /* + * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex + * which ends up putting cgroup's 
internal cgroup_tree_mutex under + * it; however, cgroup_tree_mutex is nested above cgroup file + * active protection and grabbing blkcg_pol_mutex from a cgroup + * file operation creates a possible circular dependency. cgroup + * internal locking is planned to go through further simplification + * and this issue should go away soon. For now, let's trylock + * blkcg_pol_mutex and restart the write on failure. + * + * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com + */ + if (!mutex_trylock(&blkcg_pol_mutex)) + return restart_syscall(); spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkg->stats_lock); - stats = &blkg->stats; -#ifdef CONFIG_DEBUG_BLK_CGROUP - idling = blkio_blkg_idling(stats); - waiting = blkio_blkg_waiting(stats); - empty = blkio_blkg_empty(stats); -#endif - for (i = 0; i < BLKIO_STAT_TOTAL; i++) - queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; - memset(stats, 0, sizeof(struct blkio_group_stats)); - for (i = 0; i < BLKIO_STAT_TOTAL; i++) - stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (idling) { - blkio_mark_blkg_idling(stats); - stats->start_idle_time = now; - } - if (waiting) { - blkio_mark_blkg_waiting(stats); - stats->start_group_wait_time = now; - } - if (empty) { - blkio_mark_blkg_empty(stats); - stats->start_empty_time = now; + + /* + * Note that stat reset is racy - it doesn't synchronize against + * stat updates. This is a debug feature which shouldn't exist + * anyway. If you get hit by a race, retry. 
+ */ + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkcg_policy_enabled(blkg->q, pol) && + pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg); } -#endif - spin_unlock(&blkg->stats_lock); } + spin_unlock_irq(&blkcg->lock); + mutex_unlock(&blkcg_pol_mutex); return 0; } -static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, - int chars_left, bool diskname_only) +static const char *blkg_dev_name(struct blkcg_gq *blkg) { - snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); - chars_left -= strlen(str); - if (chars_left <= 0) { - printk(KERN_WARNING - "Possibly incorrect cgroup stat display format"); - return; - } - if (diskname_only) - return; - switch (type) { - case BLKIO_STAT_READ: - strlcat(str, " Read", chars_left); - break; - case BLKIO_STAT_WRITE: - strlcat(str, " Write", chars_left); - break; - case BLKIO_STAT_SYNC: - strlcat(str, " Sync", chars_left); - break; - case BLKIO_STAT_ASYNC: - strlcat(str, " Async", chars_left); - break; - case BLKIO_STAT_TOTAL: - strlcat(str, " Total", chars_left); - break; - default: - strlcat(str, " Invalid", chars_left); - } + /* some drivers (floppy) instantiate a queue w/o disk registered */ + if (blkg->q->backing_dev_info.dev) + return dev_name(blkg->q->backing_dev_info.dev); + return NULL; } -static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, - struct cgroup_map_cb *cb, dev_t dev) +/** + * blkcg_print_blkgs - helper for printing per-blkg data + * @sf: seq_file to print to + * @blkcg: blkcg of interest + * @prfill: fill function to print out a blkg + * @pol: policy in question + * @data: data to be passed to @prfill + * @show_total: to print out sum of prfill return values or not + * + * This function invokes @prfill on each blkg of @blkcg if pd for the + * policy specified by @pol exists. 
@prfill is invoked with @sf, the + * policy data and @data and the matching queue lock held. If @show_total + * is %true, the sum of the return values from @prfill is printed with + * "Total" label at the end. + * + * This is to be used to construct print functions for + * cftype->read_seq_string method. + */ +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total) { - blkio_get_key_name(0, dev, str, chars_left, true); - cb->fill(cb, str, val); - return val; -} + struct blkcg_gq *blkg; + u64 total = 0; -/* This should be called with blkg->stats_lock held */ -static uint64_t blkio_get_stat(struct blkio_group *blkg, - struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) -{ - uint64_t disk_total; - char key_str[MAX_KEY_LEN]; - enum stat_sub_type sub_type; - - if (type == BLKIO_STAT_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.time, cb, dev); - if (type == BLKIO_STAT_SECTORS) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.sectors, cb, dev); -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { - uint64_t sum = blkg->stats.avg_queue_size_sum; - uint64_t samples = blkg->stats.avg_queue_size_samples; - if (samples) - do_div(sum, samples); - else - sum = 0; - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); - } - if (type == BLKIO_STAT_GROUP_WAIT_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.group_wait_time, cb, dev); - if (type == BLKIO_STAT_IDLE_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.idle_time, cb, dev); - if (type == BLKIO_STAT_EMPTY_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.empty_time, cb, dev); - if (type == BLKIO_STAT_DEQUEUE) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.dequeue, cb, dev); -#endif - - for (sub_type = BLKIO_STAT_READ; 
sub_type < BLKIO_STAT_TOTAL; - sub_type++) { - blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); - cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(blkg->q->queue_lock); + if (blkcg_policy_enabled(blkg->q, pol)) + total += prfill(sf, blkg->pd[pol->plid], data); + spin_unlock_irq(blkg->q->queue_lock); } - disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + - blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; - blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); - cb->fill(cb, key_str, disk_total); - return disk_total; + rcu_read_unlock(); + + if (show_total) + seq_printf(sf, "Total %llu\n", (unsigned long long)total); } +EXPORT_SYMBOL_GPL(blkcg_print_blkgs); -static int blkio_check_dev_num(dev_t dev) +/** + * __blkg_prfill_u64 - prfill helper for a single u64 value + * @sf: seq_file to print to + * @pd: policy private data of interest + * @v: value to print + * + * Print @v to @sf for the device associated with @pd. + */ +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { - int part = 0; - struct gendisk *disk; + const char *dname = blkg_dev_name(pd->blkg); - disk = get_gendisk(dev, &part); - if (!disk || part) - return -ENODEV; + if (!dname) + return 0; - return 0; + seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); + return v; } +EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device associated with @pd. 
+ */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat) { - char *s[4], *p, *major_s = NULL, *minor_s = NULL; - int ret; - unsigned long major, minor, temp; - int i = 0; - dev_t dev; - u64 bps, iops; - - memset(s, 0, sizeof(s)); - - while ((p = strsep(&buf, " ")) != NULL) { - if (!*p) - continue; - - s[i++] = p; - - /* Prevent from inputing too many things */ - if (i == 3) - break; - } + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; - if (i != 2) - return -EINVAL; + if (!dname) + return 0; - p = strsep(&s[0], ":"); - if (p != NULL) - major_s = p; - else - return -EINVAL; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + (unsigned long long)rwstat->cnt[i]); - minor_s = s[0]; - if (!minor_s) - return -EINVAL; + v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); - ret = strict_strtoul(major_s, 10, &major); - if (ret) - return -EINVAL; +/** + * blkg_prfill_stat - prfill callback for blkg_stat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * prfill callback for printing a blkg_stat. + */ +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) +{ + return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); +} +EXPORT_SYMBOL_GPL(blkg_prfill_stat); - ret = strict_strtoul(minor_s, 10, &minor); - if (ret) - return -EINVAL; +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. 
+ */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); - dev = MKDEV(major, minor); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); - ret = blkio_check_dev_num(dev); - if (ret) - return ret; +/** + * blkg_stat_recursive_sum - collect hierarchical blkg_stat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_stat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; - newpn->dev = dev; + lockdep_assert_held(pd->blkg->q->queue_lock); - if (s[1] == NULL) - return -EINVAL; + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_stat *stat = (void *)pos_pd + off; - switch (plid) { - case BLKIO_POLICY_PROP: - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; - - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.weight = temp; - break; - case BLKIO_POLICY_THROTL: - switch(fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - ret = strict_strtoull(s[1], 10, &bps); - if (ret) - return -EINVAL; - - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.bps = bps; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - ret = strict_strtoull(s[1], 10, &iops); - if (ret) - return -EINVAL; - - if (iops > THROTL_IOPS_MAX) - return -EINVAL; - - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.iops = 
(unsigned int)iops; - break; - } - break; - default: - BUG(); + if (pos_blkg->online) + sum += blkg_stat_read(stat); } + rcu_read_unlock(); - return 0; + return sum; } +EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); -unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, - dev_t dev) +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * Collect the blkg_rwstat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off) { - struct blkio_policy_node *pn; + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + struct blkg_rwstat sum = { }; + int i; - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, - BLKIO_PROP_weight_device); - if (pn) - return pn->val.weight; - else - return blkcg->weight; -} -EXPORT_SYMBOL_GPL(blkcg_get_weight); + lockdep_assert_held(pd->blkg->q->queue_lock); -uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_rwstat *rwstat = (void *)pos_pd + off; + struct blkg_rwstat tmp; - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device); - if (pn) - return pn->val.bps; - else - return -1; -} + if (!pos_blkg->online) + continue; -uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_bps_device); - if (pn) - return pn->val.bps; - else - return -1; -} + tmp = blkg_rwstat_read(rwstat); -unsigned int 
blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum.cnt[i] += tmp.cnt[i]; + } + rcu_read_unlock(); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device); - if (pn) - return pn->val.iops; - else - return -1; + return sum; } - -unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); + +/** + * blkg_conf_prep - parse and prepare for per-blkg config update + * @blkcg: target block cgroup + * @pol: target policy + * @input: input string + * @ctx: blkg_conf_ctx to be filled + * + * Parse per-blkg config update from @input and initialize @ctx with the + * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new + * value. This function returns with RCU read lock and queue lock held and + * must be paired with blkg_conf_finish(). + */ +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx) + __acquires(rcu) __acquires(disk->queue->queue_lock) { - struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device); - if (pn) - return pn->val.iops; + struct gendisk *disk; + struct blkcg_gq *blkg; + unsigned int major, minor; + unsigned long long v; + int part, ret; + + if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + return -EINVAL; + + disk = get_gendisk(MKDEV(major, minor), &part); + if (!disk || part) + return -EINVAL; + + rcu_read_lock(); + spin_lock_irq(disk->queue->queue_lock); + + if (blkcg_policy_enabled(disk->queue, pol)) + blkg = blkg_lookup_create(blkcg, disk->queue); else - return -1; -} + blkg = ERR_PTR(-EINVAL); -/* Checks whether user asked for deleting a policy rule */ -static bool blkio_delete_rule_command(struct blkio_policy_node *pn) -{ - switch(pn->plid) { - case BLKIO_POLICY_PROP: - if (pn->val.weight == 0) - 
return 1; - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - if (pn->val.bps == 0) - return 1; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - if (pn->val.iops == 0) - return 1; + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + rcu_read_unlock(); + spin_unlock_irq(disk->queue->queue_lock); + put_disk(disk); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue + * can be bypassing for some time and it's always nice to + * avoid busy looping. + */ + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); } - break; - default: - BUG(); + return ret; } + ctx->disk = disk; + ctx->blkg = blkg; + ctx->v = v; return 0; } +EXPORT_SYMBOL_GPL(blkg_conf_prep); -static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, - struct blkio_policy_node *newpn) -{ - switch(oldpn->plid) { - case BLKIO_POLICY_PROP: - oldpn->val.weight = newpn->val.weight; - break; - case BLKIO_POLICY_THROTL: - switch(newpn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - oldpn->val.bps = newpn->val.bps; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - oldpn->val.iops = newpn->val.iops; - } - break; - default: - BUG(); - } -} - -/* - * Some rules/values in blkg have changed. Propogate those to respective - * policies. +/** + * blkg_conf_finish - finish up per-blkg config update + * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() + * + * Finish up after per-blkg config update. This function must be paired + * with blkg_conf_prep(). 
*/ -static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, struct blkio_policy_node *pn) +void blkg_conf_finish(struct blkg_conf_ctx *ctx) + __releases(ctx->disk->queue->queue_lock) __releases(rcu) { - unsigned int weight, iops; - u64 bps; - - switch(pn->plid) { - case BLKIO_POLICY_PROP: - weight = pn->val.weight ? pn->val.weight : - blkcg->weight; - blkio_update_group_weight(blkg, weight); - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - bps = pn->val.bps ? pn->val.bps : (-1); - blkio_update_group_bps(blkg, bps, pn->fileid); - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - iops = pn->val.iops ? pn->val.iops : (-1); - blkio_update_group_iops(blkg, iops, pn->fileid); - break; - } - break; - default: - BUG(); - } + spin_unlock_irq(ctx->disk->queue->queue_lock); + rcu_read_unlock(); + put_disk(ctx->disk); } +EXPORT_SYMBOL_GPL(blkg_conf_finish); -/* - * A policy node rule has been updated. Propogate this update to all the - * block groups which might be affected by this update. +struct cftype blkcg_files[] = { + { + .name = "reset_stats", + .write_u64 = blkcg_reset_stats, + }, + { } /* terminate */ +}; + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be + * removed while holding both q and blkcg locks. As blkcg lock is nested + * inside q lock, this function performs reverse double lock dancing. + * + * This is the blkcg counterpart of ioc_release_fn(). 
*/ -static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkio_group *blkg; - struct hlist_node *n; + struct blkcg *blkcg = css_to_blkcg(css); - spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (pn->dev != blkg->dev || pn->plid != blkg->plid) - continue; - blkio_update_blkg_policy(blkcg, blkg, pn); + while (!hlist_empty(&blkcg->blkg_list)) { + struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, + struct blkcg_gq, blkcg_node); + struct request_queue *q = blkg->q; + + if (spin_trylock(q->queue_lock)) { + blkg_destroy(blkg); + spin_unlock(q->queue_lock); + } else { + spin_unlock_irq(&blkcg->lock); + cpu_relax(); + spin_lock_irq(&blkcg->lock); + } } spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); } -static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - int ret = 0; - char *buf; - struct blkio_policy_node *newpn, *pn; - struct blkio_cgroup *blkcg; - int keep_newpn = 0; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - buf = kstrdup(buffer, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); - if (!newpn) { - ret = -ENOMEM; - goto free_buf; - } - - ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); - if (ret) - goto free_newpn; - - blkcg = cgroup_to_blkio_cgroup(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); - spin_lock_irq(&blkcg->lock); + if (blkcg != &blkcg_root) + kfree(blkcg); +} - pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); - if (!pn) { - if (!blkio_delete_rule_command(newpn)) { - blkio_policy_insert_node(blkcg, newpn); - keep_newpn = 1; - } - spin_unlock_irq(&blkcg->lock); - goto update_io_group; - } +static 
struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) +{ + static atomic64_t id_seq = ATOMIC64_INIT(0); + struct blkcg *blkcg; - if (blkio_delete_rule_command(newpn)) { - blkio_policy_delete_node(pn); - spin_unlock_irq(&blkcg->lock); - goto update_io_group; + if (!parent_css) { + blkcg = &blkcg_root; + goto done; } - spin_unlock_irq(&blkcg->lock); - blkio_update_policy_rule(pn, newpn); + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); -update_io_group: - blkio_update_policy_node_blkg(blkcg, newpn); + blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; + blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; + blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ +done: + spin_lock_init(&blkcg->lock); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_HLIST_HEAD(&blkcg->blkg_list); -free_newpn: - if (!keep_newpn) - kfree(newpn); -free_buf: - kfree(buf); - return ret; + return &blkcg->css; } -static void -blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) +/** + * blkcg_init_queue - initialize blkcg part of request queue + * @q: request_queue to initialize + * + * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * part of new request_queue @q. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int blkcg_init_queue(struct request_queue *q) { - switch(pn->plid) { - case BLKIO_POLICY_PROP: - if (pn->fileid == BLKIO_PROP_weight_device) - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.weight); - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.bps); - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.iops); - break; - } - break; - default: - BUG(); - } -} + might_sleep(); -/* cgroup files which read their data from policy nodes end up here */ -static void blkio_read_policy_node_files(struct cftype *cft, - struct blkio_cgroup *blkcg, struct seq_file *m) -{ - struct blkio_policy_node *pn; - - if (!list_empty(&blkcg->policy_list)) { - spin_lock_irq(&blkcg->lock); - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (!pn_matches_cftype(cft, pn)) - continue; - blkio_print_policy_node(m, pn); - } - spin_unlock_irq(&blkcg->lock); - } + return blk_throtl_init(q); } -static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) +/** + * blkcg_drain_queue - drain blkcg part of request_queue + * @q: request_queue to drain + * + * Called from blk_drain_queue(). Responsible for draining blkcg part. 
+ */ +void blkcg_drain_queue(struct request_queue *q) { - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_weight_device: - blkio_read_policy_node_files(cft, blkcg, m); - return 0; - default: - BUG(); - } - break; - case BLKIO_POLICY_THROTL: - switch(name){ - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - blkio_read_policy_node_files(cft, blkcg, m); - return 0; - default: - BUG(); - } - break; - default: - BUG(); - } + lockdep_assert_held(q->queue_lock); - return 0; + /* + * @q could be exiting and already have destroyed all blkgs as + * indicated by NULL root_blkg. If so, don't confuse policies. + */ + if (!q->root_blkg) + return; + + blk_throtl_drain(q); } -static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, - struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, - bool show_total) +/** + * blkcg_exit_queue - exit and release blkcg part of request_queue + * @q: request_queue being released + * + * Called from blk_release_queue(). Responsible for exiting blkcg part. 
+ */ +void blkcg_exit_queue(struct request_queue *q) { - struct blkio_group *blkg; - struct hlist_node *n; - uint64_t cgroup_total = 0; + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); - rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (blkg->dev) { - if (!cftype_blkg_same_policy(cft, blkg)) - continue; - spin_lock_irq(&blkg->stats_lock); - cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, - type); - spin_unlock_irq(&blkg->stats_lock); - } - } - if (show_total) - cb->fill(cb, "Total", cgroup_total); - rcu_read_unlock(); - return 0; + blk_throtl_exit(q); } -/* All map kind of cgroup file get serviced by this function */ -static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +/* + * We cannot support shared io contexts, as we have no means to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. 
+ */ +static int blkcg_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_TIME, 0); - case BLKIO_PROP_sectors: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SECTORS, 0); - case BLKIO_PROP_io_service_bytes: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); - case BLKIO_PROP_io_serviced: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); - case BLKIO_PROP_io_service_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_TIME, 1); - case BLKIO_PROP_io_wait_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_WAIT_TIME, 1); - case BLKIO_PROP_io_merged: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_MERGED, 1); - case BLKIO_PROP_io_queued: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_QUEUED, 1); -#ifdef CONFIG_DEBUG_BLK_CGROUP - case BLKIO_PROP_dequeue: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_DEQUEUE, 0); - case BLKIO_PROP_avg_queue_size: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_AVG_QUEUE_SIZE, 0); - case BLKIO_PROP_group_wait_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_GROUP_WAIT_TIME, 0); - case BLKIO_PROP_idle_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_IDLE_TIME, 0); - case BLKIO_PROP_empty_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_EMPTY_TIME, 0); -#endif - default: - BUG(); - } - break; - case BLKIO_POLICY_THROTL: - switch(name){ - case BLKIO_THROTL_io_service_bytes: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); - case BLKIO_THROTL_io_serviced: - return 
blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); - default: - BUG(); - } - break; - default: - BUG(); - } + struct task_struct *task; + struct io_context *ioc; + int ret = 0; - return 0; + /* task_lock() is needed to avoid races with exit_io_context() */ + cgroup_taskset_for_each(task, tset) { + task_lock(task); + ioc = task->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(task); + if (ret) + break; + } + return ret; } -static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) -{ - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_node *pn; +struct cgroup_subsys blkio_cgrp_subsys = { + .css_alloc = blkcg_css_alloc, + .css_offline = blkcg_css_offline, + .css_free = blkcg_css_free, + .can_attach = blkcg_can_attach, + .base_cftypes = blkcg_files, +}; +EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) - return -EINVAL; +/** + * blkcg_activate_policy - activate a blkcg policy on a request_queue + * @q: request_queue of interest + * @pol: blkcg policy to activate + * + * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * bypass mode to populate its blkgs with policy_data for @pol. + * + * Activation happens with @q bypassed, so nobody would be accessing blkgs + * from IO path. Update of each blkg is protected by both queue and blkcg + * locks so that holding either lock and testing blkcg_policy_enabled() is + * always enough for dereferencing policy data. + * + * The caller is responsible for synchronizing [de]activations and policy + * [un]registrations. Returns 0 on success, -errno on failure. 
+ */ +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) +{ + LIST_HEAD(pds); + struct blkcg_gq *blkg, *new_blkg; + struct blkg_policy_data *pd, *n; + int cnt = 0, ret; + bool preloaded; + + if (blkcg_policy_enabled(q, pol)) + return 0; + + /* preallocations for root blkg */ + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; + blk_queue_bypass_start(q); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev, - BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); - if (pn) - continue; + preloaded = !radix_tree_preload(GFP_KERNEL); - blkio_update_group_weight(blkg, blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code it. 
+ */ + spin_lock_irq(q->queue_lock); -static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); + rcu_read_lock(); + blkg = __blkg_lookup(&blkcg_root, q, false); + if (blkg) + blkg_free(new_blkg); + else + blkg = blkg_create(&blkcg_root, q, new_blkg); + rcu_read_unlock(); - blkcg = cgroup_to_blkio_cgroup(cgrp); + if (preloaded) + radix_tree_preload_end(); - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_weight: - return (u64)blkcg->weight; - } - break; - default: - BUG(); + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto out_unlock; } - return 0; -} -static int -blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) -{ - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); + list_for_each_entry(blkg, &q->blkg_list, q_node) + cnt++; - blkcg = cgroup_to_blkio_cgroup(cgrp); + spin_unlock_irq(q->queue_lock); - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_weight: - return blkio_weight_write(blkcg, val); + /* allocate policy_data for all existing blkgs */ + while (cnt--) { + pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + if (!pd) { + ret = -ENOMEM; + goto out_free; } - break; - default: - BUG(); + list_add_tail(&pd->alloc_node, &pds); } - return 0; -} - -struct cftype blkio_files[] = { - { - .name = "weight_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_weight_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - { - .name = "weight", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_weight), - .read_u64 = blkiocg_file_read_u64, - .write_u64 = blkiocg_file_write_u64, - }, - { - .name = "time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - 
BLKIO_PROP_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "sectors", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_sectors), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_serviced), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_service_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_service_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_wait_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_wait_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_merged", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_merged), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_queued", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_queued), - .read_map = blkiocg_file_read_map, - }, - { - .name = "reset_stats", - .write_u64 = blkiocg_reset_stats, - }, -#ifdef CONFIG_BLK_DEV_THROTTLING - { - .name = "throttle.read_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, + /* + * Install the allocated pds. With @q bypassing, no new blkg + * should have been created while the queue lock was dropped. + */ + spin_lock_irq(q->queue_lock); - { - .name = "throttle.write_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, + list_for_each_entry(blkg, &q->blkg_list, q_node) { + if (WARN_ON(list_empty(&pds))) { + /* umm... 
this shouldn't happen, just abort */ + ret = -ENOMEM; + goto out_unlock; + } + pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); + list_del_init(&pd->alloc_node); - { - .name = "throttle.read_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, + /* grab blkcg lock too while installing @pd on @blkg */ + spin_lock(&blkg->blkcg->lock); - { - .name = "throttle.write_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - { - .name = "throttle.io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, - { - .name = "throttle.io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_serviced), - .read_map = blkiocg_file_read_map, - }, -#endif /* CONFIG_BLK_DEV_THROTTLING */ + blkg->pd[pol->plid] = pd; + pd->blkg = blkg; + pd->plid = pol->plid; + pol->pd_init_fn(blkg); -#ifdef CONFIG_DEBUG_BLK_CGROUP - { - .name = "avg_queue_size", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_avg_queue_size), - .read_map = blkiocg_file_read_map, - }, - { - .name = "group_wait_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_group_wait_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "idle_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_idle_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "empty_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_empty_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "dequeue", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_dequeue), - .read_map = blkiocg_file_read_map, - }, -#endif -}; + 
spin_unlock(&blkg->blkcg->lock); + } -static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) -{ - return cgroup_add_files(cgroup, subsys, blkio_files, - ARRAY_SIZE(blkio_files)); + __set_bit(pol->plid, q->blkcg_pols); + ret = 0; +out_unlock: + spin_unlock_irq(q->queue_lock); +out_free: + blk_queue_bypass_end(q); + list_for_each_entry_safe(pd, n, &pds, alloc_node) + kfree(pd); + return ret; } +EXPORT_SYMBOL_GPL(blkcg_activate_policy); -static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +/** + * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue + * @q: request_queue of interest + * @pol: blkcg policy to deactivate + * + * Deactivate @pol on @q. Follows the same synchronization rules as + * blkcg_activate_policy(). + */ +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - unsigned long flags; - struct blkio_group *blkg; - void *key; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn, *pntmp; + struct blkcg_gq *blkg; - rcu_read_lock(); - do { - spin_lock_irqsave(&blkcg->lock, flags); + if (!blkcg_policy_enabled(q, pol)) + return; - if (hlist_empty(&blkcg->blkg_list)) { - spin_unlock_irqrestore(&blkcg->lock, flags); - break; - } + blk_queue_bypass_start(q); + spin_lock_irq(q->queue_lock); - blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, - blkcg_node); - key = rcu_dereference(blkg->key); - __blkiocg_del_blkio_group(blkg); + __clear_bit(pol->plid, q->blkcg_pols); - spin_unlock_irqrestore(&blkcg->lock, flags); + /* if no policy is left, no need for blkgs - shoot them down */ + if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) + blkg_destroy_all(q); - /* - * This blkio_group is being unlinked as associated cgroup is - * going away. Let all the IO controlling policies know about - * this event. 
- */ - spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) { - if (blkiop->plid != blkg->plid) - continue; - blkiop->ops.blkio_unlink_group_fn(key, blkg); - } - spin_unlock(&blkio_list_lock); - } while (1); + list_for_each_entry(blkg, &q->blkg_list, q_node) { + /* grab blkcg lock too while removing @pd from @blkg */ + spin_lock(&blkg->blkcg->lock); - list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { - blkio_policy_delete_node(pn); - kfree(pn); - } + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg); + if (pol->pd_exit_fn) + pol->pd_exit_fn(blkg); - free_css_id(&blkio_subsys, &blkcg->css); - rcu_read_unlock(); - if (blkcg != &blkio_root_cgroup) - kfree(blkcg); -} + kfree(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; -static struct cgroup_subsys_state * -blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) -{ - struct blkio_cgroup *blkcg; - struct cgroup *parent = cgroup->parent; - - if (!parent) { - blkcg = &blkio_root_cgroup; - goto done; + spin_unlock(&blkg->blkcg->lock); } - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); - - blkcg->weight = BLKIO_WEIGHT_DEFAULT; -done: - spin_lock_init(&blkcg->lock); - INIT_HLIST_HEAD(&blkcg->blkg_list); - - INIT_LIST_HEAD(&blkcg->policy_list); - return &blkcg->css; + spin_unlock_irq(q->queue_lock); + blk_queue_bypass_end(q); } +EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic data structures. For now we allow a task to change - * its cgroup only if it's the only owner of its ioc. +/** + * blkcg_policy_register - register a blkcg policy + * @pol: blkcg policy to register + * + * Register @pol with blkcg core. Might sleep and @pol may be modified on + * successful registration. Returns 0 on success and -errno on failure. 
*/ -static int blkiocg_can_attach(struct cgroup_subsys *subsys, - struct cgroup *cgroup, struct task_struct *tsk, - bool threadgroup) +int blkcg_policy_register(struct blkcg_policy *pol) { - struct io_context *ioc; - int ret = 0; + int i, ret; - /* task_lock() is needed to avoid races with exit_io_context() */ - task_lock(tsk); - ioc = tsk->io_context; - if (ioc && atomic_read(&ioc->nr_tasks) > 1) - ret = -EINVAL; - task_unlock(tsk); - - return ret; -} + if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) + return -EINVAL; -static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk, - bool threadgroup) -{ - struct io_context *ioc; + mutex_lock(&blkcg_pol_mutex); - task_lock(tsk); - ioc = tsk->io_context; - if (ioc) - ioc->cgroup_changed = 1; - task_unlock(tsk); + /* find an empty slot */ + ret = -ENOSPC; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (!blkcg_policy[i]) + break; + if (i >= BLKCG_MAX_POLS) + goto out_unlock; + + /* register and update blkgs */ + pol->plid = i; + blkcg_policy[i] = pol; + + /* everything is in place, add intf files for the new policy */ + if (pol->cftypes) + WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); + ret = 0; +out_unlock: + mutex_unlock(&blkcg_pol_mutex); + return ret; } +EXPORT_SYMBOL_GPL(blkcg_policy_register); -void blkio_policy_register(struct blkio_policy_type *blkiop) +/** + * blkcg_policy_unregister - unregister a blkcg policy + * @pol: blkcg policy to unregister + * + * Undo blkcg_policy_register(@pol). Might sleep. 
+ */ +void blkcg_policy_unregister(struct blkcg_policy *pol) { - spin_lock(&blkio_list_lock); - list_add_tail(&blkiop->list, &blkio_list); - spin_unlock(&blkio_list_lock); -} -EXPORT_SYMBOL_GPL(blkio_policy_register); + mutex_lock(&blkcg_pol_mutex); -void blkio_policy_unregister(struct blkio_policy_type *blkiop) -{ - spin_lock(&blkio_list_lock); - list_del_init(&blkiop->list); - spin_unlock(&blkio_list_lock); -} -EXPORT_SYMBOL_GPL(blkio_policy_unregister); + if (WARN_ON(blkcg_policy[pol->plid] != pol)) + goto out_unlock; -static int __init init_cgroup_blkio(void) -{ - return cgroup_load_subsys(&blkio_subsys); -} + /* kill the intf files first */ + if (pol->cftypes) + cgroup_rm_cftypes(pol->cftypes); -static void __exit exit_cgroup_blkio(void) -{ - cgroup_unload_subsys(&blkio_subsys); + /* unregister and update blkgs */ + blkcg_policy[pol->plid] = NULL; +out_unlock: + mutex_unlock(&blkcg_pol_mutex); } - -module_init(init_cgroup_blkio); -module_exit(exit_cgroup_blkio); -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL_GPL(blkcg_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index ea4861bdd54..d3fd7aa3d2a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -14,322 +14,593 @@ */ #include <linux/cgroup.h> - -enum blkio_policy_id { - BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ - BLKIO_POLICY_THROTL, /* Throttling */ -}; +#include <linux/u64_stats_sync.h> +#include <linux/seq_file.h> +#include <linux/radix-tree.h> +#include <linux/blkdev.h> +#include <linux/atomic.h> /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) - -#ifndef CONFIG_BLK_CGROUP -/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ -extern struct cgroup_subsys blkio_subsys; -#define blkio_subsys_id blkio_subsys.subsys_id -#endif - -enum stat_type { - /* Total time spent (in ns) between request dispatch to the driver and - * request completion for IOs 
doen by this cgroup. This may not be - * accurate when NCQ is turned on. */ - BLKIO_STAT_SERVICE_TIME = 0, - /* Total bytes transferred */ - BLKIO_STAT_SERVICE_BYTES, - /* Total IOs serviced, post merge */ - BLKIO_STAT_SERVICED, - /* Total time spent waiting in scheduler queue in ns */ - BLKIO_STAT_WAIT_TIME, - /* Number of IOs merged */ - BLKIO_STAT_MERGED, - /* Number of IOs queued up */ - BLKIO_STAT_QUEUED, - /* All the single valued stats go below this */ - BLKIO_STAT_TIME, - BLKIO_STAT_SECTORS, -#ifdef CONFIG_DEBUG_BLK_CGROUP - BLKIO_STAT_AVG_QUEUE_SIZE, - BLKIO_STAT_IDLE_TIME, - BLKIO_STAT_EMPTY_TIME, - BLKIO_STAT_GROUP_WAIT_TIME, - BLKIO_STAT_DEQUEUE -#endif -}; +/* CFQ specific, out here for blkcg->cfq_weight */ +#define CFQ_WEIGHT_MIN 10 +#define CFQ_WEIGHT_MAX 1000 +#define CFQ_WEIGHT_DEFAULT 500 -enum stat_sub_type { - BLKIO_STAT_READ = 0, - BLKIO_STAT_WRITE, - BLKIO_STAT_SYNC, - BLKIO_STAT_ASYNC, - BLKIO_STAT_TOTAL -}; +#ifdef CONFIG_BLK_CGROUP -/* blkg state flags */ -enum blkg_state_flags { - BLKG_waiting = 0, - BLKG_idling, - BLKG_empty, -}; +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, -/* cgroup files owned by proportional weight policy */ -enum blkcg_file_name_prop { - BLKIO_PROP_weight = 1, - BLKIO_PROP_weight_device, - BLKIO_PROP_io_service_bytes, - BLKIO_PROP_io_serviced, - BLKIO_PROP_time, - BLKIO_PROP_sectors, - BLKIO_PROP_io_service_time, - BLKIO_PROP_io_wait_time, - BLKIO_PROP_io_merged, - BLKIO_PROP_io_queued, - BLKIO_PROP_avg_queue_size, - BLKIO_PROP_group_wait_time, - BLKIO_PROP_idle_time, - BLKIO_PROP_empty_time, - BLKIO_PROP_dequeue, + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, }; -/* cgroup files owned by throttle policy */ -enum blkcg_file_name_throtl { - BLKIO_THROTL_read_bps_device, - BLKIO_THROTL_write_bps_device, - BLKIO_THROTL_read_iops_device, - BLKIO_THROTL_write_iops_device, - BLKIO_THROTL_io_service_bytes, - BLKIO_THROTL_io_serviced, -}; +struct 
blkcg_gq; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; -struct blkio_cgroup { - struct cgroup_subsys_state css; - unsigned int weight; - spinlock_t lock; - struct hlist_head blkg_list; - struct list_head policy_list; /* list of blkio_policy_node */ + /* for policies to test whether associated blkcg has changed */ + uint64_t id; + + /* TODO: per-policy storage in blkcg */ + unsigned int cfq_weight; /* belongs to cfq */ + unsigned int cfq_leaf_weight; }; -struct blkio_group_stats { - /* total disk time and nr sectors dispatched by this group */ - uint64_t time; - uint64_t sectors; - uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; -#ifdef CONFIG_DEBUG_BLK_CGROUP - /* Sum of number of IOs queued across all samples */ - uint64_t avg_queue_size_sum; - /* Count of samples taken for average */ - uint64_t avg_queue_size_samples; - /* How many times this group has been removed from service tree */ - unsigned long dequeue; - - /* Total time spent waiting for it to be assigned a timeslice. */ - uint64_t group_wait_time; - uint64_t start_group_wait_time; - - /* Time spent idling for this blkio_group */ - uint64_t idle_time; - uint64_t start_idle_time; - /* - * Total time when we have requests queued and do not contain the - * current active queue. 
- */ - uint64_t empty_time; - uint64_t start_empty_time; - uint16_t flags; -#endif +struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; }; -struct blkio_group { - /* An rcu protected unique identifier for the group */ - void *key; - struct hlist_node blkcg_node; - unsigned short blkcg_id; - /* Store cgroup path */ - char path[128]; - /* The device MKDEV(major, minor), this group has been created for */ - dev_t dev; - /* policy which owns this blk group */ - enum blkio_policy_id plid; - - /* Need to serialize the stats in the case of reset/update */ - spinlock_t stats_lock; - struct blkio_group_stats stats; +struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; }; -struct blkio_policy_node { - struct list_head node; - dev_t dev; - /* This node belongs to max bw policy or porportional weight policy */ - enum blkio_policy_id plid; - /* cgroup file to which this rule belongs to */ - int fileid; - - union { - unsigned int weight; - /* - * Rate read/write in terms of byptes per second - * Whether this rate represents read or write is determined - * by file type "fileid". - */ - u64 bps; - unsigned int iops; - } val; +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. + * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. 
+ */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + + /* used during policy activation */ + struct list_head alloc_node; }; -extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, - dev_t dev); -extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, - dev_t dev); -extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, - dev_t dev); -extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, - dev_t dev); -extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, - dev_t dev); - -typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); - -typedef void (blkio_update_group_weight_fn) (void *key, - struct blkio_group *blkg, unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (void * key, - struct blkio_group *blkg, u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (void *key, - struct blkio_group *blkg, u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (void *key, - struct blkio_group *blkg, unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (void *key, - struct blkio_group *blkg, unsigned int write_iops); - -struct blkio_policy_ops { - blkio_unlink_group_fn *blkio_unlink_group_fn; - blkio_update_group_weight_fn *blkio_update_group_weight_fn; - blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; - blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; - blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; - blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* request 
allocation list for this blkcg-q pair */ + struct request_list rl; + + /* reference count */ + atomic_t refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; }; -struct blkio_policy_type { - struct list_head list; - struct blkio_policy_ops ops; - enum blkio_policy_id plid; +typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + +struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; +extern struct blkcg blkcg_root; + +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); +int blkcg_init_queue(struct request_queue *q); +void blkcg_drain_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + /* Blkio controller policy registration */ -extern void blkio_policy_register(struct blkio_policy_type *); -extern void blkio_policy_unregister(struct blkio_policy_type *); +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct 
seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat); +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + +struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; +}; + +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + +static inline struct blkcg *task_blkcg(struct task_struct *tsk) +{ + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); +} + +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return task_blkcg(current); +} + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + +/** + * blkg_to_pd - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? 
blkg->pd[pol->plid] : NULL; +} + +/** + * pd_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + char *p; + + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { + strncpy(buf, "<unavailable>", buflen); + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); +} -static inline char *blkg_path(struct blkio_group *blkg) +void __blkg_release_rcu(struct rcu_head *rcu); + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) { - return blkg->path; + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -#else +struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, + bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. 
If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. 
Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} -struct blkio_group { +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + +static inline void blkg_stat_init(struct blkg_stat *stat) +{ + u64_stats_init(&stat->syncp); +} + +/** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. 
+ */ +static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) +{ + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); +} + +/** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchronization and takes care of u64 atomicity. + */ +static inline uint64_t blkg_stat_read(struct blkg_stat *stat) +{ + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin_irq(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); + + return v; +} + +/** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ +static inline void blkg_stat_reset(struct blkg_stat *stat) +{ + stat->cnt = 0; +} + +/** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + +static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +{ + u64_stats_init(&rwstat->syncp); +} + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. 
+ */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) +{ + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. + * This function can be called without synchronization and takes care of + * u64 atomicity. + */ +static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) +{ + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin_irq(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + + return tmp; +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); +} + +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. 
+ */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + +#else /* CONFIG_BLK_CGROUP */ + +struct cgroup; +struct blkcg; + +struct blkg_policy_data { }; -struct blkio_policy_type { +struct blkcg_gq { }; -static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } -static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } - -static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } - -#endif - -#define BLKIO_WEIGHT_MIN 100 -#define BLKIO_WEIGHT_MAX 1000 -#define BLKIO_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); -void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue); -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg); -void blkiocg_set_start_empty_time(struct blkio_group *blkg); - -#define BLKG_FLAG_FNS(name) \ -static inline void blkio_mark_blkg_##name( \ - struct blkio_group_stats *stats) \ -{ \ - stats->flags |= (1 << BLKG_##name); \ -} \ -static inline void blkio_clear_blkg_##name( \ - struct blkio_group_stats *stats) \ -{ \ - stats->flags &= ~(1 << BLKG_##name); \ -} \ -static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ -{ \ - return (stats->flags & (1 << BLKG_##name)) != 0; \ -} \ - -BLKG_FLAG_FNS(waiting) -BLKG_FLAG_FNS(idling) -BLKG_FLAG_FNS(empty) -#undef BLKG_FLAG_FNS -#else -static inline void blkiocg_update_avg_queue_size_stats( - struct blkio_group *blkg) {} -static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) {} -static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) -{} -static 
inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} -static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} -#endif - -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) -extern struct blkio_cgroup blkio_root_cgroup; -extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); -extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid); -extern int blkiocg_del_blkio_group(struct blkio_group *blkg); -extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, - void *key); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time); -void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, - bool direction, bool sync); -void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync); -void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync); -void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync); -#else -struct cgroup; -static inline struct blkio_cgroup * -cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } - -static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) {} - -static inline int -blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } - -static inline struct blkio_group * -blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } -static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) {} -static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) {} 
-static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, - bool sync) {} -static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) {} -static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -#endif -#endif /* _BLK_CGROUP_H */ +struct blkcg_policy { +}; + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_drain_queue(struct request_queue *q) { } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline 
struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLK_CGROUP */ +#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 518dd423a5f..6f8dba161bf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> @@ -27,22 +28,30 @@ #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> +#include <linux/list_sort.h> +#include <linux/delay.h> +#include <linux/ratelimit.h> +#include <linux/pm_runtime.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" +#include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); -static int __make_request(struct request_queue *q, struct bio *bio); +DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL; /* * For queue allocation @@ -54,42 +63,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_do_io_stat(rq)) - return; - - cpu = part_stat_lock(); - - if (!new_io) { - part = rq->part; - part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!hd_struct_try_get(part)) { - /* - * The partition is already being removed, - * the request will be accounted on the 
disk only - * - * We take a reference on disk->part0 although that - * partition will never be deleted, so we can treat - * it as any other partition. - */ - part = &rq->rq_disk->part0; - hd_struct_get(part); - } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); - rq->part = part; - } - - part_stat_unlock(); -} - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -139,7 +112,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; - rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -149,54 +121,34 @@ EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - struct request_queue *q = rq->q; + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; - if (&q->flush_rq != rq) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + bio_advance(bio, nbytes); - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); - - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); - - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { - /* - * Okay, this is the sequenced flush request in - * progress, just record the error; - */ - if (error && !q->flush_err) - q->flush_err = error; - } + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); } void blk_dump_rq_flags(struct request *rq, char *msg) 
{ int bit; - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); + (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); @@ -207,136 +159,33 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -/* - * "plug" the device if there are no outstanding requests: this will - * force the transfer to start only after we have put all the requests - * on the list. - * - * This is called with interrupts off and no requests on the queue and - * with the queue lock held. 
- */ -void blk_plug_device(struct request_queue *q) +static void blk_delay_work(struct work_struct *work) { - WARN_ON(!irqs_disabled()); - - /* - * don't plug a stopped queue, it must be paired with blk_start_queue() - * which will restart the queueing - */ - if (blk_queue_stopped(q)) - return; + struct request_queue *q; - if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { - mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - trace_block_plug(q); - } + q = container_of(work, struct request_queue, delay_work.work); + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); } -EXPORT_SYMBOL(blk_plug_device); /** - * blk_plug_device_unlocked - plug a device without queue lock held - * @q: The &struct request_queue to plug + * blk_delay_queue - restart queueing after defined interval + * @q: The &struct request_queue in question + * @msecs: Delay in msecs * * Description: - * Like @blk_plug_device(), but grabs the queue lock and disables - * interrupts. - **/ -void blk_plug_device_unlocked(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_plug_device(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_plug_device_unlocked); - -/* - * remove the queue from the plugged list, if present. called with - * queue lock held and interrupts disabled. + * Sometimes queueing needs to be postponed for a little while, to allow + * resources to come back. This function will make sure that queueing is + * restarted around the specified time. Queue lock must be held. */ -int blk_remove_plug(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) - return 0; - - del_timer(&q->unplug_timer); - return 1; -} -EXPORT_SYMBOL(blk_remove_plug); - -/* - * remove the plug and let it rip.. 
- */ -void __generic_unplug_device(struct request_queue *q) -{ - if (unlikely(blk_queue_stopped(q))) - return; - if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) - return; - - q->request_fn(q); -} - -/** - * generic_unplug_device - fire a request queue - * @q: The &struct request_queue in question - * - * Description: - * Linux uses plugging to build bigger requests queues before letting - * the device have at them. If a queue is plugged, the I/O scheduler - * is still adding and merging requests on the queue. Once the queue - * gets unplugged, the request_fn defined for the queue is invoked and - * transfers started. - **/ -void generic_unplug_device(struct request_queue *q) -{ - if (blk_queue_plugged(q)) { - spin_lock_irq(q->queue_lock); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - } -} -EXPORT_SYMBOL(generic_unplug_device); - -static void blk_backing_dev_unplug(struct backing_dev_info *bdi, - struct page *page) +void blk_delay_queue(struct request_queue *q, unsigned long msecs) { - struct request_queue *q = bdi->unplug_io_data; - - blk_unplug(q); -} - -void blk_unplug_work(struct work_struct *work) -{ - struct request_queue *q = - container_of(work, struct request_queue, unplug_work); - - trace_block_unplug_io(q); - q->unplug_fn(q); -} - -void blk_unplug_timeout(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - - trace_block_unplug_timer(q); - kblockd_schedule_work(q, &q->unplug_work); -} - -void blk_unplug(struct request_queue *q) -{ - /* - * devices don't necessarily have an ->unplug_fn defined - */ - if (q->unplug_fn) { - trace_block_unplug_io(q); - q->unplug_fn(q); - } + if (likely(!blk_queue_dead(q))) + queue_delayed_work(kblockd_workqueue, &q->delay_work, + msecs_to_jiffies(msecs)); } -EXPORT_SYMBOL(blk_unplug); +EXPORT_SYMBOL(blk_delay_queue); /** * blk_start_queue - restart a previously stopped queue @@ -352,7 +201,7 @@ void blk_start_queue(struct request_queue *q) WARN_ON(!irqs_disabled()); 
queue_flag_clear(QUEUE_FLAG_STOPPED, q); - __blk_run_queue(q, false); + __blk_run_queue(q); } EXPORT_SYMBOL(blk_start_queue); @@ -372,7 +221,7 @@ EXPORT_SYMBOL(blk_start_queue); **/ void blk_stop_queue(struct request_queue *q) { - blk_remove_plug(q); + cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); @@ -390,51 +239,90 @@ EXPORT_SYMBOL(blk_stop_queue); * that its ->make_request_fn will not re-add plugging prior to calling * this function. * + * This function does not cancel any asynchronous activity arising + * out of elevator or throttling code. That would require elevator_exit() + * and blkcg_exit_queue() to be called with queue lock initialized. + * */ void blk_sync_queue(struct request_queue *q) { - del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); - cancel_work_sync(&q->unplug_work); - throtl_shutdown_timer_wq(q); + + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->run_work); + cancel_delayed_work_sync(&hctx->delay_work); + } + } else { + cancel_delayed_work_sync(&q->delay_work); + } } EXPORT_SYMBOL(blk_sync_queue); /** + * __blk_run_queue_uncond - run a queue whether or not it has been stopped + * @q: The queue to run + * + * Description: + * Invoke request handling on a queue if there are any pending requests. + * May be used to restart request handling after a request has completed. + * This variant runs the queue whether or not the queue has been + * stopped. Must be called with the queue lock held and interrupts + * disabled. See also @blk_run_queue. + */ +inline void __blk_run_queue_uncond(struct request_queue *q) +{ + if (unlikely(blk_queue_dead(q))) + return; + + /* + * Some request_fn implementations, e.g. scsi_request_fn(), unlock + * the queue lock internally. As a result multiple threads may be + * running such a request function concurrently. 
Keep track of the + * number of active request_fn invocations such that blk_drain_queue() + * can wait until all these request_fn calls have finished. + */ + q->request_fn_active++; + q->request_fn(q); + q->request_fn_active--; +} + +/** * __blk_run_queue - run a single device queue * @q: The queue to run - * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. * * Description: * See @blk_run_queue. This variant must be called with the queue lock * held and interrupts disabled. - * */ -void __blk_run_queue(struct request_queue *q, bool force_kblockd) +void __blk_run_queue(struct request_queue *q) { - blk_remove_plug(q); - if (unlikely(blk_queue_stopped(q))) return; - if (elv_queue_empty(q)) - return; - - /* - * Only recurse once to avoid overrunning the stack, let the unplug - * handling reinvoke the handler shortly if we already got there. - */ - if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } + __blk_run_queue_uncond(q); } EXPORT_SYMBOL(__blk_run_queue); /** + * blk_run_queue_async - run a single device queue in workqueue context + * @q: The queue to run + * + * Description: + * Tells kblockd to perform the equivalent of @blk_run_queue on behalf + * of us. The caller must hold the queue lock. 
+ */ +void blk_run_queue_async(struct request_queue *q) +{ + if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) + mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); +} +EXPORT_SYMBOL(blk_run_queue_async); + +/** * blk_run_queue - run a single device queue * @q: The queue to run * @@ -447,7 +335,7 @@ void blk_run_queue(struct request_queue *q) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_run_queue); @@ -456,54 +344,226 @@ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } +EXPORT_SYMBOL(blk_put_queue); -void blk_cleanup_queue(struct request_queue *q) +/** + * __blk_drain_queue - drain requests from request_queue + * @q: queue to drain + * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV + * + * Drain requests from @q. If @drain_all is set, all requests are drained. + * If not, only ELVPRIV requests are drained. The caller is responsible + * for ensuring that no new requests which need to be drained are queued. + */ +static void __blk_drain_queue(struct request_queue *q, bool drain_all) + __releases(q->queue_lock) + __acquires(q->queue_lock) { + int i; + + lockdep_assert_held(q->queue_lock); + + while (true) { + bool drain = false; + + /* + * The caller might be trying to drain @q before its + * elevator is initialized. + */ + if (q->elevator) + elv_drain_elevator(q); + + blkcg_drain_queue(q); + + /* + * This function might be called on a queue which failed + * driver init after queue creation or is not fully + * active yet. Some drivers (e.g. fd and loop) get unhappy + * in such cases. Kick queue iff dispatch queue has + * something on it and @q has request_fn set. 
+ */ + if (!list_empty(&q->queue_head) && q->request_fn) + __blk_run_queue(q); + + drain |= q->nr_rqs_elvpriv; + drain |= q->request_fn_active; + + /* + * Unfortunately, requests are queued at and tracked from + * multiple places and there's no single counter which can + * be drained. Check all the queues and counters. + */ + if (drain_all) { + drain |= !list_empty(&q->queue_head); + for (i = 0; i < 2; i++) { + drain |= q->nr_rqs[i]; + drain |= q->in_flight[i]; + drain |= !list_empty(&q->flush_queue[i]); + } + } + + if (!drain) + break; + + spin_unlock_irq(q->queue_lock); + + msleep(10); + + spin_lock_irq(q->queue_lock); + } + /* - * We know we have process context here, so we can be a little - * cautious and ensure that pending block actions on this device - * are done before moving on. Going into this function, we should - * not have processes doing IO to this device. + * With queue marked dead, any woken up waiter will fail the + * allocation path, so the wakeup chaining is lost and we're + * left with hung waiters. We need to wake up those waiters. */ - blk_sync_queue(q); + if (q->request_fn) { + struct request_list *rl; - del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + blk_queue_for_each_rl(rl, q) + for (i = 0; i < ARRAY_SIZE(rl->wait); i++) + wake_up_all(&rl->wait[i]); + } +} + +/** + * blk_queue_bypass_start - enter queue bypass mode + * @q: queue of interest + * + * In bypass mode, only the dispatch FIFO queue of @q is used. This + * function makes @q enter bypass mode and drains all requests which were + * throttled or issued before. On return, it's guaranteed that no request + * is being throttled or has ELVPRIV set and blk_queue_bypass() is %true + * inside queue or RCU read lock. 
+ */ +void blk_queue_bypass_start(struct request_queue *q) +{ + bool drain; + + spin_lock_irq(q->queue_lock); + drain = !q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + spin_unlock_irq(q->queue_lock); + + if (drain) { + spin_lock_irq(q->queue_lock); + __blk_drain_queue(q, false); + spin_unlock_irq(q->queue_lock); + + /* ensure blk_queue_bypass() is %true inside RCU read lock */ + synchronize_rcu(); + } +} +EXPORT_SYMBOL_GPL(blk_queue_bypass_start); + +/** + * blk_queue_bypass_end - leave queue bypass mode + * @q: queue of interest + * + * Leave bypass mode and restore the normal queueing behavior. + */ +void blk_queue_bypass_end(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + if (!--q->bypass_depth) + queue_flag_clear(QUEUE_FLAG_BYPASS, q); + WARN_ON_ONCE(q->bypass_depth < 0); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_bypass_end); + +/** + * blk_cleanup_queue - shutdown a request queue + * @q: request queue to shutdown + * + * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and + * put it. All future requests will be failed immediately with -ENODEV. + */ +void blk_cleanup_queue(struct request_queue *q) +{ + spinlock_t *lock = q->queue_lock; + + /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); - queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); + queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + spin_lock_irq(lock); + + /* + * A dying queue is permanently in bypass mode till released. Note + * that, unlike blk_queue_bypass_start(), we aren't performing + * synchronize_rcu() after entering bypass mode to avoid the delay + * as some drivers create and destroy a lot of queues while + * probing. This is still safe because blk_release_queue() will be + * called only after the queue refcnt drops to zero and nothing, + * RCU or not, would be traversing the queue by then. 
+ */ + q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + + queue_flag_set(QUEUE_FLAG_NOMERGES, q); + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + queue_flag_set(QUEUE_FLAG_DYING, q); + spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); - if (q->elevator) - elevator_exit(q->elevator); + /* + * Drain all requests queued before DYING marking. Set DEAD flag to + * prevent that q->request_fn() gets invoked after draining finished. + */ + if (q->mq_ops) { + blk_mq_drain_queue(q); + spin_lock_irq(lock); + } else { + spin_lock_irq(lock); + __blk_drain_queue(q, true); + } + queue_flag_set(QUEUE_FLAG_DEAD, q); + spin_unlock_irq(lock); + + /* @q won't process any more request, flush async actions */ + del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + blk_sync_queue(q); + + spin_lock_irq(lock); + if (q->queue_lock != &q->__queue_lock) + q->queue_lock = &q->__queue_lock; + spin_unlock_irq(lock); + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); -static int blk_init_free_list(struct request_queue *q) +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask) { - struct request_list *rl = &q->rq; - if (unlikely(rl->rq_pool)) return 0; + rl->q = q; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - rl->elvpriv = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, q->node); - + mempool_free_slab, request_cachep, + gfp_mask, q->node); if (!rl->rq_pool) return -ENOMEM; return 0; } +void blk_exit_rl(struct request_list *rl) +{ + if (rl->rq_pool) + mempool_destroy(rl->rq_pool); +} + struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { - return blk_alloc_queue_node(gfp_mask, -1); + return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_alloc_queue); @@ 
-517,39 +577,69 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; - q->backing_dev_info.unplug_io_data = q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); + if (q->id < 0) + goto fail_q; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; q->backing_dev_info.name = "block"; + q->node = node_id; err = bdi_init(&q->backing_dev_info); - if (err) { - kmem_cache_free(blk_requestq_cachep, q); - return NULL; - } - - if (blk_throtl_init(q)) { - kmem_cache_free(blk_requestq_cachep, q); - return NULL; - } + if (err) + goto fail_id; setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); - init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); - INIT_LIST_HEAD(&q->pending_flushes); - INIT_WORK(&q->unplug_work, blk_unplug_work); + INIT_LIST_HEAD(&q->icq_list); +#ifdef CONFIG_BLK_CGROUP + INIT_LIST_HEAD(&q->blkg_list); +#endif + INIT_LIST_HEAD(&q->flush_queue[0]); + INIT_LIST_HEAD(&q->flush_queue[1]); + INIT_LIST_HEAD(&q->flush_data_in_flight); + INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); + /* + * By default initialize queue_lock to internal lock and driver can + * override it later if need be. + */ + q->queue_lock = &q->__queue_lock; + + /* + * A queue starts its life with bypass turned on to avoid + * unnecessary bypass on/off overhead and nasty surprises during + * init. The initial bypass will be finished when the queue is + * registered by blk_register_queue(). 
+ */ + q->bypass_depth = 1; + __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + + init_waitqueue_head(&q->mq_freeze_wq); + + if (blkcg_init_queue(q)) + goto fail_bdi; + return q; + +fail_bdi: + bdi_destroy(&q->backing_dev_info); +fail_id: + ida_simple_remove(&blk_queue_ida, q->id); +fail_q: + kmem_cache_free(blk_requestq_cachep, q); + return NULL; } EXPORT_SYMBOL(blk_alloc_queue_node); @@ -588,7 +678,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node); struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { - return blk_init_queue_node(rfn, lock, -1); + return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_init_queue); @@ -601,7 +691,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; - q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) blk_cleanup_queue(uninit_q); @@ -613,85 +703,71 @@ struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) { - return blk_init_allocated_queue_node(q, rfn, lock, -1); -} -EXPORT_SYMBOL(blk_init_allocated_queue); - -struct request_queue * -blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, - spinlock_t *lock, int node_id) -{ if (!q) return NULL; - q->node = node_id; - if (blk_init_free_list(q)) + q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!q->flush_rq) return NULL; + if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) + goto fail; + q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; - q->unplug_fn = generic_unplug_device; - q->queue_flags = QUEUE_FLAG_DEFAULT; - q->queue_lock = lock; + q->queue_flags |= QUEUE_FLAG_DEFAULT; + + /* Override internal queue lock with supplied lock pointer */ + if (lock) + q->queue_lock = lock; /* * This also sets hw/phys segments, boundary and size */ - blk_queue_make_request(q, __make_request); + blk_queue_make_request(q, 
blk_queue_bio); q->sg_reserved_size = INT_MAX; - /* - * all done - */ - if (!elevator_init(q, NULL)) { - blk_queue_congestion_threshold(q); - return q; + /* Protect q->elevator from elevator_change */ + mutex_lock(&q->sysfs_lock); + + /* init elevator */ + if (elevator_init(q, NULL)) { + mutex_unlock(&q->sysfs_lock); + goto fail; } + mutex_unlock(&q->sysfs_lock); + + return q; + +fail: + kfree(q->flush_rq); return NULL; } -EXPORT_SYMBOL(blk_init_allocated_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue); -int blk_get_queue(struct request_queue *q) +bool blk_get_queue(struct request_queue *q) { - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - kobject_get(&q->kobj); - return 0; + if (likely(!blk_queue_dying(q))) { + __blk_get_queue(q); + return true; } - return 1; -} - -static inline void blk_free_request(struct request_queue *q, struct request *rq) -{ - if (rq->cmd_flags & REQ_ELVPRIV) - elv_put_request(q, rq); - mempool_free(rq, q->rq.rq_pool); + return false; } +EXPORT_SYMBOL(blk_get_queue); -static struct request * -blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) +static inline void blk_free_request(struct request_list *rl, struct request *rq) { - struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); - - if (!rq) - return NULL; - - blk_rq_init(q, rq); - - rq->cmd_flags = flags | REQ_ALLOCED; - - if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); - return NULL; - } - rq->cmd_flags |= REQ_ELVPRIV; + if (rq->cmd_flags & REQ_ELVPRIV) { + elv_put_request(rl->q, rq); + if (rq->elv.icq) + put_io_context(rq->elv.icq->ioc); } - return rq; + mempool_free(rq, rl->rq_pool); } /* @@ -728,18 +804,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int sync) +static void __freed_request(struct request_list *rl, int sync) { - struct request_list *rl = &q->rq; + 
struct request_queue *q = rl->q; - if (rl->count[sync] < queue_congestion_off_threshold(q)) + /* + * bdi isn't aware of blkcg yet. As all async IOs end up root + * blkcg anyway, just use root blkcg state. + */ + if (rl == &q->root_rl && + rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); - blk_clear_queue_full(q, sync); + blk_clear_rl_full(rl, sync); } } @@ -747,33 +828,125 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int sync, int priv) +static void freed_request(struct request_list *rl, unsigned int flags) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; + int sync = rw_is_sync(flags); + q->nr_rqs[sync]--; rl->count[sync]--; - if (priv) - rl->elvpriv--; + if (flags & REQ_ELVPRIV) + q->nr_rqs_elvpriv--; - __freed_request(q, sync); + __freed_request(rl, sync); if (unlikely(rl->starved[sync ^ 1])) - __freed_request(q, sync ^ 1); + __freed_request(rl, sync ^ 1); +} + +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, 
BLK_RW_ASYNC); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; } /* - * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. - * Returns !NULL on success, with queue_lock *not held*. + * Determine if elevator data should be initialized when allocating the + * request associated with @bio. */ -static struct request *get_request(struct request_queue *q, int rw_flags, - struct bio *bio, gfp_t gfp_mask) +static bool blk_rq_should_init_elevator(struct bio *bio) { - struct request *rq = NULL; - struct request_list *rl = &q->rq; - struct io_context *ioc = NULL; + if (!bio) + return true; + + /* + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + */ + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) + return false; + + return true; +} + +/** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. + */ +static struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + +/** + * __get_request - get a free request + * @rl: request list to allocate from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask + * + * Get a free request from @q. This function may fail under memory + * pressure or if @q is dead. 
+ * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. + */ +static struct request *__get_request(struct request_list *rl, int rw_flags, + struct bio *bio, gfp_t gfp_mask) +{ + struct request_queue *q = rl->q; + struct request *rq; + struct elevator_type *et = q->elevator->type; + struct io_context *ioc = rq_ioc(bio); + struct io_cq *icq = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv; + int may_queue; + + if (unlikely(blk_queue_dying(q))) + return NULL; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -781,16 +954,15 @@ static struct request *get_request(struct request_queue *q, int rw_flags, if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[is_sync]+1 >= q->nr_requests) { - ioc = current_io_context(GFP_ATOMIC, q->node); /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". * This process will be allowed to complete a batch of * requests, others will be blocked. */ - if (!blk_queue_full(q, is_sync)) { + if (!blk_rl_full(rl, is_sync)) { ioc_set_batching(q, ioc); - blk_set_queue_full(q, is_sync); + blk_set_rl_full(rl, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { @@ -799,11 +971,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * process is not a "batcher", and not * exempted by the IO scheduler */ - goto out; + return NULL; } } } - blk_set_queue_congested(q, is_sync); + /* + * bdi isn't aware of blkcg yet. As all async IOs end up + * root blkcg anyway, just use root blkcg state. 
+ */ + if (rl == &q->root_rl) + blk_set_queue_congested(q, is_sync); } /* @@ -812,45 +989,60 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - goto out; + return NULL; + q->nr_rqs[is_sync]++; rl->count[is_sync]++; rl->starved[is_sync] = 0; - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + /* + * Decide whether the new request will be managed by elevator. If + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will + * prevent the current elevator from being destroyed until the new + * request is freed. This guarantees icq's won't be destroyed and + * makes creating new ones safe. + * + * Also, lookup icq while holding queue_lock. If it doesn't exist, + * it will be created after releasing queue_lock. + */ + if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { + rw_flags |= REQ_ELVPRIV; + q->nr_rqs_elvpriv++; + if (et->icq_cache && ioc) + icq = ioc_lookup_icq(ioc, q); + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); - if (unlikely(!rq)) { - /* - * Allocation failed presumably due to memory. Undo anything - * we might have messed up. - * - * Allocating task should really be put onto the front of the - * wait queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(q, is_sync, priv); + /* allocate and init request */ + rq = mempool_alloc(rl->rq_pool, gfp_mask); + if (!rq) + goto fail_alloc; - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved - * so that freeing of a request in the other direction will - * notice us. 
another possible fix would be to split the - * rq mempool into READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; + blk_rq_init(q, rq); + blk_rq_set_rl(rq, rl); + rq->cmd_flags = rw_flags | REQ_ALLOCED; + + /* init elvpriv */ + if (rw_flags & REQ_ELVPRIV) { + if (unlikely(et->icq_cache && !icq)) { + if (ioc) + icq = ioc_create_icq(ioc, q, gfp_mask); + if (!icq) + goto fail_elvpriv; + } - goto out; - } + rq->elv.icq = icq; + if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) + goto fail_elvpriv; + /* @rq->elv.icq holds io_context until @rq is freed */ + if (icq) + get_io_context(icq->ioc); + } +out: /* * ioc may be NULL here, and ioc_batching will be false. That's * OK, if the queue is under the request limit then requests need @@ -861,73 +1053,131 @@ rq_starved: ioc->nr_batch_requests--; trace_block_getrq(q, bio, rw_flags & 1); -out: return rq; + +fail_elvpriv: + /* + * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed + * and may fail indefinitely under memory pressure and thus + * shouldn't stall IO. Treat this request as !elvpriv. This will + * disturb iosched and blkcg but weird is bettern than dead. + */ + printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", + dev_name(q->backing_dev_info.dev)); + + rq->cmd_flags &= ~REQ_ELVPRIV; + rq->elv.icq = NULL; + + spin_lock_irq(q->queue_lock); + q->nr_rqs_elvpriv--; + spin_unlock_irq(q->queue_lock); + goto out; + +fail_alloc: + /* + * Allocation failed presumably due to memory. Undo anything we + * might have messed up. + * + * Allocating task should really be put onto the front of the wait + * queue, but this is pretty rare. + */ + spin_lock_irq(q->queue_lock); + freed_request(rl, rw_flags); + + /* + * in the very unlikely event that allocation failed and no + * requests for this direction was pending, mark us starved so that + * freeing of a request in the other direction will notice + * us. 
another possible fix would be to split the rq mempool into + * READ and WRITE + */ +rq_starved: + if (unlikely(rl->count[is_sync] == 0)) + rl->starved[is_sync] = 1; + return NULL; } -/* - * No available requests for this queue, unplug the device and wait for some - * requests to become available. - * - * Called with q->queue_lock held, and returns with it unlocked. +/** + * get_request - get a free request + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask + * + * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this + * function keeps retrying under memory pressure and fails iff @q is dead. + * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ -static struct request *get_request_wait(struct request_queue *q, int rw_flags, - struct bio *bio) +static struct request *get_request(struct request_queue *q, int rw_flags, + struct bio *bio, gfp_t gfp_mask) { const bool is_sync = rw_is_sync(rw_flags) != 0; + DEFINE_WAIT(wait); + struct request_list *rl; struct request *rq; - rq = get_request(q, rw_flags, bio, GFP_NOIO); - while (!rq) { - DEFINE_WAIT(wait); - struct io_context *ioc; - struct request_list *rl = &q->rq; + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ +retry: + rq = __get_request(rl, rw_flags, bio, gfp_mask); + if (rq) + return rq; - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); + if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + blk_put_rl(rl); + return NULL; + } - trace_block_sleeprq(q, bio, rw_flags & 1); + /* wait on @rl and retry */ + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, + TASK_UNINTERRUPTIBLE); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - io_schedule(); + trace_block_sleeprq(q, bio, rw_flags & 1); - /* - * 
After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - ioc = current_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, ioc); + spin_unlock_irq(q->queue_lock); + io_schedule(); - spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); + /* + * After sleeping, we become a "batching" process and will be able + * to allocate at least one request, and up to a big batch of them + * for a small period time. See ioc_batching, ioc_set_batching + */ + ioc_set_batching(q, current->io_context); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - }; + spin_lock_irq(q->queue_lock); + finish_wait(&rl->wait[is_sync], &wait); - return rq; + goto retry; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, int rw, + gfp_t gfp_mask) { struct request *rq; BUG_ON(rw != READ && rw != WRITE); + /* create ioc upfront */ + create_io_context(gfp_mask, q->node); + spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) { - rq = get_request_wait(q, rw, NULL); - } else { - rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) - spin_unlock_irq(q->queue_lock); - } + rq = get_request(q, rw, NULL, gfp_mask); + if (!rq) + spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ return rq; } + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + if (q->mq_ops) + return blk_mq_alloc_request(q, rw, gfp_mask, false); + else + return blk_old_get_request(q, rw, gfp_mask); +} EXPORT_SYMBOL(blk_get_request); /** @@ -969,6 +1219,8 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, if (unlikely(!rq)) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + for_each_bio(bio) { struct bio *bounce_bio = bio; int ret; @@ -986,6 +1238,22 @@ struct request 
*blk_make_request(struct request_queue *q, struct bio *bio, EXPORT_SYMBOL(blk_make_request); /** + * blk_rq_set_block_pc - initialize a requeest to type BLOCK_PC + * @rq: request to be initialized + * + */ +void blk_rq_set_block_pc(struct request *rq) +{ + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; +} +EXPORT_SYMBOL(blk_rq_set_block_pc); + +/** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted @@ -1010,64 +1278,25 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_requeue_request); -/** - * blk_insert_request - insert a special request into a request queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * @at_head: insert request at head or tail of queue - * @data: private data - * - * Description: - * Many block devices need to execute commands asynchronously, so they don't - * block the whole kernel from preemption during request execution. This is - * accomplished normally by inserting aritficial requests tagged as - * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them - * be scheduled for actual execution by the request queue. - * - * We have the option of inserting the head or the tail of the queue. - * Typically we use the tail for new ioctls and so forth. We use the head - * of the queue for things like a QUEUE_FULL message from a device, or a - * host that is unable to accept a particular command. - */ -void blk_insert_request(struct request_queue *q, struct request *rq, - int at_head, void *data) +static void add_acct_request(struct request_queue *q, struct request *rq, + int where) { - int where = at_head ? 
ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - unsigned long flags; - - /* - * tell I/O scheduler that this isn't a regular read/write (ie it - * must not attempt merges on this) and that it acts as a soft - * barrier - */ - rq->cmd_type = REQ_TYPE_SPECIAL; - - rq->special = data; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * If command is tagged, release the tag - */ - if (blk_rq_tagged(rq)) - blk_queue_end_tag(q, rq); - - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, where, 0); - __blk_run_queue(q, false); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_account_io_start(rq, true); + __elv_add_request(q, rq, where); } -EXPORT_SYMBOL(blk_insert_request); static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1099,6 +1328,16 @@ void part_round_stats(int cpu, struct hd_struct *part) } EXPORT_SYMBOL_GPL(part_round_stats); +#ifdef CONFIG_PM_RUNTIME +static void blk_pm_put_request(struct request *rq) +{ + if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) + pm_runtime_mark_last_busy(rq->q->dev); +} +#else +static inline void blk_pm_put_request(struct request *rq) {} +#endif + /* * queue lock must be held */ @@ -1106,8 +1345,13 @@ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; - if (unlikely(--req->ref_count)) + + if (q->mq_ops) { + blk_mq_free_request(req); return; + } + + blk_pm_put_request(req); elv_completed_request(q, req); @@ -1119,26 +1363,32 @@ void __blk_put_request(struct request_queue *q, struct request *req) * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { - int 
is_sync = rq_is_sync(req) != 0; - int priv = req->cmd_flags & REQ_ELVPRIV; + unsigned int flags = req->cmd_flags; + struct request_list *rl = blk_rq_rl(req); BUG_ON(!list_empty(&req->queuelist)); - BUG_ON(!hlist_unhashed(&req->hash)); + BUG_ON(ELV_ON_HASH(req)); - blk_free_request(q, req); - freed_request(q, is_sync, priv); + blk_free_request(rl, req); + freed_request(rl, flags); + blk_put_rl(rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { - unsigned long flags; struct request_queue *q = req->q; - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q->mq_ops) + blk_mq_free_request(req); + else { + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + } } EXPORT_SYMBOL(blk_put_request); @@ -1164,19 +1414,124 @@ void blk_add_request_payload(struct request *rq, struct page *page, bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = len; - bio->bi_size = len; + bio->bi_iter.bi_size = len; bio->bi_vcnt = 1; bio->bi_phys_segments = 1; rq->__data_len = rq->resid_len = len; rq->nr_phys_segments = 1; - rq->buffer = bio_data(bio); } EXPORT_SYMBOL_GPL(blk_add_request_payload); +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + + if (!ll_back_merge_fn(q, req, bio)) + return false; + + trace_block_bio_backmerge(q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + + blk_account_io_start(req, false); + return true; +} + +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + + if 
(!ll_front_merge_fn(q, req, bio)) + return false; + + trace_block_bio_frontmerge(q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + bio->bi_next = req->bio; + req->bio = bio; + + req->__sector = bio->bi_iter.bi_sector; + req->__data_len += bio->bi_iter.bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + + blk_account_io_start(req, false); + return true; +} + +/** + * blk_attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @request_count: out parameter for number of traversed plugged requests + * + * Determine whether @bio being queued on @q can be merged with a request + * on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. 
+ */ +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) +{ + struct blk_plug *plug; + struct request *rq; + bool ret = false; + struct list_head *plug_list; + + plug = current->plug; + if (!plug) + goto out; + *request_count = 0; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { + int el_ret; + + if (rq->q == q) + (*request_count)++; + + if (rq->q != q || !blk_rq_merge_ok(rq, bio)) + continue; + + el_ret = blk_try_merge(rq, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + ret = bio_attempt_back_merge(q, rq, bio); + if (ret) + break; + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + ret = bio_attempt_front_merge(q, rq, bio); + if (ret) + break; + } + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { - req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; @@ -1184,31 +1539,18 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_FAILFAST_MASK; req->errors = 0; - req->__sector = bio->bi_sector; + req->__sector = bio->bi_iter.bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } -/* - * Only disabling plugging for non-rotational devices if it does tagging - * as well, otherwise we do need the proper merging - */ -static inline bool queue_should_plug(struct request_queue *q) +void blk_queue_bio(struct request_queue *q, struct bio *bio) { - return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); -} - -static int __make_request(struct request_queue *q, struct bio *bio) -{ - struct request *req; - int el_ret; - unsigned int bytes = bio->bi_size; - const unsigned short prio = bio_prio(bio); const bool sync = !!(bio->bi_rw & REQ_SYNC); - const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); - const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; - int where = ELEVATOR_INSERT_SORT; - int 
rw_flags; + struct blk_plug *plug; + int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; + struct request *req; + unsigned int request_count = 0; /* * low level driver can indicate that it wants pages above a @@ -1217,78 +1559,42 @@ static int __make_request(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); - spin_lock_irq(q->queue_lock); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { - where = ELEVATOR_INSERT_FRONT; + spin_lock_irq(q->queue_lock); + where = ELEVATOR_INSERT_FLUSH; goto get_rq; } - if (elv_queue_empty(q)) - goto get_rq; - - el_ret = elv_merge(q, &req, bio); - switch (el_ret) { - case ELEVATOR_BACK_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_back_merge_fn(q, req, bio)) - break; - - trace_block_bio_backmerge(q, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bytes; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - case ELEVATOR_FRONT_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_front_merge_fn(q, req, bio)) - break; + /* + * Check if we can merge with the plugged list before grabbing + * any locks. 
+ */ + if (!blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; - trace_block_bio_frontmerge(q, bio); + spin_lock_irq(q->queue_lock); - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { - blk_rq_set_mixed_merge(req); - req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= ff; + el_ret = elv_merge(q, &req, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + if (bio_attempt_back_merge(q, req, bio)) { + elv_bio_merged(q, req, bio); + if (!attempt_back_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out_unlock; + } + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + if (bio_attempt_front_merge(q, req, bio)) { + elv_bio_merged(q, req, bio); + if (!attempt_front_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out_unlock; } - - bio->bi_next = req->bio; - req->bio = bio; - - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); - req->__sector = bio->bi_sector; - req->__data_len += bytes; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - /* ELV_NO_MERGE: elevator says don't/can't merge. */ - default: - ; } get_rq: @@ -1305,7 +1611,11 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. 
*/ - req = get_request_wait(q, rw_flags, bio); + req = get_request(q, rw_flags, bio, GFP_NOIO); + if (unlikely(!req)) { + bio_endio(bio, -ENODEV); /* @q is dead */ + goto out_unlock; + } /* * After dropping the lock and possibly sleeping here, our request @@ -1315,22 +1625,34 @@ get_rq: */ init_request_from_bio(req, bio); - spin_lock_irq(q->queue_lock); - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); - if (queue_should_plug(q) && elv_queue_empty(q)) - blk_plug_device(q); - - /* insert the request into the elevator */ - drive_stat_acct(req, 1); - __elv_add_request(q, req, where, 0); -out: - if (unplug || !queue_should_plug(q)) - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - return 0; + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) + req->cpu = raw_smp_processor_id(); + + plug = current->plug; + if (plug) { + /* + * If this is the first request added after a plug, fire + * of a plug trace. 
+ */ + if (!request_count) + trace_block_plug(q); + else { + if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); + } + } + list_add_tail(&req->queuelist, &plug->list); + blk_account_io_start(req, true); + } else { + spin_lock_irq(q->queue_lock); + add_acct_request(q, req, where); + __blk_run_queue(q); +out_unlock: + spin_unlock_irq(q->queue_lock); + } } +EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location @@ -1342,12 +1664,12 @@ static inline void blk_partition_remap(struct bio *bio) if (bio_sectors(bio) && bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; - bio->bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, bdev->bd_dev, - bio->bi_sector - p->start_sect); + bio->bi_iter.bi_sector - p->start_sect); } } @@ -1359,7 +1681,7 @@ static void handle_bad_sector(struct bio *bio) printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", bdevname(bio->bi_bdev, b), bio->bi_rw, - (unsigned long long)bio->bi_sector + bio_sectors(bio), + (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); @@ -1375,29 +1697,27 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static int should_fail_request(struct bio *bio) +static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { - struct hd_struct *part = bio->bi_bdev->bd_part; - - if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) - return should_fail(&fail_make_request, bio->bi_size); - - return 0; + return part->make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { - return init_fault_attr_dentries(&fail_make_request, - "fail_make_request"); + struct dentry *dir = 
fault_create_debugfs_attr("fail_make_request", + NULL, &fail_make_request); + + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline int should_fail_request(struct bio *bio) +static inline bool should_fail_request(struct hd_struct *part, + unsigned int bytes) { - return 0; + return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ @@ -1415,7 +1735,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) /* Test device or partition size, when known. */ maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { - sector_t sector = bio->bi_sector; + sector_t sector = bio->bi_iter.bi_sector; if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { /* @@ -1431,160 +1751,144 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; } -/** - * generic_make_request - hand a buffer to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. - * - * generic_make_request() is used to make I/O requests of block - * devices. It is passed a &struct bio, which describes the I/O that needs - * to be done. - * - * generic_make_request() does not return any status. The - * success/failure status of the request, along with notification of - * completion, is delivered asynchronously through the bio->bi_end_io - * function described (one day) else where. - * - * The caller of generic_make_request must make sure that bi_io_vec - * are set to describe the memory buffer, and that bi_dev and bi_sector are - * set to describe the device address, and the - * bi_end_io and optionally bi_private are set to describe how - * completion notification should be signaled. - * - * generic_make_request and the drivers it calls may use bi_next if this - * bio happens to be merged with someone else, and may change bi_dev and - * bi_sector for remaps as it sees fit. 
So the values of these fields - * should NOT be depended on after the call to generic_make_request. - */ -static inline void __generic_make_request(struct bio *bio) +static noinline_for_stack bool +generic_make_request_checks(struct bio *bio) { struct request_queue *q; - sector_t old_sector; - int ret, nr_sectors = bio_sectors(bio); - dev_t old_dev; + int nr_sectors = bio_sectors(bio); int err = -EIO; + char b[BDEVNAME_SIZE]; + struct hd_struct *part; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; - /* - * Resolve the mapping until finished. (drivers are - * still free to implement/resolve their own stacking - * by explicitly returning 0) - * - * NOTE: we don't repeat the blk_size check for each new device. - * Stacking drivers are expected to know what they are doing. - */ - old_sector = -1; - old_dev = 0; - do { - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bio->bi_bdev); - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_sector); - goto end_io; - } - - if (unlikely(!(bio->bi_rw & REQ_DISCARD) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - - if (should_fail_request(bio)) - goto end_io; - - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of the disk. 
- */ - blk_partition_remap(bio); - - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) - goto end_io; + q = bdev_get_queue(bio->bi_bdev); + if (unlikely(!q)) { + printk(KERN_ERR + "generic_make_request: Trying to access " + "nonexistent block-device %s (%Lu)\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_iter.bi_sector); + goto end_io; + } - if (old_sector != -1) - trace_block_bio_remap(q, bio, old_dev, old_sector); + if (likely(bio_is_rw(bio) && + nr_sectors > queue_max_hw_sectors(q))) { + printk(KERN_ERR "bio too big device %s (%u > %u)\n", + bdevname(bio->bi_bdev, b), + bio_sectors(bio), + queue_max_hw_sectors(q)); + goto end_io; + } - old_sector = bio->bi_sector; - old_dev = bio->bi_bdev->bd_dev; + part = bio->bi_bdev->bd_part; + if (should_fail_request(part, bio->bi_iter.bi_size) || + should_fail_request(&part_to_disk(part)->part0, + bio->bi_iter.bi_size)) + goto end_io; - if (bio_check_eod(bio, nr_sectors)) - goto end_io; + /* + * If this device has partitions, remap block n + * of partition p to block n+start(p) of the disk. + */ + blk_partition_remap(bio); - /* - * Filter flush bio's early so that make_request based - * drivers without flush support don't have to worry - * about them. - */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { - bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); - if (!nr_sectors) { - err = 0; - goto end_io; - } - } + if (bio_check_eod(bio, nr_sectors)) + goto end_io; - if ((bio->bi_rw & REQ_DISCARD) && - (!blk_queue_discard(q) || - ((bio->bi_rw & REQ_SECURE) && - !blk_queue_secdiscard(q)))) { - err = -EOPNOTSUPP; + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. 
+ */ + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + if (!nr_sectors) { + err = 0; goto end_io; } + } - blk_throtl_bio(q, &bio); + if ((bio->bi_rw & REQ_DISCARD) && + (!blk_queue_discard(q) || + ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { + err = -EOPNOTSUPP; + goto end_io; + } - /* - * If bio = NULL, bio has been throttled and will be submitted - * later. - */ - if (!bio) - break; + if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { + err = -EOPNOTSUPP; + goto end_io; + } - trace_block_bio_queue(q, bio); + /* + * Various block parts want %current->io_context and lazy ioc + * allocation ends up trading a lot of pain for a small amount of + * memory. Just allocate it upfront. This may fail and block + * layer knows how to live with it. + */ + create_io_context(GFP_ATOMIC, q->node); - ret = q->make_request_fn(q, bio); - } while (ret); + if (blk_throtl_bio(q, bio)) + return false; /* throttled, will be resubmitted later */ - return; + trace_block_bio_queue(q, bio); + return true; end_io: bio_endio(bio, err); + return false; } -/* - * We only want one ->make_request_fn to be active at a time, - * else stack usage with stacked devices could be a problem. - * So use current->bio_list to keep a list of requests - * submited by a make_request_fn function. - * current->bio_list is also used as a flag to say if - * generic_make_request is currently active in this task or not. - * If it is NULL, then no make_request is active. If it is non-NULL, - * then a make_request is active, and new requests should be added - * at the tail +/** + * generic_make_request - hand a buffer to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * generic_make_request() is used to make I/O requests of block + * devices. It is passed a &struct bio, which describes the I/O that needs + * to be done. 
+ * + * generic_make_request() does not return any status. The + * success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the bio->bi_end_io + * function described (one day) else where. + * + * The caller of generic_make_request must make sure that bi_io_vec + * are set to describe the memory buffer, and that bi_dev and bi_sector are + * set to describe the device address, and the + * bi_end_io and optionally bi_private are set to describe how + * completion notification should be signaled. + * + * generic_make_request and the drivers it calls may use bi_next if this + * bio happens to be merged with someone else, and may resubmit the bio to + * a lower device by calling into generic_make_request recursively, which + * means the bio should NOT be touched after the call to ->make_request_fn. */ void generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + if (!generic_make_request_checks(bio)) + return; + + /* + * We only want one ->make_request_fn to be active at a time, else + * stack usage with stacked devices could be a problem. So use + * current->bio_list to keep a list of requests submited by a + * make_request_fn function. current->bio_list is also used as a + * flag to say if generic_make_request is currently active in this + * task or not. If it is NULL, then no make_request is active. If + * it is non-NULL, then a make_request is active, and new requests + * should be added at the tail + */ if (current->bio_list) { - /* make_request is active */ bio_list_add(current->bio_list, bio); return; } + /* following loop may be a bit non-obvious, and so deserves some * explanation. 
* Before entering the loop, bio->bi_next is NULL (as all callers @@ -1592,22 +1896,21 @@ void generic_make_request(struct bio *bio) * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be - * added. __generic_make_request may indeed add some more bios + * added. ->make_request() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from - * bio_list, and call into __generic_make_request again. - * - * The loop was structured like this to make only one call to - * __generic_make_request (which is important as it is large and - * inlined) and to keep the structure simple. + * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack); current->bio_list = &bio_list_on_stack; do { - __generic_make_request(bio); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + q->make_request_fn(q, bio); + bio = bio_list_pop(current->bio_list); } while (bio); current->bio_list = NULL; /* deactivate */ @@ -1626,19 +1929,24 @@ EXPORT_SYMBOL(generic_make_request); */ void submit_bio(int rw, struct bio *bio) { - int count = bio_sectors(bio); - bio->bi_rw |= rw; /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. 
*/ - if (bio_has_data(bio) && !(rw & REQ_DISCARD)) { + if (bio_has_data(bio)) { + unsigned int count; + + if (unlikely(rw & REQ_WRITE_SAME)) + count = bdev_logical_block_size(bio->bi_bdev) >> 9; + else + count = bio_sectors(bio); + if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { - task_io_account_read(bio->bi_size); + task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } @@ -1647,7 +1955,7 @@ void submit_bio(int rw, struct bio *bio) printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", - (unsigned long long)bio->bi_sector, + (unsigned long long)bio->bi_iter.bi_sector, bdevname(bio->bi_bdev, b), count); } @@ -1673,18 +1981,17 @@ EXPORT_SYMBOL(submit_bio); * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests agaist + * Such request stacking drivers should check those requests against * the new queue limits again when they dispatch those requests, * although such checkings are also done against the old queue limits * when submitting requests. 
*/ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { - if (rq->cmd_flags & REQ_DISCARD) + if (!rq_mergeable(rq)) return 0; - if (blk_rq_sectors(rq) > queue_max_sectors(q) || - blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { + if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } @@ -1713,17 +2020,20 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits); int blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; + int where = ELEVATOR_INSERT_BACK; if (blk_rq_check_limits(q, rq)) return -EIO; -#ifdef CONFIG_FAIL_MAKE_REQUEST - if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && - should_fail(&fail_make_request, blk_rq_bytes(rq))) + if (rq->rq_disk && + should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; -#endif spin_lock_irqsave(q->queue_lock, flags); + if (unlikely(blk_queue_dying(q))) { + spin_unlock_irqrestore(q->queue_lock, flags); + return -ENODEV; + } /* * Submitting request must be dequeued before calling this function @@ -1731,9 +2041,12 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); + if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA)) + where = ELEVATOR_INSERT_FLUSH; + add_acct_request(q, rq, where); + if (where == ELEVATOR_INSERT_FLUSH) + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -1775,7 +2088,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_rw & ff) != ff) break; - bytes += bio->bi_size; + bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ @@ -1784,7 +2097,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void 
blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); @@ -1798,14 +2111,14 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && req != &req->q->flush_rq) { + if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -1824,6 +2137,64 @@ static void blk_account_io_done(struct request *req) } } +#ifdef CONFIG_PM_RUNTIME +/* + * Don't process normal requests when queue is suspended + * or in the process of suspending/resuming + */ +static struct request *blk_pm_peek_request(struct request_queue *q, + struct request *rq) +{ + if (q->dev && (q->rpm_status == RPM_SUSPENDED || + (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM)))) + return NULL; + else + return rq; +} +#else +static inline struct request *blk_pm_peek_request(struct request_queue *q, + struct request *rq) +{ + return rq; +} +#endif + +void blk_account_io_start(struct request *rq, bool new_io) +{ + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_do_io_stat(rq)) + return; + + cpu = part_stat_lock(); + + if (!new_io) { + part = rq->part; + part_stat_inc(cpu, part, merges[rw]); + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } + part_round_stats(cpu, part); + part_inc_in_flight(part, rw); + rq->part = part; + } + + part_stat_unlock(); +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -1846,6 +2217,11 @@ struct request *blk_peek_request(struct request_queue *q) int ret; while ((rq = __elv_next_request(q)) != NULL) { + + rq = blk_pm_peek_request(q, rq); + if (!rq) + break; + if (!(rq->cmd_flags & REQ_STARTED)) { /* * This is the first time the device driver @@ -1970,6 +2346,7 @@ void blk_start_request(struct request *req) if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -2024,13 +2401,12 @@ EXPORT_SYMBOL(blk_fetch_request); **/ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { - int total_bytes, bio_nbytes, next_idx = 0; - struct bio *bio; + int total_bytes; if (!req->bio) return false; - trace_block_rq_complete(req->q, req); + trace_block_rq_complete(req->q, req, nr_bytes); /* * For fs requests, rq is just carrier of independent bio's @@ -2045,63 +2421,56 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (error && req->cmd_type == REQ_TYPE_FS && !(req->cmd_flags & REQ_QUIET)) { - printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", - req->rq_disk ? 
req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); + char *error_type; + + switch (error) { + case -ENOLINK: + error_type = "recoverable transport"; + break; + case -EREMOTEIO: + error_type = "critical target"; + break; + case -EBADE: + error_type = "critical nexus"; + break; + case -ETIMEDOUT: + error_type = "timeout"; + break; + case -ENOSPC: + error_type = "critical space allocation"; + break; + case -ENODATA: + error_type = "critical medium"; + break; + case -EIO: + default: + error_type = "I/O"; + break; + } + printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", + error_type, req->rq_disk ? + req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req)); + } blk_account_io_completion(req, nr_bytes); - total_bytes = bio_nbytes = 0; - while ((bio = req->bio) != NULL) { - int nbytes; + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (nr_bytes >= bio->bi_size) { + if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; - nbytes = bio->bi_size; - req_bio_endio(req, bio, nbytes, error); - next_idx = 0; - bio_nbytes = 0; - } else { - int idx = bio->bi_idx + next_idx; - if (unlikely(idx >= bio->bi_vcnt)) { - blk_dump_rq_flags(req, "__end_that"); - printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", - __func__, idx, bio->bi_vcnt); - break; - } + req_bio_endio(req, bio, bio_bytes, error); - nbytes = bio_iovec_idx(bio, idx)->bv_len; - BIO_BUG_ON(nbytes > bio->bi_size); + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; - /* - * not a complete bvec done - */ - if (unlikely(nbytes > nr_bytes)) { - bio_nbytes += nr_bytes; - total_bytes += nr_bytes; - break; - } - - /* - * advance to the next vector - */ - next_idx++; - bio_nbytes += nbytes; - } - - total_bytes += nbytes; - nr_bytes -= nbytes; - - bio = req->bio; - if (bio) { - /* - * end more in this run, or just return 'not-done' - */ - if (unlikely(nr_bytes <= 0)) - break; - } + if 
(!nr_bytes) + break; } /* @@ -2117,21 +2486,10 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } - /* - * if the request wasn't completed, update state - */ - if (bio_nbytes) { - req_bio_endio(req, bio, bio_nbytes, error); - bio->bi_idx += next_idx; - bio_iovec(bio)->bv_offset += nr_bytes; - bio_iovec(bio)->bv_len -= nr_bytes; - } - req->__data_len -= total_bytes; - req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ - if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD)) + if (req->cmd_type == REQ_TYPE_FS) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ @@ -2145,7 +2503,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - printk(KERN_ERR "blk: request botched\n"); + blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } @@ -2197,7 +2555,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); /* * queue lock must be held */ -static void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, int error) { if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -2212,7 +2570,6 @@ static void blk_finish_request(struct request *req, int error) if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); if (req->end_io) @@ -2224,6 +2581,7 @@ static void blk_finish_request(struct request *req, int error) __blk_put_request(req->q, req); } } +EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request @@ -2273,7 +2631,7 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool __blk_end_bidi_request(struct request *rq, int error, +bool __blk_end_bidi_request(struct 
request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) @@ -2447,11 +2805,10 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_WRITE; - if (bio_has_data(bio)) { + if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); - rq->buffer = bio_data(bio); - } - rq->__data_len = bio->bi_size; + + rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; if (bio->bi_bdev) @@ -2469,10 +2826,10 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, void rq_flush_dcache_pages(struct request *rq) { struct req_iterator iter; - struct bio_vec *bvec; + struct bio_vec bvec; rq_for_each_segment(bvec, rq, iter) - flush_dcache_page(bvec->bv_page); + flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); #endif @@ -2526,7 +2883,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { @@ -2552,7 +2909,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) + * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. 
@@ -2572,16 +2929,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, blk_rq_init(NULL, rq); __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); + bio = bio_clone_bioset(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; - __bio_clone(bio, bio_src); - - if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask, bs)) - goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; @@ -2598,19 +2949,362 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, free_and_out: if (bio) - bio_free(bio, bs); + bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); + +/** + * blk_start_plug - initialize blk_plug and track it inside the task_struct + * @plug: The &struct blk_plug that needs to be initialized + * + * Description: + * Tracking blk_plug inside the task_struct will help with auto-flushing the + * pending I/O should the task end up blocking between blk_start_plug() and + * blk_finish_plug(). This is important from a performance perspective, but + * also ensures that we don't deadlock. 
For instance, if the task is blocking + * for a memory allocation, memory reclaim could end up wanting to free a + * page belonging to that request that is currently residing in our private + * plug. By flushing the pending I/O when the process goes to sleep, we avoid + * this kind of deadlock. + */ +void blk_start_plug(struct blk_plug *plug) +{ + struct task_struct *tsk = current; + + INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->mq_list); + INIT_LIST_HEAD(&plug->cb_list); + + /* + * If this is a nested plug, don't actually assign it. It will be + * flushed on its own. + */ + if (!tsk->plug) { + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; + } +} +EXPORT_SYMBOL(blk_start_plug); + +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return !(rqa->q < rqb->q || + (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); +} + +/* + * If 'from_schedule' is true, then postpone the dispatch of requests + * until a safe kblockd context. We due this to avoid accidental big + * additional stack usage in driver dispatch, in places where the originally + * plugger did not intend it. 
+ */ +static void queue_unplugged(struct request_queue *q, unsigned int depth, + bool from_schedule) + __releases(q->queue_lock) +{ + trace_block_unplug(q, depth, !from_schedule); + + if (from_schedule) + blk_run_queue_async(q); + else + __blk_run_queue(q); + spin_unlock(q->queue_lock); +} + +static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) +{ + LIST_HEAD(callbacks); + + while (!list_empty(&plug->cb_list)) { + list_splice_init(&plug->cb_list, &callbacks); + + while (!list_empty(&callbacks)) { + struct blk_plug_cb *cb = list_first_entry(&callbacks, + struct blk_plug_cb, + list); + list_del(&cb->list); + cb->callback(cb, from_schedule); + } + } +} + +struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, + int size) +{ + struct blk_plug *plug = current->plug; + struct blk_plug_cb *cb; + + if (!plug) + return NULL; + + list_for_each_entry(cb, &plug->cb_list, list) + if (cb->callback == unplug && cb->data == data) + return cb; + + /* Not currently on the callback list */ + BUG_ON(size < sizeof(*cb)); + cb = kzalloc(size, GFP_ATOMIC); + if (cb) { + cb->data = data; + cb->callback = unplug; + list_add(&cb->list, &plug->cb_list); + } + return cb; +} +EXPORT_SYMBOL(blk_check_plugged); + +void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct request_queue *q; + unsigned long flags; + struct request *rq; + LIST_HEAD(list); + unsigned int depth; + + flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); + + if (list_empty(&plug->list)) + return; + + list_splice_init(&plug->list, &list); + + list_sort(NULL, &list, plug_rq_cmp); + + q = NULL; + depth = 0; + + /* + * Save and disable interrupts here, to avoid doing it for every + * queue lock we have to take. 
+ */ + local_irq_save(flags); + while (!list_empty(&list)) { + rq = list_entry_rq(list.next); + list_del_init(&rq->queuelist); + BUG_ON(!rq->q); + if (rq->q != q) { + /* + * This drops the queue lock + */ + if (q) + queue_unplugged(q, depth, from_schedule); + q = rq->q; + depth = 0; + spin_lock(q->queue_lock); + } + + /* + * Short-circuit if @q is dead + */ + if (unlikely(blk_queue_dying(q))) { + __blk_end_request_all(rq, -ENODEV); + continue; + } + + /* + * rq is already accounted, so use raw insert + */ + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) + __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); + else + __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); + + depth++; + } + + /* + * This drops the queue lock + */ + if (q) + queue_unplugged(q, depth, from_schedule); + + local_irq_restore(flags); +} + +void blk_finish_plug(struct blk_plug *plug) +{ + blk_flush_plug_list(plug, false); + + if (plug == current->plug) + current->plug = NULL; +} +EXPORT_SYMBOL(blk_finish_plug); + +#ifdef CONFIG_PM_RUNTIME +/** + * blk_pm_runtime_init - Block layer runtime PM initialization routine + * @q: the queue of the device + * @dev: the device the queue belongs to + * + * Description: + * Initialize runtime-PM-related fields for @q and start auto suspend for + * @dev. Drivers that want to take advantage of request-based runtime PM + * should call this function after @dev has been initialized, and its + * request queue @q has been allocated, and runtime PM for it can not happen + * yet(either due to disabled/forbidden or its usage_count > 0). In most + * cases, driver should call this function before any I/O has taken place. + * + * This function takes care of setting up using auto suspend for the device, + * the autosuspend delay is set to -1 to make runtime suspend impossible + * until an updated value is either set by user or by driver. Drivers do + * not need to touch other autosuspend settings. 
+ * + * The block layer runtime PM is request based, so only works for drivers + * that use request as their IO unit instead of those directly use bio's. + */ +void blk_pm_runtime_init(struct request_queue *q, struct device *dev) +{ + q->dev = dev; + q->rpm_status = RPM_ACTIVE; + pm_runtime_set_autosuspend_delay(q->dev, -1); + pm_runtime_use_autosuspend(q->dev); +} +EXPORT_SYMBOL(blk_pm_runtime_init); + +/** + * blk_pre_runtime_suspend - Pre runtime suspend check + * @q: the queue of the device + * + * Description: + * This function will check if runtime suspend is allowed for the device + * by examining if there are any requests pending in the queue. If there + * are requests pending, the device can not be runtime suspended; otherwise, + * the queue's status will be updated to SUSPENDING and the driver can + * proceed to suspend the device. + * + * For the not allowed case, we mark last busy for the device so that + * runtime PM core will try to autosuspend it some time later. + * + * This function should be called near the start of the device's + * runtime_suspend callback. + * + * Return: + * 0 - OK to runtime suspend the device + * -EBUSY - Device should not be runtime suspended + */ +int blk_pre_runtime_suspend(struct request_queue *q) +{ + int ret = 0; + + spin_lock_irq(q->queue_lock); + if (q->nr_pending) { + ret = -EBUSY; + pm_runtime_mark_last_busy(q->dev); + } else { + q->rpm_status = RPM_SUSPENDING; + } + spin_unlock_irq(q->queue_lock); + return ret; +} +EXPORT_SYMBOL(blk_pre_runtime_suspend); + +/** + * blk_post_runtime_suspend - Post runtime suspend processing + * @q: the queue of the device + * @err: return value of the device's runtime_suspend function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime suspend function and mark last busy for the device so + * that PM core will try to auto suspend the device at a later time. 
+ * + * This function should be called near the end of the device's + * runtime_suspend callback. + */ +void blk_post_runtime_suspend(struct request_queue *q, int err) +{ + spin_lock_irq(q->queue_lock); + if (!err) { + q->rpm_status = RPM_SUSPENDED; + } else { + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + } + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_post_runtime_suspend); + +/** + * blk_pre_runtime_resume - Pre runtime resume processing + * @q: the queue of the device + * + * Description: + * Update the queue's runtime status to RESUMING in preparation for the + * runtime resume of the device. + * + * This function should be called near the start of the device's + * runtime_resume callback. + */ +void blk_pre_runtime_resume(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + q->rpm_status = RPM_RESUMING; + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_pre_runtime_resume); + +/** + * blk_post_runtime_resume - Post runtime resume processing + * @q: the queue of the device + * @err: return value of the device's runtime_resume function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime_resume function. If it is successfully resumed, process + * the requests that are queued into the device's queue when it is resuming + * and then mark last busy and initiate autosuspend for it. + * + * This function should be called near the end of the device's + * runtime_resume callback. 
+ */ +void blk_post_runtime_resume(struct request_queue *q, int err) +{ + spin_lock_irq(q->queue_lock); + if (!err) { + q->rpm_status = RPM_ACTIVE; + __blk_run_queue(q); + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + } else { + q->rpm_status = RPM_SUSPENDED; + } + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_post_runtime_resume); +#endif + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-exec.c b/block/blk-exec.c index cf1456a02ac..f4d27b12c90 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,8 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/sched/sysctl.h> #include "blk.h" @@ -23,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error) struct completion *waiting = rq->end_io_data; rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); /* * complete last, if this is a stack request the process (and thus @@ -43,22 +44,52 @@ static void blk_end_sync_rq(struct request *rq, int error) * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + bool is_pm_resume; + + WARN_ON(irqs_disabled()); rq->rq_disk = bd_disk; rq->end_io = done; - WARN_ON(irqs_disabled()); + + /* + * don't check dying flag for MQ because the request won't + * be resued after dying flag is set + */ + if (q->mq_ops) { + blk_mq_insert_request(rq, at_head, true, false); + return; + } + + /* + * need to check this before __blk_run_queue(), because rq can + * be freed before that returns. 
+ */ + is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; + spin_lock_irq(q->queue_lock); - __elv_add_request(q, rq, where, 1); - __generic_unplug_device(q); - /* the queue is stopped so it won't be plugged+unplugged */ - if (rq->cmd_type == REQ_TYPE_PM_RESUME) - q->request_fn(q); + + if (unlikely(blk_queue_dying(q))) { + rq->cmd_flags |= REQ_QUIET; + rq->errors = -ENXIO; + __blk_end_request_all(rq, rq->errors); + spin_unlock_irq(q->queue_lock); + return; + } + + __elv_add_request(q, rq, where); + __blk_run_queue(q); + /* the queue is stopped so it won't be run */ + if (is_pm_resume) + __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -82,12 +113,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, int err = 0; unsigned long hang_check; - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - if (!rq->sense) { memset(sense, 0, sizeof(sense)); rq->sense = sense; @@ -100,13 +125,18 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; if (hang_check) - while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else - wait_for_completion(&wait); + wait_for_completion_io(&wait); if (rq->errors) err = -EIO; + if (rq->sense == sense) { + rq->sense = NULL; + rq->sense_len = 0; + } + return err; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index b27d0208611..3cb5e9e7108 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,205 +1,424 @@ /* * Functions to sequence FLUSH and FUA writes. + * + * Copyright (C) 2011 Max Planck Institute for Gravitational Physics + * Copyright (C) 2011 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. 
+ * + * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request + * properties and hardware capability. + * + * If a request doesn't have data, only REQ_FLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_FLUSH indicates + * that the device cache should be flushed before the data is executed, and + * REQ_FUA means that the data must be on non-volatile media on request + * completion. + * + * If the device doesn't have writeback cache, FLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no + * data or executed as normal requests otherwise. + * + * If the device has writeback cache and supports FUA, REQ_FLUSH is + * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. + * + * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is + * translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * + * The actual execution of flush is double buffered. Whenever a request + * needs to execute PRE or POSTFLUSH, it queues at + * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * flush is issued and the pending_idx is toggled. When the flush + * completes, all the requests which were pending are proceeded to the next + * step. This allows arbitrary merging of different types of FLUSH/FUA + * requests. + * + * Currently, the following conditions are used to determine when to issue + * flush. + * + * C1. At any given time, only one flush shall be in progress. This makes + * double buffering sufficient. + * + * C2. Flush is deferred if any request is executing DATA of its sequence. + * This avoids issuing separate POSTFLUSHes for requests which shared + * PREFLUSH. + * + * C3. The second condition is ignored if there is a request which has + * waited longer than FLUSH_PENDING_TIMEOUT. 
This is to avoid + * starvation in the unlikely case where there are continuous stream of + * FUA (without FLUSH) requests. + * + * For devices which support FUA, it isn't clear whether C2 (and thus C3) + * is beneficial. + * + * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Once while executing DATA and again after the whole sequence is + * complete. The first completion updates the contained bio but doesn't + * finish it so that the bio submitter is notified only after the whole + * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * req_bio_endio(). + * + * The above peculiarity requires that each FLUSH/FUA request has only one + * bio attached to it, which is guaranteed as they aren't allowed to be + * merged in the usual way. */ + #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> +#include <linux/blk-mq.h> #include "blk.h" +#include "blk-mq.h" /* FLUSH/FUA sequences */ enum { - QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ - QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ - QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ - QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ - QUEUE_FSEQ_DONE = (1 << 4), -}; + REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ + REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ + REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ + REQ_FSEQ_DONE = (1 << 3), -static struct request *queue_next_fseq(struct request_queue *q); + REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | + REQ_FSEQ_POSTFLUSH, -unsigned blk_flush_cur_seq(struct request_queue *q) -{ - if (!q->flush_seq) - return 0; - return 1 << ffz(q->flush_seq); -} + /* + * If flush has been pending longer than the following timeout, + * it's issued even if flush_data requests are still in flight. 
+ */ + FLUSH_PENDING_TIMEOUT = 5 * HZ, +}; -static struct request *blk_flush_complete_seq(struct request_queue *q, - unsigned seq, int error) -{ - struct request *next_rq = NULL; +static bool blk_kick_flush(struct request_queue *q); - if (error && !q->flush_err) - q->flush_err = error; +static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) +{ + unsigned int policy = 0; - BUG_ON(q->flush_seq & seq); - q->flush_seq |= seq; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; - if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { - /* not complete yet, queue the next flush sequence */ - next_rq = queue_next_fseq(q); - } else { - /* complete this flush request */ - __blk_end_request_all(q->orig_flush_rq, q->flush_err); - q->orig_flush_rq = NULL; - q->flush_seq = 0; - - /* dispatch the next flush if there's one */ - if (!list_empty(&q->pending_flushes)) { - next_rq = list_entry_rq(q->pending_flushes.next); - list_move(&next_rq->queuelist, &q->queue_head); - } + if (fflags & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; } - return next_rq; + return policy; } -static void blk_flush_complete_seq_end_io(struct request_queue *q, - unsigned seq, int error) +static unsigned int blk_flush_cur_seq(struct request *rq) { - bool was_empty = elv_queue_empty(q); - struct request *next_rq; - - next_rq = blk_flush_complete_seq(q, seq, error); + return 1 << ffz(rq->flush.seq); +} +static void blk_flush_restore_request(struct request *rq) +{ /* - * Moving a request silently to empty queue_head may stall the - * queue. Kick the queue in those cases. This function is called - * from request completion path and calling directly into - * request_fn may confuse the driver. Always use kblockd. + * After flush data completion, @rq->bio is %NULL but we need to + * complete the bio again. @rq->biotail is guaranteed to equal the + * original @rq->bio. Restore it. 
*/ - if (was_empty && next_rq) - __blk_run_queue(q, true); -} + rq->bio = rq->biotail; -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); -} + /* make @rq a normal request */ + rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->end_io = rq->flush.saved_end_io; -static void flush_data_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); + blk_clear_rq_complete(rq); } -static void post_flush_end_io(struct request *rq, int error) +static bool blk_flush_queue_rq(struct request *rq, bool add_front) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); -} + if (rq->q->mq_ops) { + struct request_queue *q = rq->q; -static void init_flush_request(struct request *rq, struct gendisk *disk) -{ - rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = WRITE_FLUSH; - rq->rq_disk = disk; + blk_mq_add_to_requeue_list(rq, add_front); + blk_mq_kick_requeue_list(q); + return false; + } else { + if (add_front) + list_add(&rq->queuelist, &rq->q->queue_head); + else + list_add_tail(&rq->queuelist, &rq->q->queue_head); + return true; + } } -static struct request *queue_next_fseq(struct request_queue *q) +/** + * blk_flush_complete_seq - complete flush sequence + * @rq: FLUSH/FUA request being sequenced + * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) + * @error: whether an error occurred + * + * @rq just completed @seq part of its flush sequence, record the + * completion and trigger the next step. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * + * RETURNS: + * %true if requests were added to the dispatch queue, %false otherwise. 
+ */ +static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, + int error) { - struct request *orig_rq = q->orig_flush_rq; - struct request *rq = &q->flush_rq; - - blk_rq_init(q, rq); + struct request_queue *q = rq->q; + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + bool queued = false, kicked; + + BUG_ON(rq->flush.seq & seq); + rq->flush.seq |= seq; + + if (likely(!error)) + seq = blk_flush_cur_seq(rq); + else + seq = REQ_FSEQ_DONE; + + switch (seq) { + case REQ_FSEQ_PREFLUSH: + case REQ_FSEQ_POSTFLUSH: + /* queue for flush */ + if (list_empty(pending)) + q->flush_pending_since = jiffies; + list_move_tail(&rq->flush.list, pending); + break; - switch (blk_flush_cur_seq(q)) { - case QUEUE_FSEQ_PREFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = pre_flush_end_io; + case REQ_FSEQ_DATA: + list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + queued = blk_flush_queue_rq(rq, true); break; - case QUEUE_FSEQ_DATA: - init_request_from_bio(rq, orig_rq->bio); + + case REQ_FSEQ_DONE: /* - * orig_rq->rq_disk may be different from - * bio->bi_bdev->bd_disk if orig_rq got here through - * remapping drivers. Make sure rq->rq_disk points - * to the same one as orig_rq. + * @rq was previously adjusted by blk_flush_issue() for + * flush sequencing and may already have gone through the + * flush data request completion path. Restore @rq for + * normal completion and end it. 
*/ - rq->rq_disk = orig_rq->rq_disk; - rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); - rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); - rq->end_io = flush_data_end_io; - break; - case QUEUE_FSEQ_POSTFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = post_flush_end_io; + BUG_ON(!list_empty(&rq->queuelist)); + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + if (q->mq_ops) + blk_mq_end_io(rq, error); + else + __blk_end_request_all(rq, error); break; + default: BUG(); } - elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); - return rq; + kicked = blk_kick_flush(q); + return kicked | queued; } -struct request *blk_do_flush(struct request_queue *q, struct request *rq) +static void flush_end_io(struct request *flush_rq, int error) { - unsigned int fflags = q->flush_flags; /* may change, cache it */ - bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; - bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); - bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); - unsigned skip = 0; + struct request_queue *q = flush_rq->q; + struct list_head *running; + bool queued = false; + struct request *rq, *n; + unsigned long flags = 0; + + if (q->mq_ops) { + spin_lock_irqsave(&q->mq_flush_lock, flags); + q->flush_rq->tag = -1; + } + + running = &q->flush_queue[q->flush_running_idx]; + BUG_ON(q->flush_pending_idx == q->flush_running_idx); + + /* account completion of the flush request */ + q->flush_running_idx ^= 1; + + if (!q->mq_ops) + elv_completed_request(q, flush_rq); + + /* and push the waiting requests to the next stage */ + list_for_each_entry_safe(rq, n, running, flush.list) { + unsigned int seq = blk_flush_cur_seq(rq); + + BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + queued |= blk_flush_complete_seq(rq, seq, error); + } /* - * Special case. If there's data but flush is not necessary, - * the request can be issued directly. 
- * - * Flush w/o data should be able to be issued directly too but - * currently some drivers assume that rq->bio contains - * non-zero data if it isn't NULL and empty FLUSH requests - * getting here usually have bio's without data. + * Kick the queue to avoid stall for two cases: + * 1. Moving a request silently to empty queue_head may stall the + * queue. + * 2. When flush request is running in non-queueable queue, the + * queue is hold. Restart the queue after flush request is finished + * to avoid stall. + * This function is called from request completion path and calling + * directly into request_fn may confuse the driver. Always use + * kblockd. */ - if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - return rq; + if (queued || q->flush_queue_delayed) { + WARN_ON(q->mq_ops); + blk_run_queue_async(q); } + q->flush_queue_delayed = 0; + if (q->mq_ops) + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + +/** + * blk_kick_flush - consider issuing flush request + * @q: request_queue being kicked + * + * Flush related states of @q have changed, consider issuing flush request. + * Please read the comment at the top of this file for more info. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * + * RETURNS: + * %true if flush was issued, %false otherwise. + */ +static bool blk_kick_flush(struct request_queue *q) +{ + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct request *first_rq = + list_first_entry(pending, struct request, flush.list); + + /* C1 described at the top of this file */ + if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + return false; + + /* C2 and C3 */ + if (!list_empty(&q->flush_data_in_flight) && + time_before(jiffies, + q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + return false; /* - * Sequenced flushes can't be processed in parallel. 
If - * another one is already in progress, queue for later - * processing. + * Issue flush and toggle pending_idx. This makes pending_idx + * different from running_idx, which means flush is in flight. */ - if (q->flush_seq) { - list_move_tail(&rq->queuelist, &q->pending_flushes); - return NULL; - } + q->flush_pending_idx ^= 1; + + blk_rq_init(q, q->flush_rq); + if (q->mq_ops) + blk_mq_clone_flush_request(q->flush_rq, first_rq); + + q->flush_rq->cmd_type = REQ_TYPE_FS; + q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq->rq_disk = first_rq->rq_disk; + q->flush_rq->end_io = flush_end_io; + + return blk_flush_queue_rq(q->flush_rq, false); +} + +static void flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; /* - * Start a new flush sequence + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). */ - q->flush_err = 0; - q->flush_seq |= QUEUE_FSEQ_STARTED; + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + blk_run_queue_async(q); +} - /* adjust FLUSH/FUA of the original request and stash it away */ - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - blk_dequeue_request(rq); - q->orig_flush_rq = rq; - - /* skip unneded sequences and return the first one */ - if (!do_preflush) - skip |= QUEUE_FSEQ_PREFLUSH; - if (!blk_rq_sectors(rq)) - skip |= QUEUE_FSEQ_DATA; - if (!do_postflush) - skip |= QUEUE_FSEQ_POSTFLUSH; - return blk_flush_complete_seq(q, skip, 0); +static void mq_flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + unsigned long flags; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). 
+ */ + spin_lock_irqsave(&q->mq_flush_lock, flags); + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + blk_mq_run_hw_queue(hctx, true); + spin_unlock_irqrestore(&q->mq_flush_lock, flags); } -static void bio_end_flush(struct bio *bio, int err) +/** + * blk_insert_flush - insert a new FLUSH/FUA request + * @rq: request to insert + * + * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request. + * @rq is being submitted. Analyze what needs to be done and put it on the + * right queue. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) in !mq case + */ +void blk_insert_flush(struct request *rq) { - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); + struct request_queue *q = rq->q; + unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned int policy = blk_flush_policy(fflags, rq); + + /* + * @policy now records what operations need to be done. Adjust + * REQ_FLUSH and FUA for the driver. + */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!(fflags & REQ_FUA)) + rq->cmd_flags &= ~REQ_FUA; + + /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. + */ + if (!policy) { + if (q->mq_ops) + blk_mq_end_io(rq, 0); + else + __blk_end_bidi_request(rq, 0, 0, 0); + return; + } + + BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ + + /* + * If there's data but flush is not necessary, the request can be + * processed directly without going through flush machinery. Queue + * for normal execution. 
+ */ + if ((policy & REQ_FSEQ_DATA) && + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + if (q->mq_ops) { + blk_mq_insert_request(rq, false, false, true); + } else + list_add_tail(&rq->queuelist, &q->queue_head); + return; + } + + /* + * @rq should go through flush machinery. Mark it part of flush + * sequence and submit for further processing. + */ + memset(&rq->flush, 0, sizeof(rq->flush)); + INIT_LIST_HEAD(&rq->flush.list); + rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + if (q->mq_ops) { + rq->end_io = mq_flush_data_end_io; + + spin_lock_irq(&q->mq_flush_lock); + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&q->mq_flush_lock); + return; + } + rq->end_io = flush_data_end_io; + + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); } /** @@ -217,7 +436,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -239,13 +457,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion(&wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in ->bi_sector, if @@ -253,12 +467,14 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, * copied from blk_rq_pos(rq). 
*/ if (error_sector) - *error_sector = bio->bi_sector; - - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; + *error_sector = bio->bi_iter.bi_sector; bio_put(bio); return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +void blk_mq_init_flush(struct request_queue *q) +{ + spin_lock_init(&q->mq_flush_lock); +} diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 54bcba6c02a..7fbab84399e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -24,12 +24,15 @@ #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> +#include <linux/export.h> #include <linux/slab.h> #include "blk.h" static struct kmem_cache *integrity_cachep; +static const char *bi_unsupported_name = "unsupported"; + /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -40,30 +43,32 @@ static struct kmem_cache *integrity_cachep; */ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv = NULL; + struct bio_vec iv, ivprv = { NULL }; unsigned int segments = 0; unsigned int seg_size = 0; - unsigned int i = 0; + struct bvec_iter iter; + int prev = 0; - bio_for_each_integrity_vec(iv, bio, i) { + bio_for_each_integrity_vec(iv, bio, iter) { - if (ivprv) { - if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (prev) { + if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) goto new_segment; - if (seg_size + iv->bv_len > queue_max_segment_size(q)) + if (seg_size + iv.bv_len > queue_max_segment_size(q)) goto new_segment; - seg_size += iv->bv_len; + seg_size += iv.bv_len; } else { new_segment: segments++; - seg_size = iv->bv_len; + seg_size = iv.bv_len; } + prev = 1; ivprv = iv; } @@ -84,37 +89,39 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv = NULL; + struct 
bio_vec iv, ivprv = { NULL }; struct scatterlist *sg = NULL; unsigned int segments = 0; - unsigned int i = 0; + struct bvec_iter iter; + int prev = 0; - bio_for_each_integrity_vec(iv, bio, i) { + bio_for_each_integrity_vec(iv, bio, iter) { - if (ivprv) { - if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (prev) { + if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) goto new_segment; - if (sg->length + iv->bv_len > queue_max_segment_size(q)) + if (sg->length + iv.bv_len > queue_max_segment_size(q)) goto new_segment; - sg->length += iv->bv_len; + sg->length += iv.bv_len; } else { new_segment: if (!sg) sg = sglist; else { - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); } - sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); segments++; } + prev = 1; ivprv = iv; } @@ -358,6 +365,14 @@ static struct kobj_type integrity_ktype = { .release = blk_integrity_release, }; +bool blk_integrity_is_initialized(struct gendisk *disk) +{ + struct blk_integrity *bi = blk_get_integrity(disk); + + return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); +} +EXPORT_SYMBOL(blk_integrity_is_initialized); + /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware @@ -407,7 +422,9 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi->get_tag_fn = template->get_tag_fn; bi->tag_size = template->tag_size; } else - bi->name = "unsupported"; + bi->name = bi_unsupported_name; + + disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; return 0; } @@ -427,6 +444,8 @@ void blk_integrity_unregister(struct gendisk *disk) if (!disk || !disk->integrity) return; + disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES; + bi = disk->integrity; kobject_uevent(&bi->kobj, KOBJ_REMOVE); diff --git 
a/block/blk-ioc.c b/block/blk-ioc.c index b791022beef..1a27f45ec77 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ #include <linux/slab.h> #include "blk.h" @@ -16,52 +15,185 @@ */ static struct kmem_cache *iocontext_cachep; -static void cfq_dtor(struct io_context *ioc) +/** + * get_io_context - increment reference count to io_context + * @ioc: io_context to get + * + * Increment reference count to @ioc. + */ +void get_io_context(struct io_context *ioc) { - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + atomic_long_inc(&ioc->refcount); +} +EXPORT_SYMBOL(get_io_context); - cic = list_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->dtor(ioc); - } +static void icq_free_icq_rcu(struct rcu_head *head) +{ + struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); + + kmem_cache_free(icq->__rcu_icq_cache, icq); +} + +/* Exit an icq. Called with both ioc and q locked. */ +static void ioc_exit_icq(struct io_cq *icq) +{ + struct elevator_type *et = icq->q->elevator->type; + + if (icq->flags & ICQ_EXITED) + return; + + if (et->ops.elevator_exit_icq_fn) + et->ops.elevator_exit_icq_fn(icq); + + icq->flags |= ICQ_EXITED; +} + +/* Release an icq. Called with both ioc and q locked. */ +static void ioc_destroy_icq(struct io_cq *icq) +{ + struct io_context *ioc = icq->ioc; + struct request_queue *q = icq->q; + struct elevator_type *et = q->elevator->type; + + lockdep_assert_held(&ioc->lock); + lockdep_assert_held(q->queue_lock); + + radix_tree_delete(&ioc->icq_tree, icq->q->id); + hlist_del_init(&icq->ioc_node); + list_del_init(&icq->q_node); + + /* + * Both setting lookup hint to and clearing it from @icq are done + * under queue_lock. If it's not pointing to @icq now, it never + * will. Hint assignment itself can race safely. 
+ */ + if (rcu_access_pointer(ioc->icq_hint) == icq) + rcu_assign_pointer(ioc->icq_hint, NULL); + + ioc_exit_icq(icq); + + /* + * @icq->q might have gone away by the time RCU callback runs + * making it impossible to determine icq_cache. Record it in @icq. + */ + icq->__rcu_icq_cache = et->icq_cache; + call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } /* - * IO Context helper functions. put_io_context() returns 1 if there are no - * more users of this io context, 0 otherwise. + * Slow path for ioc release in put_io_context(). Performs double-lock + * dancing to unlink all icq's and then frees ioc. */ -int put_io_context(struct io_context *ioc) +static void ioc_release_fn(struct work_struct *work) { + struct io_context *ioc = container_of(work, struct io_context, + release_work); + unsigned long flags; + + /* + * Exiting icq may call into put_io_context() through elevator + * which will trigger lockdep warning. The ioc's are guaranteed to + * be different, use a different locking subclass here. Use + * irqsave variant as there's no spin_lock_irq_nested(). + */ + spin_lock_irqsave_nested(&ioc->lock, flags, 1); + + while (!hlist_empty(&ioc->icq_list)) { + struct io_cq *icq = hlist_entry(ioc->icq_list.first, + struct io_cq, ioc_node); + struct request_queue *q = icq->q; + + if (spin_trylock(q->queue_lock)) { + ioc_destroy_icq(icq); + spin_unlock(q->queue_lock); + } else { + spin_unlock_irqrestore(&ioc->lock, flags); + cpu_relax(); + spin_lock_irqsave_nested(&ioc->lock, flags, 1); + } + } + + spin_unlock_irqrestore(&ioc->lock, flags); + + kmem_cache_free(iocontext_cachep, ioc); +} + +/** + * put_io_context - put a reference of io_context + * @ioc: io_context to put + * + * Decrement reference count of @ioc and release it if the count reaches + * zero. 
+ */ +void put_io_context(struct io_context *ioc) +{ + unsigned long flags; + bool free_ioc = false; + if (ioc == NULL) - return 1; + return; - BUG_ON(atomic_long_read(&ioc->refcount) == 0); + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + /* + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. + */ if (atomic_long_dec_and_test(&ioc->refcount)) { - rcu_read_lock(); - cfq_dtor(ioc); - rcu_read_unlock(); + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + queue_work(system_power_efficient_wq, + &ioc->release_work); + else + free_ioc = true; + spin_unlock_irqrestore(&ioc->lock, flags); + } + if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); - return 1; - } - return 0; } EXPORT_SYMBOL(put_io_context); -static void cfq_exit(struct io_context *ioc) +/** + * put_io_context_active - put active reference on ioc + * @ioc: ioc of interest + * + * Undo get_io_context_active(). If active reference reaches zero after + * put, @ioc can never issue further IOs and ioscheds are notified. + */ +void put_io_context_active(struct io_context *ioc) { - rcu_read_lock(); + unsigned long flags; + struct io_cq *icq; - if (!hlist_empty(&ioc->cic_list)) { - struct cfq_io_context *cic; + if (!atomic_dec_and_test(&ioc->active_ref)) { + put_io_context(ioc); + return; + } - cic = list_entry(ioc->cic_list.first, struct cfq_io_context, - cic_list); - cic->exit(ioc); + /* + * Need ioc lock to walk icq_list and q lock to exit icq. Perform + * reverse double locking. Read comment in ioc_release_fn() for + * explanation on the nested locking annotation. 
+ */ +retry: + spin_lock_irqsave_nested(&ioc->lock, flags, 1); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { + if (icq->flags & ICQ_EXITED) + continue; + if (spin_trylock(icq->q->queue_lock)) { + ioc_exit_icq(icq); + spin_unlock(icq->q->queue_lock); + } else { + spin_unlock_irqrestore(&ioc->lock, flags); + cpu_relax(); + goto retry; + } } - rcu_read_unlock(); + spin_unlock_irqrestore(&ioc->lock, flags); + + put_io_context(ioc); } /* Called by the exiting task */ @@ -74,83 +206,197 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) - cfq_exit(ioc); - - put_io_context(ioc); + atomic_dec(&ioc->nr_tasks); + put_io_context_active(ioc); } -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +/** + * ioc_clear_queue - break any ioc association with the specified queue + * @q: request_queue being cleared + * + * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked. + */ +void ioc_clear_queue(struct request_queue *q) { - struct io_context *ret; - - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_long_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = 0; /* doesn't matter... 
*/ - ret->nr_batch_requests = 0; /* because this is 0 */ - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->cic_list); - ret->ioc_data = NULL; + lockdep_assert_held(q->queue_lock); + + while (!list_empty(&q->icq_list)) { + struct io_cq *icq = list_entry(q->icq_list.next, + struct io_cq, q_node); + struct io_context *ioc = icq->ioc; + + spin_lock(&ioc->lock); + ioc_destroy_icq(icq); + spin_unlock(&ioc->lock); } +} + +int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) +{ + struct io_context *ioc; + int ret; + + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, + node); + if (unlikely(!ioc)) + return -ENOMEM; + + /* initialize */ + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + atomic_set(&ioc->active_ref, 1); + spin_lock_init(&ioc->lock); + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->icq_list); + INIT_WORK(&ioc->release_work, ioc_release_fn); + + /* + * Try to install. ioc shouldn't be installed if someone else + * already did or @task, which isn't %current, is exiting. Note + * that we need to allow ioc creation on exiting %current as exit + * path may issue IOs from e.g. exit_files(). The exit path is + * responsible for not issuing IO after exit_io_context(). + */ + task_lock(task); + if (!task->io_context && + (task == current || !(task->flags & PF_EXITING))) + task->io_context = ioc; + else + kmem_cache_free(iocontext_cachep, ioc); + + ret = task->io_context ? 0 : -EBUSY; + + task_unlock(task); return ret; } -/* - * If the current task has no IO context then create one and initialise it. - * Otherwise, return its existing IO context. 
+/** + * get_task_io_context - get io_context of a task + * @task: task of interest + * @gfp_flags: allocation flags, used if allocation is necessary + * @node: allocation node, used if allocation is necessary * - * This returned IO context doesn't have a specifically elevated refcount, - * but since the current task itself holds a reference, the context can be - * used in general code, so long as it stays within `current` context. + * Return io_context of @task. If it doesn't exist, it is created with + * @gfp_flags and @node. The returned io_context has its reference count + * incremented. + * + * This function always goes through task_lock() and it's better to use + * %current->io_context + get_io_context() for %current. */ -struct io_context *current_io_context(gfp_t gfp_flags, int node) +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) { - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } + struct io_context *ioc; - return ret; + might_sleep_if(gfp_flags & __GFP_WAIT); + + do { + task_lock(task); + ioc = task->io_context; + if (likely(ioc)) { + get_io_context(ioc); + task_unlock(task); + return ioc; + } + task_unlock(task); + } while (!create_task_io_context(task, gfp_flags, node)); + + return NULL; } +EXPORT_SYMBOL(get_task_io_context); -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. +/** + * ioc_lookup_icq - lookup io_cq from ioc + * @ioc: the associated io_context + * @q: the associated request_queue * - * This is always called in the context of the task which submitted the I/O. + * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called + * with @q->queue_lock held. 
*/ -struct io_context *get_io_context(gfp_t gfp_flags, int node) +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) { - struct io_context *ret = NULL; + struct io_cq *icq; + + lockdep_assert_held(q->queue_lock); /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. + * icq's are indexed from @ioc using radix tree and hint pointer, + * both of which are protected with RCU. All removals are done + * holding both q and ioc locks, and we're holding q lock - if we + * find a icq which points to us, it's guaranteed to be valid. */ - do { - ret = current_io_context(gfp_flags, node); - if (unlikely(!ret)) - break; - } while (!atomic_long_inc_not_zero(&ret->refcount)); + rcu_read_lock(); + icq = rcu_dereference(ioc->icq_hint); + if (icq && icq->q == q) + goto out; - return ret; + icq = radix_tree_lookup(&ioc->icq_tree, q->id); + if (icq && icq->q == q) + rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ + else + icq = NULL; +out: + rcu_read_unlock(); + return icq; +} +EXPORT_SYMBOL(ioc_lookup_icq); + +/** + * ioc_create_icq - create and link io_cq + * @ioc: io_context of interest + * @q: request_queue of interest + * @gfp_mask: allocation mask + * + * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they + * will be created using @gfp_mask. + * + * The caller is responsible for ensuring @ioc won't go away and @q is + * alive and will stay alive until this function returns. 
+ */ +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask) +{ + struct elevator_type *et = q->elevator->type; + struct io_cq *icq; + + /* allocate stuff */ + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + q->node); + if (!icq) + return NULL; + + if (radix_tree_maybe_preload(gfp_mask) < 0) { + kmem_cache_free(et->icq_cache, icq); + return NULL; + } + + icq->ioc = ioc; + icq->q = q; + INIT_LIST_HEAD(&icq->q_node); + INIT_HLIST_NODE(&icq->ioc_node); + + /* lock both q and ioc and try to link @icq */ + spin_lock_irq(q->queue_lock); + spin_lock(&ioc->lock); + + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { + hlist_add_head(&icq->ioc_node, &ioc->icq_list); + list_add(&icq->q_node, &q->icq_list); + if (et->ops.elevator_init_icq_fn) + et->ops.elevator_init_icq_fn(icq); + } else { + kmem_cache_free(et->icq_cache, icq); + icq = ioc_lookup_icq(ioc, q); + if (!icq) + printk(KERN_ERR "cfq: icq link failed!\n"); + } + + spin_unlock(&ioc->lock); + spin_unlock_irq(q->queue_lock); + radix_tree_preload_end(); + return icq; } -EXPORT_SYMBOL(get_io_context); static int __init blk_ioc_init(void) { diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 58916afbbda..0736729d649 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -14,9 +14,6 @@ #include "blk.h" -int blk_iopoll_enabled = 1; -EXPORT_SYMBOL(blk_iopoll_enabled); - static unsigned int blk_iopoll_budget __read_mostly = 256; static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); @@ -35,7 +32,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop) unsigned long flags; local_irq_save(flags); - list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); + list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_restore(flags); } @@ -52,7 +49,7 @@ EXPORT_SYMBOL(blk_iopoll_sched); void __blk_iopoll_complete(struct blk_iopoll *iop) { list_del(&iop->list); - smp_mb__before_clear_bit(); + 
smp_mb__before_atomic(); clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); } EXPORT_SYMBOL(__blk_iopoll_complete); @@ -67,19 +64,19 @@ EXPORT_SYMBOL(__blk_iopoll_complete); * iopoll handler will not be invoked again before blk_iopoll_sched_prep() * is called. **/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) +void blk_iopoll_complete(struct blk_iopoll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iopoll); + __blk_iopoll_complete(iop); local_irq_restore(flags); } EXPORT_SYMBOL(blk_iopoll_complete); static void blk_iopoll_softirq(struct softirq_action *h) { - struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = blk_iopoll_budget; unsigned long start_time = jiffies; @@ -164,7 +161,7 @@ EXPORT_SYMBOL(blk_iopoll_disable); void blk_iopoll_enable(struct blk_iopoll *iop) { BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); } EXPORT_SYMBOL(blk_iopoll_enable); @@ -189,8 +186,8 @@ void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) } EXPORT_SYMBOL(blk_iopoll_init); -static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int blk_iopoll_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { /* * If a CPU goes away, splice its entries to the current CPU @@ -201,7 +198,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - &__get_cpu_var(blk_cpu_iopoll)); + this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); local_irq_enable(); } @@ -209,7 +206,7 @@ static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = { +static struct 
notifier_block blk_iopoll_cpu_notifier = { .notifier_call = blk_iopoll_cpu_notify, }; diff --git a/block/blk-lib.c b/block/blk-lib.c index eec78becb35..8411be3c19d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,17 +9,20 @@ #include "blk.h" -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } +struct bio_batch { + atomic_t done; + unsigned long flags; + struct completion *wait; +}; - if (bio->bi_private) - complete(bio->bi_private); +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + if (err && (err != -EOPNOTSUPP)) + clear_bit(BIO_UPTODATE, &bb->flags); + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); bio_put(bio); } @@ -40,9 +43,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors; + unsigned int max_discard_sectors, granularity; + int alignment; + struct bio_batch bb; struct bio *bio; int ret = 0; + struct blk_plug plug; if (!q) return -ENXIO; @@ -50,15 +56,19 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + /* * Ensure that max_discard_sectors is of the proper - * granularity + * granularity, so that requests stay aligned after a split. 
*/ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - if (q->limits.discard_granularity) { - unsigned int disc_sects = q->limits.discard_granularity >> 9; - - max_discard_sectors &= ~(disc_sects - 1); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) { + /* Avoid infinite loop below. Being cautious never hurts. */ + return -EOPNOTSUPP; } if (flags & BLKDEV_DISCARD_SECURE) { @@ -67,69 +77,143 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, type |= REQ_SECURE; } - while (nr_sects && !ret) { + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + blk_start_plug(&plug); + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + bio = bio_alloc(gfp_mask, 1); if (!bio) { ret = -ENOMEM; break; } - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - bio->bi_private = &wait; + req_sects = min_t(sector_t, nr_sects, max_discard_sectors); - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; + /* + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. 
+ */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; } - bio_get(bio); - submit_bio(type, bio); + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = bio_batch_end_io; + bio->bi_bdev = bdev; + bio->bi_private = &bb; + + bio->bi_iter.bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; - wait_for_completion(&wait); + atomic_inc(&bb.done); + submit_bio(type, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); } + blk_finish_plug(&plug); + + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) + wait_for_completion_io(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -EIO; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); -struct bio_batch +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data to write + * + * Description: + * Issue a write same request for the sectors in question. 
+ */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) { - atomic_t done; - unsigned long flags; - struct completion *wait; - bio_end_io_t *end_io; -}; + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_write_same_sectors; + struct bio_batch bb; + struct bio *bio; + int ret = 0; -static void bio_batch_end_io(struct bio *bio, int err) -{ - struct bio_batch *bb = bio->bi_private; + if (!q) + return -ENXIO; - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bb->flags); - else - clear_bit(BIO_UPTODATE, &bb->flags); - } - if (bb) { - if (bb->end_io) - bb->end_io(bio, err); - atomic_inc(&bb->done); - complete(bb->wait); + max_write_same_sectors = q->limits.max_write_same_sectors; + + if (max_write_same_sectors == 0) + return -EOPNOTSUPP; + + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + while (nr_sects) { + bio = bio_alloc(gfp_mask, 1); + if (!bio) { + ret = -ENOMEM; + break; + } + + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = bio_batch_end_io; + bio->bi_bdev = bdev; + bio->bi_private = &bb; + bio->bi_vcnt = 1; + bio->bi_io_vec->bv_page = page; + bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); + + if (nr_sects > max_write_same_sectors) { + bio->bi_iter.bi_size = max_write_same_sectors << 9; + nr_sects -= max_write_same_sectors; + sector += max_write_same_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + + atomic_inc(&bb.done); + submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); } - bio_put(bio); + + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) + wait_for_completion_io(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -ENOTSUPP; + + return ret; } +EXPORT_SYMBOL(blkdev_issue_write_same); /** * blkdev_issue_zeroout - generate number of zero filed write bios @@ -140,25 +224,21 @@ static 
void bio_batch_end_io(struct bio *bio, int err) * * Description: * Generate and issue number of bios with zerofiled pages. - * Send barrier at the beginning and at the end if requested. This guarantie - * correct request ordering. Empty barrier allow us to avoid post queue flush. */ -int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; struct bio_batch bb; - unsigned int sz, issued = 0; + unsigned int sz; DECLARE_COMPLETION_ONSTACK(wait); - atomic_set(&bb.done, 0); + atomic_set(&bb.done, 1); bb.flags = 1 << BIO_UPTODATE; bb.wait = &wait; - bb.end_io = NULL; -submit: ret = 0; while (nr_sects != 0) { bio = bio_alloc(gfp_mask, @@ -168,16 +248,13 @@ submit: break; } - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = bio_batch_end_io; bio->bi_private = &bb; while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - if (sz == 0) - /* bio has maximum size possible */ - break; ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); nr_sects -= ret >> 9; sector += ret >> 9; @@ -185,28 +262,46 @@ submit: break; } ret = 0; - issued++; + atomic_inc(&bb.done); submit_bio(WRITE, bio); } /* Wait for bios in-flight */ - while (issued != atomic_read(&bb.done)) - wait_for_completion(&wait); + if (!atomic_dec_and_test(&bb.done)) + wait_for_completion_io(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) /* One of bios in the batch was completed with error.*/ ret = -EIO; - if (ret) - goto out; + return ret; +} - if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { - ret = -EOPNOTSUPP; - goto out; +/** + * blkdev_issue_zeroout - zero-fill a block range + * @bdev: blockdev to write + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Description: + * Generate and issue 
number of bios with zerofiled pages. + */ + +int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) +{ + if (bdev_write_same(bdev)) { + unsigned char bdn[BDEVNAME_SIZE]; + + if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0))) + return 0; + + bdevname(bdev, bdn); + pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); } - if (nr_sects != 0) - goto submit; -out: - return ret; + + return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c index e663ac2d8e6..f890d4345b0 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, rq->biotail->bi_next = bio; rq->biotail = bio; - rq->__data_len += bio->bi_size; + rq->__data_len += bio->bi_iter.bi_size; } return 0; } @@ -76,7 +76,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, ret = blk_rq_append_bio(q, rq, bio); if (!ret) - return bio->bi_size; + return bio->bi_iter.bi_size; /* if it was boucned we must call the end io function */ bio_endio(bio, 0); @@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - rq->buffer = NULL; return 0; unmap_rq: blk_rq_unmap_user(bio); @@ -188,7 +187,7 @@ EXPORT_SYMBOL(blk_rq_map_user); * unmapping. 
*/ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, struct sg_iovec *iov, + struct rq_map_data *map_data, const struct sg_iovec *iov, int iov_count, unsigned int len, gfp_t gfp_mask) { struct bio *bio; @@ -204,10 +203,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (!iov[i].iov_len) return -EINVAL; - if (uaddr & queue_dma_alignment(q)) { + /* + * Keep going so we check length of all segments + */ + if (uaddr & queue_dma_alignment(q)) unaligned = 1; - break; - } } if (unaligned || (q->dma_pad_mask & len) || map_data) @@ -219,7 +219,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (IS_ERR(bio)) return PTR_ERR(bio); - if (bio->bi_size != len) { + if (bio->bi_iter.bi_size != len) { /* * Grab an extra reference to this bio, as bio_unmap_user() * expects to be able to drop it twice as it happens on the @@ -237,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -284,7 +283,7 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * * Description: * Data will be mapped directly if possible. Otherwise a bounce - * buffer is used. Can be called multple times to append multple + * buffer is used. Can be called multiple times to append multiple * buffers. 
*/ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, @@ -310,7 +309,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (rq_data_dir(rq) == WRITE) + if (!reading) bio->bi_rw |= REQ_WRITE; if (do_copy) @@ -324,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, } blk_queue_bounce(q, &rq->bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index ea85e20d5e9..54535831f1e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,38 +12,56 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { - struct bio_vec *bv, *bvprv = NULL; - int cluster, i, high, highprv = 1; + struct bio_vec bv, bvprv = { NULL }; + int cluster, high, highprv = 1, no_sg_merge; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; + struct bvec_iter iter; if (!bio) return 0; + /* + * This should probably be returning 0, but blk_add_request_payload() + * (Christoph!!!!) + */ + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + fbio = bio; cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; + no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); + high = 0; for_each_bio(bio) { - bio_for_each_segment(bv, bio, i) { + bio_for_each_segment(bv, bio, iter) { /* - * the trick here is making sure that a high page is - * never considered part of another segment, since that - * might change with the bounce page. + * If SG merging is disabled, each bio vector is + * a segment */ - high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q); - if (high || highprv) + if (no_sg_merge) goto new_segment; - if (cluster) { - if (seg_size + bv->bv_len + + /* + * the trick here is making sure that a high page is + * never considered part of another segment, since + * that might change with the bounce page. 
+ */ + high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); + if (!high && !highprv && cluster) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) + if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) goto new_segment; - seg_size += bv->bv_len; + seg_size += bv.bv_len; bvprv = bv; continue; } @@ -54,7 +72,7 @@ new_segment: nr_phys_segs++; bvprv = bv; - seg_size = bv->bv_len; + seg_size = bv.bv_len; highprv = high; } bbio = bio; @@ -75,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq) void blk_recount_segments(struct request_queue *q, struct bio *bio) { - struct bio *nxt = bio->bi_next; + if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags)) + bio->bi_phys_segments = bio->bi_vcnt; + else { + struct bio *nxt = bio->bi_next; + + bio->bi_next = NULL; + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); + bio->bi_next = nxt; + } - bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); - bio->bi_next = nxt; bio->bi_flags |= (1 << BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -87,6 +110,9 @@ EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { + struct bio_vec end_bv = { NULL }, nxt_bv; + struct bvec_iter iter; + if (!blk_queue_cluster(q)) return 0; @@ -97,77 +123,122 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) + bio_for_each_segment(end_bv, bio, iter) + if (end_bv.bv_len == iter.bi_size) + break; + + nxt_bv = bio_iovec(nxt); + + if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; /* * bio and nxt are contiguous in memory; check if the queue allows * these two to be merged into one */ - if (BIO_SEG_BOUNDARY(q, bio, nxt)) + if 
(BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) return 1; return 0; } -/* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries - */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +static inline void +__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, + struct scatterlist *sglist, struct bio_vec *bvprv, + struct scatterlist **sg, int *nsegs, int *cluster) { - struct bio_vec *bvec, *bvprv; - struct req_iterator iter; - struct scatterlist *sg; + + int nbytes = bvec->bv_len; + + if (*sg && *cluster) { + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + goto new_segment; + + if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) + goto new_segment; + + (*sg)->length += nbytes; + } else { +new_segment: + if (!*sg) + *sg = sglist; + else { + /* + * If the driver previously mapped a shorter + * list, we could see a termination bit + * prematurely unless it fully inits the sg + * table on each mapping. We KNOW that there + * must be more entries here or the driver + * would be buggy, so force clear the + * termination bit to avoid doing a full + * sg_init_table() in drivers for each command. 
+ */ + sg_unmark_end(*sg); + *sg = sg_next(*sg); + } + + sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); + (*nsegs)++; + } + *bvprv = *bvec; +} + +static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist, + struct scatterlist **sg) +{ + struct bio_vec bvec, bvprv = { NULL }; + struct bvec_iter iter; int nsegs, cluster; nsegs = 0; cluster = blk_queue_cluster(q); - /* - * for each bio in rq - */ - bvprv = NULL; - sg = NULL; - rq_for_each_segment(bvec, rq, iter) { - int nbytes = bvec->bv_len; + if (bio->bi_rw & REQ_DISCARD) { + /* + * This is a hack - drivers should be neither modifying the + * biovec, nor relying on bi_vcnt - but because of + * blk_add_request_payload(), a discard bio may or may not have + * a payload we need to set up here (thank you Christoph) and + * bi_vcnt is really the only way of telling if we need to. + */ - if (bvprv && cluster) { - if (sg->length + nbytes > queue_max_segment_size(q)) - goto new_segment; + if (bio->bi_vcnt) + goto single_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) - goto new_segment; + return 0; + } - sg->length += nbytes; - } else { -new_segment: - if (!sg) - sg = sglist; - else { - /* - * If the driver previously mapped a shorter - * list, we could see a termination bit - * prematurely unless it fully inits the sg - * table on each mapping. We KNOW that there - * must be more entries here or the driver - * would be buggy, so force clear the - * termination bit to avoid doing a full - * sg_init_table() in drivers for each command. 
- */ - sg->page_link &= ~0x02; - sg = sg_next(sg); - } + if (bio->bi_rw & REQ_WRITE_SAME) { +single_segment: + *sg = sglist; + bvec = bio_iovec(bio); + sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); + return 1; + } - sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); - nsegs++; - } - bvprv = bvec; - } /* segments in rq */ + for_each_bio(bio) + bio_for_each_segment(bvec, bio, iter) + __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, + &nsegs, &cluster); + + return nsegs; +} + +/* + * map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries + */ +int blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist) +{ + struct scatterlist *sg = NULL; + int nsegs = 0; + if (rq->bio) + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); if (unlikely(rq->cmd_flags & REQ_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { @@ -199,6 +270,35 @@ new_segment: } EXPORT_SYMBOL(blk_rq_map_sg); +/** + * blk_bio_map_sg - map a bio to a scatterlist + * @q: request_queue in question + * @bio: bio being mapped + * @sglist: scatterlist being mapped + * + * Note: + * Caller must make sure sg can hold bio->bi_phys_segments entries + * + * Will return the number of sg entries setup + */ +int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) +{ + struct scatterlist *sg = NULL; + int nsegs; + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + + nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); + bio->bi_next = next; + if (sg) + sg_mark_end(sg); + + BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); + return nsegs; +} +EXPORT_SYMBOL(blk_bio_map_sg); + static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) @@ -228,14 +328,8 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - unsigned short max_sectors; - - if 
(unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) - max_sectors = queue_max_hw_sectors(q); - else - max_sectors = queue_max_sectors(q); - - if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -252,15 +346,8 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - unsigned short max_sectors; - - if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) - max_sectors = queue_max_hw_sectors(q); - else - max_sectors = queue_max_sectors(q); - - - if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -274,6 +361,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } +/* + * blk-mq uses req->special to carry normal driver per-request payload, it + * does not indicate a prepared command that we cannot merge with. + */ +static bool req_no_special_merge(struct request *req) +{ + struct request_queue *q = req->q; + + return !q->mq_ops && req->special; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -285,13 +383,14 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, * First check if the either of the requests are re-queued * requests. Can't merge them if they are. */ - if (req->special || next->special) + if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; /* * Will it become too large? 
*/ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q)) + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > + blk_rq_get_max_sectors(req)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; @@ -370,16 +469,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; - /* - * Don't merge file system requests and discard requests - */ - if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) - return 0; - - /* - * Don't merge discard requests and secure discard requests - */ - if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) + if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) return 0; /* @@ -390,7 +480,11 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk - || next->special) + || req_no_special_merge(next)) + return 0; + + if (req->cmd_flags & REQ_WRITE_SAME && + !blk_write_same_mergeable(req->bio, next->bio)) return 0; /* @@ -465,3 +559,56 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) return 0; } + +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return attempt_merge(q, rq, next); +} + +bool blk_rq_merge_ok(struct request *rq, struct bio *bio) +{ + struct request_queue *q = rq->q; + + if (!rq_mergeable(rq) || !bio_mergeable(bio)) + return false; + + if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) + return false; + + /* different data direction or already started, don't merge */ + if (bio_data_dir(bio) != rq_data_dir(rq)) + return false; + + /* must be same device and not a special request */ + if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) + return false; + + /* only merge integrity protected bio into ditto rq */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return false; + + /* must be 
using the same buffer */ + if (rq->cmd_flags & REQ_WRITE_SAME && + !blk_write_same_mergeable(rq->bio, bio)) + return false; + + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { + struct bio_vec *bprev; + + bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; + if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) + return false; + } + + return true; +} + +int blk_try_merge(struct request *rq, struct bio *bio) +{ + if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) + return ELEVATOR_BACK_MERGE; + else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) + return ELEVATOR_FRONT_MERGE; + return ELEVATOR_NO_MERGE; +} diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c new file mode 100644 index 00000000000..bb3ed488f7b --- /dev/null +++ b/block/blk-mq-cpu.c @@ -0,0 +1,67 @@ +/* + * CPU notifier helper code for blk-mq + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" + +static LIST_HEAD(blk_mq_cpu_notify_list); +static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock); + +static int blk_mq_main_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long) hcpu; + struct blk_mq_cpu_notifier *notify; + int ret = NOTIFY_OK; + + raw_spin_lock(&blk_mq_cpu_notify_lock); + + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { + ret = notify->notify(notify->data, action, cpu); + if (ret != NOTIFY_OK) + break; + } + + raw_spin_unlock(&blk_mq_cpu_notify_lock); + return ret; +} + +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + BUG_ON(!notifier->notify); + + raw_spin_lock(&blk_mq_cpu_notify_lock); + list_add_tail(&notifier->list, &blk_mq_cpu_notify_list); + raw_spin_unlock(&blk_mq_cpu_notify_lock); +} + +void 
blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + raw_spin_lock(&blk_mq_cpu_notify_lock); + list_del(&notifier->list); + raw_spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + int (*fn)(void *, unsigned long, unsigned int), + void *data) +{ + notifier->notify = fn; + notifier->data = data; +} + +void __init blk_mq_cpu_init(void) +{ + hotcpu_notifier(blk_mq_main_cpu_notify, 0); +} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 00000000000..1065d7c65fa --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,119 @@ +/* + * CPU <-> hardware queue mapping helpers + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" + +static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, + const int cpu) +{ + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_thread_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + + return cpu; +} + +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +{ + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; + cpumask_var_t cpus; + + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) + return 1; + + cpumask_clear(cpus); + nr_cpus = nr_uniq_cpus = 0; + for_each_online_cpu(i) { + nr_cpus++; + first_sibling = get_first_sibling(i); + if (!cpumask_test_cpu(first_sibling, cpus)) + nr_uniq_cpus++; + cpumask_set_cpu(i, cpus); + } + + queue = 0; + for_each_possible_cpu(i) { + if (!cpu_online(i)) { + map[i] = 0; + continue; + } + + /* + * Easy case - we have equal or more hardware queues. Or + * there are no thread siblings to take into account. 
Do + * 1:1 if enough, or sequential mapping if less. + */ + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); + queue++; + continue; + } + + /* + * Less then nr_cpus queues, and we have some number of + * threads per cores. Map sibling threads to the same + * queue. + */ + first_sibling = get_first_sibling(i); + if (first_sibling == i) { + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, + queue); + queue++; + } else + map[i] = map[first_sibling]; + } + + free_cpumask_var(cpus); + return 0; +} + +unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) +{ + unsigned int *map; + + /* If cpus are offline, map them to first hctx */ + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + set->numa_node); + if (!map) + return NULL; + + if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) + return map; + + kfree(map); + return NULL; +} + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. 
+ */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == mq_map[i]) + return cpu_to_node(i); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 00000000000..ed521786755 --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,456 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ +} + +struct blk_mq_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_ctx *, char *); + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(ctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, 
struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->store) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->store(ctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(hctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t length) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->store) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->store(hctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], + ctx->rq_dispatched[0]); +} + +static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu\n", ctx->rq_merged); +} + +static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) +{ + return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], + ctx->rq_completed[0]); +} + +static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) +{ + char *start_page = page; + 
struct request *rq; + + page += sprintf(page, "%s:\n", msg); + + list_for_each_entry(rq, list, queuelist) + page += sprintf(page, "\t%p\n", rq); + + return page - start_page; +} + +static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) +{ + ssize_t ret; + + spin_lock(&ctx->lock); + ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); + spin_unlock(&ctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + return sprintf(page, "%lu\n", hctx->queued); +} + +static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "%lu\n", hctx->run); +} + +static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + char *start_page = page; + int i; + + page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { + unsigned long d = 1U << (i - 1); + + page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + } + + return page - start_page; +} + +static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + ssize_t ret; + + spin_lock(&hctx->lock); + ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); + spin_unlock(&hctx->lock); + + return ret; +} + +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return blk_mq_tag_sysfs_show(hctx->tags, page); +} + +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); +} + +static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + unsigned int i, first = 1; + ssize_t ret = 0; + + blk_mq_disable_hotplug(); + + for_each_cpu(i, hctx->cpumask) { + if (first) + ret += sprintf(ret + page, "%u", i); + else + ret += sprintf(ret + page, ", %u", i); + + first = 0; + } + + blk_mq_enable_hotplug(); + + ret += sprintf(ret + page, 
"\n"); + return ret; +} + +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { + .attr = {.name = "dispatched", .mode = S_IRUGO }, + .show = blk_mq_sysfs_dispatched_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { + .attr = {.name = "merged", .mode = S_IRUGO }, + .show = blk_mq_sysfs_merged_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { + .attr = {.name = "completed", .mode = S_IRUGO }, + .show = blk_mq_sysfs_completed_show, +}; +static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { + .attr = {.name = "rq_list", .mode = S_IRUGO }, + .show = blk_mq_sysfs_rq_list_show, +}; + +static struct attribute *default_ctx_attrs[] = { + &blk_mq_sysfs_dispatched.attr, + &blk_mq_sysfs_merged.attr, + &blk_mq_sysfs_completed.attr, + &blk_mq_sysfs_rq_list.attr, + NULL, +}; + +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { + .attr = {.name = "queued", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_queued_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { + .attr = {.name = "run", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_run_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { + .attr = {.name = "dispatched", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_dispatched_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { + .attr = {.name = "active", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_active_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { + .attr = {.name = "pending", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_rq_list_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { + .attr = {.name = "tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_tags_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { + .attr = {.name = "cpu_list", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_cpus_show, +}; + +static struct attribute 
*default_hw_ctx_attrs[] = { + &blk_mq_hw_sysfs_queued.attr, + &blk_mq_hw_sysfs_run.attr, + &blk_mq_hw_sysfs_dispatched.attr, + &blk_mq_hw_sysfs_pending.attr, + &blk_mq_hw_sysfs_tags.attr, + &blk_mq_hw_sysfs_cpus.attr, + &blk_mq_hw_sysfs_active.attr, + NULL, +}; + +static const struct sysfs_ops blk_mq_sysfs_ops = { + .show = blk_mq_sysfs_show, + .store = blk_mq_sysfs_store, +}; + +static const struct sysfs_ops blk_mq_hw_sysfs_ops = { + .show = blk_mq_hw_sysfs_show, + .store = blk_mq_hw_sysfs_store, +}; + +static struct kobj_type blk_mq_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_ctx_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .default_attrs = default_ctx_attrs, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_hw_ktype = { + .sysfs_ops = &blk_mq_hw_sysfs_ops, + .default_attrs = default_hw_ctx_attrs, + .release = blk_mq_sysfs_release, +}; + +static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + int i; + + if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + return; + + hctx_for_each_ctx(hctx, ctx, i) + kobject_del(&ctx->kobj); + + kobject_del(&hctx->kobj); +} + +static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + int i, ret; + + if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + return 0; + + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); + if (ret) + return ret; + + hctx_for_each_ctx(hctx, ctx, i) { + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); + if (ret) + break; + } + + return ret; +} + +void blk_mq_unregister_disk(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + blk_mq_unregister_hctx(hctx); + + hctx_for_each_ctx(hctx, ctx, j) + kobject_put(&ctx->kobj); + + 
kobject_put(&hctx->kobj); + } + + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); + kobject_del(&q->mq_kobj); + kobject_put(&q->mq_kobj); + + kobject_put(&disk_to_dev(disk)->kobj); +} + +static void blk_mq_sysfs_init(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + kobject_init(&q->mq_kobj, &blk_mq_ktype); + + queue_for_each_hw_ctx(q, hctx, i) { + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); + + hctx_for_each_ctx(hctx, ctx, j) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + } +} + +int blk_mq_register_disk(struct gendisk *disk) +{ + struct device *dev = disk_to_dev(disk); + struct request_queue *q = disk->queue; + struct blk_mq_hw_ctx *hctx; + int ret, i; + + blk_mq_sysfs_init(q); + + ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + if (ret < 0) + return ret; + + kobject_uevent(&q->mq_kobj, KOBJ_ADD); + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->flags |= BLK_MQ_F_SYSFS_UP; + ret = blk_mq_register_hctx(hctx); + if (ret) + break; + } + + if (ret) { + blk_mq_unregister_disk(disk); + return ret; + } + + return 0; +} + +void blk_mq_sysfs_unregister(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); +} + +int blk_mq_sysfs_register(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i, ret = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_register_hctx(hctx); + if (ret) + break; + } + + return ret; +} diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c new file mode 100644 index 00000000000..c1b92426c95 --- /dev/null +++ b/block/blk-mq-tag.c @@ -0,0 +1,618 @@ +/* + * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread + * over multiple cachelines to avoid ping-pong between multiple submitters + * or submitter and completer. Uses rolling wakeups to avoid falling of + * the scaling cliff when we run out of tags and have to start putting + * submitters to sleep. 
+ * + * Uses active queue tracking to support fairer distribution of tags + * between multiple submitters when a shared tag map is used. + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/random.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) +{ + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int ret; + + ret = find_first_zero_bit(&bm->word, bm->depth); + if (ret < bm->depth) + return true; + } + + return false; +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ + if (!tags) + return true; + + return bt_has_free_tags(&tags->bitmap_tags); +} + +static inline int bt_index_inc(int index) +{ + return (index + 1) & (BT_WAIT_QUEUES - 1); +} + +static inline void bt_index_atomic_inc(atomic_t *index) +{ + int old = atomic_read(index); + int new = bt_index_inc(old); + atomic_cmpxchg(index, old, new); +} + +/* + * If a previously inactive queue goes active, bump the active user count. + */ +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); + + return true; +} + +/* + * Wakeup all potentially sleeping on normal (non-reserved) tags + */ +static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) +{ + struct blk_mq_bitmap_tags *bt; + int i, wake_index; + + bt = &tags->bitmap_tags; + wake_index = atomic_read(&bt->wake_index); + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) + wake_up(&bs->wait); + + wake_index = bt_index_inc(wake_index); + } +} + +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. 
+ */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + blk_mq_tag_wakeup_all(tags); +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; +} + +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) +{ + int tag, org_last_tag, end; + + org_last_tag = last_tag; + end = bm->depth; + do { +restart: + tag = find_next_zero_bit(&bm->word, end, last_tag); + if (unlikely(tag >= end)) { + /* + * We started with an offset, start from 0 to + * exhaust the map. + */ + if (org_last_tag && last_tag) { + end = last_tag; + last_tag = 0; + goto restart; + } + return -1; + } + last_tag = tag + 1; + } while (test_and_set_bit_lock(tag, &bm->word)); + + return tag; +} + +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. 
On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, + unsigned int *tag_cache) +{ + unsigned int last_tag, org_last_tag; + int index, i, tag; + + if (!hctx_may_queue(hctx, bt)) + return -1; + + last_tag = org_last_tag = *tag_cache; + index = TAG_TO_INDEX(bt, last_tag); + + for (i = 0; i < bt->map_nr; i++) { + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + if (tag != -1) { + tag += (index << bt->bits_per_word); + goto done; + } + + last_tag = 0; + if (++index >= bt->map_nr) + index = 0; + } + + *tag_cache = 0; + return -1; + + /* + * Only update the cache from the allocation path, if we ended + * up using the specific cached tag. + */ +done: + if (tag == org_last_tag) { + last_tag = tag + 1; + if (last_tag >= bt->depth - 1) + last_tag = 0; + + *tag_cache = last_tag; + } + + return tag; +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx) +{ + struct bt_wait_state *bs; + int wait_index; + + if (!hctx) + return &bt->bs[0]; + + wait_index = atomic_read(&hctx->wait_index); + bs = &bt->bs[wait_index]; + bt_index_atomic_inc(&hctx->wait_index); + return bs; +} + +static int bt_get(struct blk_mq_alloc_data *data, + struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag) +{ + struct bt_wait_state *bs; + DEFINE_WAIT(wait); + int tag; + + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + return tag; + + if (!(data->gfp & __GFP_WAIT)) + return -1; + + bs = bt_wait_ptr(bt, hctx); + do { + prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + + blk_mq_put_ctx(data->ctx); + + io_schedule(); + + data->ctx = blk_mq_get_ctx(data->q); + data->hctx = data->q->mq_ops->map_queue(data->q, + 
data->ctx->cpu); + if (data->reserved) { + bt = &data->hctx->tags->breserved_tags; + } else { + last_tag = &data->ctx->last_tag; + hctx = data->hctx; + bt = &hctx->tags->bitmap_tags; + } + finish_wait(&bs->wait, &wait); + bs = bt_wait_ptr(bt, hctx); + } while (1); + + finish_wait(&bs->wait, &wait); + return tag; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) +{ + int tag; + + tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, + &data->ctx->last_tag); + if (tag >= 0) + return tag + data->hctx->tags->nr_reserved_tags; + + return BLK_MQ_TAG_FAIL; +} + +static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) +{ + int tag, zero = 0; + + if (unlikely(!data->hctx->tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero); + if (tag < 0) + return BLK_MQ_TAG_FAIL; + + return tag; +} + +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) +{ + if (!data->reserved) + return __blk_mq_get_tag(data); + + return __blk_mq_get_reserved_tag(data); +} + +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +{ + int i, wake_index; + + wake_index = atomic_read(&bt->wake_index); + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) { + int o = atomic_read(&bt->wake_index); + if (wake_index != o) + atomic_cmpxchg(&bt->wake_index, o, wake_index); + + return bs; + } + + wake_index = bt_index_inc(wake_index); + } + + return NULL; +} + +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +{ + const int index = TAG_TO_INDEX(bt, tag); + struct bt_wait_state *bs; + int wait_cnt; + + /* + * The unlock memory barrier need to order access to req in free + * path and clearing tag bit + */ + clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + bs = bt_wake_ptr(bt); + if (!bs) + return; + + wait_cnt = 
atomic_dec_return(&bs->wait_cnt); + if (wait_cnt == 0) { +wake: + atomic_add(bt->wake_cnt, &bs->wait_cnt); + bt_index_atomic_inc(&bt->wake_index); + wake_up(&bs->wait); + } else if (wait_cnt < 0) { + wait_cnt = atomic_inc_return(&bs->wait_cnt); + if (!wait_cnt) + goto wake; + } +} + +static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +{ + BUG_ON(tag >= tags->nr_tags); + + bt_clear_tag(&tags->bitmap_tags, tag); +} + +static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, + unsigned int tag) +{ + BUG_ON(tag >= tags->nr_reserved_tags); + + bt_clear_tag(&tags->breserved_tags, tag); +} + +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, + unsigned int *last_tag) +{ + struct blk_mq_tags *tags = hctx->tags; + + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; + + __blk_mq_put_tag(tags, real_tag); + *last_tag = real_tag; + } else + __blk_mq_put_reserved_tag(tags, tag); +} + +static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, + unsigned long *free_map, unsigned int off) +{ + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int bit = 0; + + do { + bit = find_next_zero_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + __set_bit(bit + off, free_map); + bit++; + } while (1); + + off += (1 << bt->bits_per_word); + } +} + +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, + void (*fn)(void *, unsigned long *), void *data) +{ + unsigned long *tag_map; + size_t map_size; + + map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; + tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); + if (!tag_map) + return; + + bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); + if (tags->nr_reserved_tags) + bt_for_each_free(&tags->breserved_tags, tag_map, 0); + + fn(data, tag_map); + kfree(tag_map); +} +EXPORT_SYMBOL(blk_mq_tag_busy_iter); + +static unsigned int bt_unused_tags(struct 
blk_mq_bitmap_tags *bt) +{ + unsigned int i, used; + + for (i = 0, used = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + used += bitmap_weight(&bm->word, bm->depth); + } + + return bt->depth - used; +} + +static void bt_update_count(struct blk_mq_bitmap_tags *bt, + unsigned int depth) +{ + unsigned int tags_per_word = 1U << bt->bits_per_word; + unsigned int map_depth = depth; + + if (depth) { + int i; + + for (i = 0; i < bt->map_nr; i++) { + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= bt->map[i].depth; + } + } + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / 4) + bt->wake_cnt = max(1U, depth / 4); + + bt->depth = depth; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, + int node, bool reserved) +{ + int i; + + bt->bits_per_word = ilog2(BITS_PER_LONG); + + /* + * Depth can be zero for reserved tags, that's not a failure + * condition. + */ + if (depth) { + unsigned int nr, tags_per_word; + + tags_per_word = (1 << bt->bits_per_word); + + /* + * If the tag space is small, shrink the number of tags + * per word so we spread over a few cachelines, at least. + * If less than 4 tags, just forget about it, it's not + * going to work optimally anyway. 
+ */ + if (depth >= 4) { + while (tags_per_word * 4 > depth) { + bt->bits_per_word--; + tags_per_word = (1 << bt->bits_per_word); + } + } + + nr = ALIGN(depth, tags_per_word) / tags_per_word; + bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bt->map) + return -ENOMEM; + + bt->map_nr = nr; + } + + bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); + if (!bt->bs) { + kfree(bt->map); + return -ENOMEM; + } + + bt_update_count(bt, depth); + + for (i = 0; i < BT_WAIT_QUEUES; i++) { + init_waitqueue_head(&bt->bs[i].wait); + atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); + } + + return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ + kfree(bt->map); + kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) + goto enomem; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) + goto enomem; + + return tags; +enomem: + bt_free(&tags->bitmap_tags); + kfree(tags); + return NULL; +} + +struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, + unsigned int reserved_tags, int node) +{ + struct blk_mq_tags *tags; + + if (total_tags > BLK_MQ_TAG_MAX) { + pr_err("blk-mq: tag depth too large\n"); + return NULL; + } + + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); + if (!tags) + return NULL; + + tags->nr_tags = total_tags; + tags->nr_reserved_tags = reserved_tags; + + return blk_mq_init_bitmap_tags(tags, node); +} + +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ + bt_free(&tags->bitmap_tags); + bt_free(&tags->breserved_tags); + kfree(tags); +} + +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + *tag = prandom_u32() % depth; +} + +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) +{ + 
tdepth -= tags->nr_reserved_tags; + if (tdepth > tags->nr_tags) + return -EINVAL; + + /* + * Don't need (or can't) update reserved tags here, they remain + * static and should never need resizing. + */ + bt_update_count(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags); + return 0; +} + +ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) +{ + char *orig_page = page; + unsigned int free, res; + + if (!tags) + return 0; + + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " + "bits_per_word=%u\n", + tags->nr_tags, tags->nr_reserved_tags, + tags->bitmap_tags.bits_per_word); + + free = bt_unused_tags(&tags->bitmap_tags); + res = bt_unused_tags(&tags->breserved_tags); + + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); + + return page - orig_page; +} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h new file mode 100644 index 00000000000..6206ed17ef7 --- /dev/null +++ b/block/blk-mq-tag.h @@ -0,0 +1,88 @@ +#ifndef INT_BLK_MQ_TAG_H +#define INT_BLK_MQ_TAG_H + +#include "blk-mq.h" + +enum { + BT_WAIT_QUEUES = 8, + BT_WAIT_BATCH = 8, +}; + +struct bt_wait_state { + atomic_t wait_cnt; + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) + +struct blk_mq_bitmap_tags { + unsigned int depth; + unsigned int wake_cnt; + unsigned int bits_per_word; + + unsigned int map_nr; + struct blk_align_bitmap *map; + + atomic_t wake_index; + struct bt_wait_state *bs; +}; + +/* + * Tag address space map. 
+ */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct blk_mq_bitmap_tags bitmap_tags; + struct blk_mq_bitmap_tags breserved_tags; + + struct request **rqs; + struct list_head page_list; +}; + + +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); + +extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); +extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); + +enum { + BLK_MQ_TAG_CACHE_MIN = 1, + BLK_MQ_TAG_CACHE_MAX = 64, +}; + +enum { + BLK_MQ_TAG_FAIL = -1U, + BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, +}; + +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + +#endif diff --git a/block/blk-mq.c b/block/blk-mq.c new file mode 100644 index 00000000000..ad69ef657e8 --- /dev/null +++ b/block/blk-mq.c @@ -0,0 +1,2058 @@ +/* + * Block multiqueue core code + * + * Copyright (C) 2013-2014 Jens Axboe + * Copyright (C) 2013-2014 Christoph Hellwig + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include 
<linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> +#include <linux/llist.h> +#include <linux/list_sort.h> +#include <linux/cpu.h> +#include <linux/cache.h> +#include <linux/sched/sysctl.h> +#include <linux/delay.h> + +#include <trace/events/block.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static DEFINE_MUTEX(all_q_mutex); +static LIST_HEAD(all_q_list); + +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); + +/* + * Check if any of the ctx's have pending work in this hardware queue + */ +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + for (i = 0; i < hctx->ctx_map.map_size; i++) + if (hctx->ctx_map.map[i].word) + return true; + + return false; +} + +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx) \ + ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + +/* + * Mark this ctx as having pending work in this hardware queue + */ +static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) + set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); +} + +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); +} + +static int blk_mq_queue_enter(struct request_queue *q) +{ + int ret; + + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); + smp_wmb(); + + /* we have problems freezing the queue if it's initializing */ + if (!blk_queue_dying(q) && + (!blk_queue_bypass(q) || !blk_queue_init_done(q))) + return 0; + + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); + + 
spin_lock_irq(q->queue_lock); + ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, + !blk_queue_bypass(q) || blk_queue_dying(q), + *q->queue_lock); + /* inc usage with lock hold to avoid freeze_queue runs here */ + if (!ret && !blk_queue_dying(q)) + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); + else if (blk_queue_dying(q)) + ret = -ENODEV; + spin_unlock_irq(q->queue_lock); + + return ret; +} + +static void blk_mq_queue_exit(struct request_queue *q) +{ + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); +} + +void blk_mq_drain_queue(struct request_queue *q) +{ + while (true) { + s64 count; + + spin_lock_irq(q->queue_lock); + count = percpu_counter_sum(&q->mq_usage_counter); + spin_unlock_irq(q->queue_lock); + + if (count == 0) + break; + blk_mq_start_hw_queues(q); + msleep(10); + } +} + +/* + * Guarantee no request is in use, so we can change any data structure of + * the queue afterward. + */ +static void blk_mq_freeze_queue(struct request_queue *q) +{ + bool drain; + + spin_lock_irq(q->queue_lock); + drain = !q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + spin_unlock_irq(q->queue_lock); + + if (drain) + blk_mq_drain_queue(q); +} + +static void blk_mq_unfreeze_queue(struct request_queue *q) +{ + bool wake = false; + + spin_lock_irq(q->queue_lock); + if (!--q->bypass_depth) { + queue_flag_clear(QUEUE_FLAG_BYPASS, q); + wake = true; + } + WARN_ON_ONCE(q->bypass_depth < 0); + spin_unlock_irq(q->queue_lock); + if (wake) + wake_up_all(&q->mq_freeze_wq); +} + +bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) +{ + return blk_mq_has_free_tags(hctx->tags); +} +EXPORT_SYMBOL(blk_mq_can_queue); + +static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int rw_flags) +{ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + + INIT_LIST_HEAD(&rq->queuelist); + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = q; + rq->mq_ctx = ctx; + rq->cmd_flags |= rw_flags; + 
/* do not touch atomic flags, it needs atomic ops against the timer */ + rq->cpu = -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->rq_disk = NULL; + rq->part = NULL; + rq->start_time = jiffies; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; + set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + rq->special = NULL; + /* tag was already set */ + rq->errors = 0; + + rq->extra_len = 0; + rq->sense_len = 0; + rq->resid_len = 0; + rq->sense = NULL; + + INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->next_rq = NULL; + + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; +} + +static struct request * +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) +{ + struct request *rq; + unsigned int tag; + + tag = blk_mq_get_tag(data); + if (tag != BLK_MQ_TAG_FAIL) { + rq = data->hctx->tags->rqs[tag]; + + rq->cmd_flags = 0; + if (blk_mq_tag_busy(data->hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + + rq->tag = tag; + blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); + return rq; + } + + return NULL; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, + bool reserved) +{ + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; + struct request *rq; + struct blk_mq_alloc_data alloc_data; + + if (blk_mq_queue_enter(q)) + return NULL; + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + reserved, ctx, hctx); + + rq = __blk_mq_alloc_request(&alloc_data, rw); + if (!rq && (gfp & __GFP_WAIT)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, + hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw); + ctx = 
alloc_data.ctx; + } + blk_mq_put_ctx(ctx); + return rq; +} +EXPORT_SYMBOL(blk_mq_alloc_request); + +static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, struct request *rq) +{ + const int tag = rq->tag; + struct request_queue *q = rq->q; + + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); + + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_mq_queue_exit(q); +} + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + ctx->rq_completed[rq_is_sync(rq)]++; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + __blk_mq_free_request(hctx, ctx, rq); +} + +/* + * Clone all relevant state from a request that has been put on hold in + * the flush state machine into the preallocated flush request that hangs + * off the request queue. + * + * For a driver the flush request should be invisible, that's why we are + * impersonating the original request here. 
+ */ +void blk_mq_clone_flush_request(struct request *flush_rq, + struct request *orig_rq) +{ + struct blk_mq_hw_ctx *hctx = + orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); + + flush_rq->mq_ctx = orig_rq->mq_ctx; + flush_rq->tag = orig_rq->tag; + memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), + hctx->cmd_size); +} + +inline void __blk_mq_end_io(struct request *rq, int error) +{ + blk_account_io_done(rq); + + if (rq->end_io) { + rq->end_io(rq, error); + } else { + if (unlikely(blk_bidi_rq(rq))) + blk_mq_free_request(rq->next_rq); + blk_mq_free_request(rq); + } +} +EXPORT_SYMBOL(__blk_mq_end_io); + +void blk_mq_end_io(struct request *rq, int error) +{ + if (blk_update_request(rq, error, blk_rq_bytes(rq))) + BUG(); + __blk_mq_end_io(rq, error); +} +EXPORT_SYMBOL(blk_mq_end_io); + +static void __blk_mq_complete_request_remote(void *data) +{ + struct request *rq = data; + + rq->q->softirq_done_fn(rq); +} + +static void blk_mq_ipi_complete_request(struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + bool shared = false; + int cpu; + + if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { + rq->q->softirq_done_fn(rq); + return; + } + + cpu = get_cpu(); + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + shared = cpus_share_cache(cpu, ctx->cpu); + + if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { + rq->csd.func = __blk_mq_complete_request_remote; + rq->csd.info = rq; + rq->csd.flags = 0; + smp_call_function_single_async(ctx->cpu, &rq->csd); + } else { + rq->q->softirq_done_fn(rq); + } + put_cpu(); +} + +void __blk_mq_complete_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (!q->softirq_done_fn) + blk_mq_end_io(rq, rq->errors); + else + blk_mq_ipi_complete_request(rq); +} + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions. 
+ * The actual completion happens out-of-order, through a IPI handler. + **/ +void blk_mq_complete_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (unlikely(blk_should_fake_timeout(q))) + return; + if (!blk_mark_rq_complete(rq)) + __blk_mq_complete_request(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); + +static void blk_mq_start_request(struct request *rq, bool last) +{ + struct request_queue *q = rq->q; + + trace_block_rq_issue(q, rq); + + rq->resid_len = blk_rq_bytes(rq); + if (unlikely(blk_bidi_rq(rq))) + rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + + blk_add_timer(rq); + + /* + * Mark us as started and clear complete. Complete might have been + * set if requeue raced with timeout, which then marked it as + * complete. So be sure to clear complete again when we start + * the request, otherwise we'll ignore the completion event. + */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + + if (q->dma_drain_size && blk_rq_bytes(rq)) { + /* + * Make sure space for the drain appears. We know we can do + * this because max_hw_segments has been adjusted to be one + * fewer than the device can handle. + */ + rq->nr_phys_segments++; + } + + /* + * Flag the last request in the series so that drivers know when IO + * should be kicked off, if they don't do it on a per-request basis. + * + * Note: the flag isn't the only condition drivers should do kick off. + * If drive is busy, the last request might not have the bit set. 
+ */ + if (last) + rq->cmd_flags |= REQ_END; +} + +static void __blk_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_requeue(q, rq); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + + rq->cmd_flags &= ~REQ_END; + + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; +} + +void blk_mq_requeue_request(struct request *rq) +{ + __blk_mq_requeue_request(rq); + blk_clear_rq_complete(rq); + + BUG_ON(blk_queued_rq(rq)); + blk_mq_add_to_requeue_list(rq, true); +} +EXPORT_SYMBOL(blk_mq_requeue_request); + +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work); + LIST_HEAD(rq_list); + struct request *rq, *next; + unsigned long flags; + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + continue; + + rq->cmd_flags &= ~REQ_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, false, false, false); + } + + blk_mq_run_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertation from the workqueue. 
+ */ + BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->cmd_flags |= REQ_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + +static inline bool is_flush_request(struct request *rq, unsigned int tag) +{ + return ((rq->cmd_flags & REQ_FLUSH_SEQ) && + rq->q->flush_rq->tag == tag); +} + +struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) +{ + struct request *rq = tags->rqs[tag]; + + if (!is_flush_request(rq, tag)) + return rq; + + return rq->q->flush_rq; +} +EXPORT_SYMBOL(blk_mq_tag_to_rq); + +struct blk_mq_timeout_data { + struct blk_mq_hw_ctx *hctx; + unsigned long *next; + unsigned int *next_set; +}; + +static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) +{ + struct blk_mq_timeout_data *data = __data; + struct blk_mq_hw_ctx *hctx = data->hctx; + unsigned int tag; + + /* It may not be in flight yet (this is where + * the REQ_ATOMIC_STARTED flag comes in). The requests are + * statically allocated, so we know it's always safe to access the + * memory associated with a bit offset into ->rqs[]. 
+ */ + tag = 0; + do { + struct request *rq; + + tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); + if (tag >= hctx->tags->nr_tags) + break; + + rq = blk_mq_tag_to_rq(hctx->tags, tag++); + if (rq->q != hctx->queue) + continue; + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + continue; + + blk_rq_check_expired(rq, data->next, data->next_set); + } while (1); +} + +static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, + unsigned long *next, + unsigned int *next_set) +{ + struct blk_mq_timeout_data data = { + .hctx = hctx, + .next = next, + .next_set = next_set, + }; + + /* + * Ask the tagging code to iterate busy requests, so we can + * check them for timeout. + */ + blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); +} + +static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +{ + struct request_queue *q = rq->q; + + /* + * We know that complete is set at this point. If STARTED isn't set + * anymore, then the request isn't active and the "timeout" should + * just be ignored. This can happen due to the bitflag ordering. + * Timeout first checks if STARTED is set, and if it is, assumes + * the request is active. But if we race with completion, then + * we both flags will get cleared. So check here again, and ignore + * a timeout event with a request that isn't active. 
+ */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return BLK_EH_NOT_HANDLED; + + if (!q->mq_ops->timeout) + return BLK_EH_RESET_TIMER; + + return q->mq_ops->timeout(rq); +} + +static void blk_mq_rq_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *) data; + struct blk_mq_hw_ctx *hctx; + unsigned long next = 0; + int i, next_set = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!hctx->nr_ctx || !hctx->tags) + continue; + + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + } + + if (next_set) { + next = blk_rq_timeout(round_jiffies_up(next)); + mod_timer(&q->timeout, next); + } else { + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_tag_idle(hctx); + } +} + +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. 
+ */ +static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_ctx *ctx, struct bio *bio) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { + int el_ret; + + if (!checked--) + break; + + if (!blk_rq_merge_ok(rq, bio)) + continue; + + el_ret = blk_try_merge(rq, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + if (bio_attempt_back_merge(q, rq, bio)) { + ctx->rq_merged++; + return true; + } + break; + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + if (bio_attempt_front_merge(q, rq, bio)) { + ctx->rq_merged++; + return true; + } + break; + } + } + + return false; +} + +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +{ + struct blk_mq_ctx *ctx; + int i; + + for (i = 0; i < hctx->ctx_map.map_size; i++) { + struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; + unsigned int off, bit; + + if (!bm->word) + continue; + + bit = 0; + off = i * hctx->ctx_map.bits_per_word; + do { + bit = find_next_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + ctx = hctx->ctxs[bit + off]; + clear_bit(bit, &bm->word); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, list); + spin_unlock(&ctx->lock); + + bit++; + } while (1); + } +} + +/* + * Run this hardware queue, pulling any software queues mapped to it in. + * Note that this function currently has various problems around ordering + * of IO. In particular, we'd like FIFO behaviour on handling existing + * items on the hctx->dispatch list. Ignore that for now. 
+ */ +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct request *rq; + LIST_HEAD(rq_list); + int queued; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); + + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + return; + + hctx->run++; + + /* + * Touch any software queue that has pending entries. + */ + flush_busy_ctxs(hctx, &rq_list); + + /* + * If we have previous entries on our dispatch list, grab them + * and stuff them at the front for more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Now process all the entries, sending them to the driver. + */ + queued = 0; + while (!list_empty(&rq_list)) { + int ret; + + rq = list_first_entry(&rq_list, struct request, queuelist); + list_del_init(&rq->queuelist); + + blk_mq_start_request(rq, list_empty(&rq_list)); + + ret = q->mq_ops->queue_rq(hctx, rq); + switch (ret) { + case BLK_MQ_RQ_QUEUE_OK: + queued++; + continue; + case BLK_MQ_RQ_QUEUE_BUSY: + list_add(&rq->queuelist, &rq_list); + __blk_mq_requeue_request(rq); + break; + default: + pr_err("blk-mq: bad return on queue: %d\n", ret); + case BLK_MQ_RQ_QUEUE_ERROR: + rq->errors = -EIO; + blk_mq_end_io(rq, rq->errors); + break; + } + + if (ret == BLK_MQ_RQ_QUEUE_BUSY) + break; + } + + if (!queued) + hctx->dispatched[0]++; + else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) + hctx->dispatched[ilog2(queued) + 1]++; + + /* + * Any items that need requeuing? Stuff them into hctx->dispatch, + * that is where we will continue on next queue run. + */ + if (!list_empty(&rq_list)) { + spin_lock(&hctx->lock); + list_splice(&rq_list, &hctx->dispatch); + spin_unlock(&hctx->lock); + } +} + +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. 
+ * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = hctx->next_cpu; + + if (--hctx->next_cpu_batch <= 0) { + int next_cpu; + + next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(hctx->cpumask); + + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + + return cpu; +} + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + return; + + if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) + __blk_mq_run_hw_queue(hctx); + else if (hctx->queue->nr_hw_queues == 1) + kblockd_schedule_delayed_work(&hctx->run_work, 0); + else { + unsigned int cpu; + + cpu = blk_mq_hctx_next_cpu(hctx); + kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); + } +} + +void blk_mq_run_queues(struct request_queue *q, bool async) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if ((!blk_mq_hctx_has_pending(hctx) && + list_empty_careful(&hctx->dispatch)) || + test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + continue; + + preempt_disable(); + blk_mq_run_hw_queue(hctx, async); + preempt_enable(); + } +} +EXPORT_SYMBOL(blk_mq_run_queues); + +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + cancel_delayed_work(&hctx->run_work); + cancel_delayed_work(&hctx->delay_work); + set_bit(BLK_MQ_S_STOPPED, &hctx->state); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queue); + +void blk_mq_stop_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_stop_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queues); + +void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + + preempt_disable(); + blk_mq_run_hw_queue(hctx, false); + preempt_enable(); +} 
+EXPORT_SYMBOL(blk_mq_start_hw_queue); + +void blk_mq_start_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queues); + + +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + continue; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + preempt_disable(); + blk_mq_run_hw_queue(hctx, async); + preempt_enable(); + } +} +EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); + +static void blk_mq_run_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + + __blk_mq_run_hw_queue(hctx); +} + +static void blk_mq_delay_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); + + if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) + __blk_mq_run_hw_queue(hctx); +} + +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + unsigned long tmo = msecs_to_jiffies(msecs); + + if (hctx->queue->nr_hw_queues == 1) + kblockd_schedule_delayed_work(&hctx->delay_work, tmo); + else { + unsigned int cpu; + + cpu = blk_mq_hctx_next_cpu(hctx); + kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); + } +} +EXPORT_SYMBOL(blk_mq_delay_queue); + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + + trace_block_rq_insert(hctx->queue, rq); + + if (at_head) + list_add(&rq->queuelist, &ctx->rq_list); + else + list_add_tail(&rq->queuelist, &ctx->rq_list); + + blk_mq_hctx_mark_pending(hctx, ctx); +} + +void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, + bool async) +{ + struct request_queue *q = rq->q; + struct 
blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; + + current_ctx = blk_mq_get_ctx(q); + if (!cpu_online(ctx->cpu)) + rq->mq_ctx = ctx = current_ctx; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) && + !(rq->cmd_flags & (REQ_FLUSH_SEQ))) { + blk_insert_flush(rq); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + + if (run_queue) + blk_mq_run_hw_queue(hctx, async); + + blk_mq_put_ctx(current_ctx); +} + +static void blk_mq_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, + int depth, + bool from_schedule) + +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *current_ctx; + + trace_block_unplug(q, depth, !from_schedule); + + current_ctx = blk_mq_get_ctx(q); + + if (!cpu_online(ctx->cpu)) + ctx = current_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * preemption doesn't flush plug list, so it's possible ctx->cpu is + * offline now + */ + spin_lock(&ctx->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + rq->mq_ctx = ctx; + __blk_mq_insert_request(hctx, rq, false); + } + spin_unlock(&ctx->lock); + + blk_mq_run_hw_queue(hctx, from_schedule); + blk_mq_put_ctx(current_ctx); +} + +static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return !(rqa->mq_ctx < rqb->mq_ctx || + (rqa->mq_ctx == rqb->mq_ctx && + blk_rq_pos(rqa) < blk_rq_pos(rqb))); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_ctx *this_ctx; + struct request_queue *this_q; + struct request *rq; + LIST_HEAD(list); + LIST_HEAD(ctx_list); + unsigned int depth; + + list_splice_init(&plug->mq_list, &list); + + 
list_sort(NULL, &list, plug_ctx_cmp); + + this_q = NULL; + this_ctx = NULL; + depth = 0; + + while (!list_empty(&list)) { + rq = list_entry_rq(list.next); + list_del_init(&rq->queuelist); + BUG_ON(!rq->q); + if (rq->mq_ctx != this_ctx) { + if (this_ctx) { + blk_mq_insert_requests(this_q, this_ctx, + &ctx_list, depth, + from_schedule); + } + + this_ctx = rq->mq_ctx; + this_q = rq->q; + depth = 0; + } + + depth++; + list_add_tail(&rq->queuelist, &ctx_list); + } + + /* + * If 'this_ctx' is set, we know we have entries to complete + * on 'ctx_list'. Do those. + */ + if (this_ctx) { + blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, + from_schedule); + } +} + +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) +{ + init_request_from_bio(rq, bio); + + if (blk_do_io_stat(rq)) + blk_account_io_start(rq, 1); +} + +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + return false; + } else { + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + blk_mq_bio_to_request(rq, bio); + goto insert_rq; + } + + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); + return true; + } +} + +struct blk_map_ctx { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; +}; + +static struct request *blk_mq_map_request(struct request_queue *q, + struct bio *bio, + struct blk_map_ctx *data) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + int rw = bio_data_dir(bio); + struct blk_mq_alloc_data alloc_data; + + if (unlikely(blk_mq_queue_enter(q))) { + bio_endio(bio, -EIO); + return NULL; + } + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + if 
(rw_is_sync(bio->bi_rw)) + rw |= REQ_SYNC; + + trace_block_getrq(q, bio, rw); + blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, + hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw); + if (unlikely(!rq)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + trace_block_sleeprq(q, bio, rw); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_set_alloc_data(&alloc_data, q, + __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw); + ctx = alloc_data.ctx; + hctx = alloc_data.hctx; + } + + hctx->queued++; + data->hctx = hctx; + data->ctx = ctx; + return rq; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. + */ +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + struct blk_map_ctx data; + struct request *rq; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } + + if (is_sync) { + int ret; + + blk_mq_bio_to_request(rq, bio); + blk_mq_start_request(rq, true); + + /* + * For OK queue, we are done. For error, kill it. 
Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(data.hctx, rq); + if (ret == BLK_MQ_RQ_QUEUE_OK) + goto done; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_io(rq, rq->errors); + goto done; + } + } + } + + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } +done: + blk_mq_put_ctx(data.ctx); +} + +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + unsigned int use_plug, request_count = 0; + struct blk_map_ctx data; + struct request *rq; + + /* + * If we have multiple hardware queues, just go directly to + * one of those for sync IO. + */ + use_plug = !is_flush_fua && !is_sync; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + + if (use_plug && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } + + /* + * A task plug currently exists. Since this is completely lockless, + * utilize that to temporarily store requests until the task is + * either done or scheduled away. 
+	 */
+	if (use_plug) {
+		struct blk_plug *plug = current->plug;
+
+		if (plug) {
+			blk_mq_bio_to_request(rq, bio);
+			if (list_empty(&plug->mq_list))
+				trace_block_plug(q);
+			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+				blk_flush_plug_list(plug, false);
+				trace_block_plug(q);
+			}
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+			blk_mq_put_ctx(data.ctx);
+			return;
+		}
+	}
+
+	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+		/*
+		 * For a SYNC request, send it to the hardware immediately. For
+		 * an ASYNC request, just ensure that we run it later on. The
+		 * latter allows for merging opportunities and more efficient
+		 * dispatching.
+		 */
+run_queue:
+		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
+	}
+
+	blk_mq_put_ctx(data.ctx);
+}
+
+/*
+ * Default CPU -> hardware queue mapping: return the hardware context that
+ * q->mq_map assigns to @cpu.
+ */
+struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
+{
+	return q->queue_hw_ctx[q->mq_map[cpu]];
+}
+EXPORT_SYMBOL(blk_mq_map_queue);
+
+/*
+ * Tear down a request map: call the driver's ->exit_request() (if any) on
+ * every non-NULL slot of tags->rqs, release the pages on tags->page_list,
+ * then free the rqs array and the tags themselves.
+ */
+static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
+		struct blk_mq_tags *tags, unsigned int hctx_idx)
+{
+	struct page *page;
+
+	if (tags->rqs && set->ops->exit_request) {
+		int i;
+
+		for (i = 0; i < tags->nr_tags; i++) {
+			if (!tags->rqs[i])
+				continue;
+			set->ops->exit_request(set->driver_data, tags->rqs[i],
+						hctx_idx, i);
+		}
+	}
+
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		/* the allocation order was stashed in page->private */
+		__free_pages(page, page->private);
+	}
+
+	kfree(tags->rqs);
+
+	blk_mq_free_tags(tags);
+}
+
+/* Size in bytes of a page allocation of the given order. */
+static size_t order_to_size(unsigned int order)
+{
+	return (size_t)PAGE_SIZE << order;
+}
+
+/*
+ * Allocate the tags and the backing requests for hardware queue @hctx_idx.
+ *
+ * Requests (struct request plus set->cmd_size of driver payload, rounded up
+ * to the cacheline size) are carved out of page allocations of at most
+ * order 4; smaller orders are tried when an allocation fails. The driver's
+ * ->init_request() is called on each request if it is provided.
+ *
+ * Returns the new tags, or NULL on failure with everything released.
+ */
+static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+		unsigned int hctx_idx)
+{
+	struct blk_mq_tags *tags;
+	unsigned int i, j, entries_per_page, max_order = 4;
+	size_t rq_size, left;
+
+	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+				set->numa_node);
+	if (!tags)
+		return NULL;
+
+	INIT_LIST_HEAD(&tags->page_list);
+
+	/*
+	 * Must be zeroed: on failure blk_mq_free_rq_map() walks the whole
+	 * array and relies on NULL to skip slots that were never set up.
+	 */
+	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+				 GFP_KERNEL, set->numa_node);
+	if (!tags->rqs) {
+		blk_mq_free_tags(tags);
+		return NULL;
+	}
+
+	/*
+	 * rq_size is the size of the request plus driver payload, rounded
+	 * to the cacheline size
+	 */
+	rq_size = round_up(sizeof(struct request) + set->cmd_size,
+				cache_line_size());
+	left = rq_size * set->queue_depth;
+
+	for (i = 0; i < set->queue_depth; ) {
+		int this_order = max_order;
+		struct page *page;
+		int to_do;
+		void *p;
+
+		/*
+		 * Shrink the order while the next smaller one still covers
+		 * what is left. Test this_order first so that we never
+		 * evaluate order_to_size(-1).
+		 */
+		while (this_order && left < order_to_size(this_order - 1))
+			this_order--;
+
+		do {
+			page = alloc_pages_node(set->numa_node, GFP_KERNEL,
+						this_order);
+			if (page)
+				break;
+			if (!this_order--)
+				break;
+			/* a smaller order could not hold even one request */
+			if (order_to_size(this_order) < rq_size)
+				break;
+		} while (1);
+
+		if (!page)
+			goto fail;
+
+		/* remembered for __free_pages() in blk_mq_free_rq_map() */
+		page->private = this_order;
+		list_add_tail(&page->lru, &tags->page_list);
+
+		p = page_address(page);
+		entries_per_page = order_to_size(this_order) / rq_size;
+		to_do = min(entries_per_page, set->queue_depth - i);
+		left -= to_do * rq_size;
+		for (j = 0; j < to_do; j++) {
+			tags->rqs[i] = p;
+			if (set->ops->init_request) {
+				if (set->ops->init_request(set->driver_data,
+						tags->rqs[i], hctx_idx, i,
+						set->numa_node)) {
+					/*
+					 * Do not hand a request whose init
+					 * failed to ->exit_request().
+					 */
+					tags->rqs[i] = NULL;
+					goto fail;
+				}
+			}
+
+			p += rq_size;
+			i++;
+		}
+	}
+
+	return tags;
+
+fail:
+	pr_warn("%s: failed to allocate requests\n", __func__);
+	blk_mq_free_rq_map(set, tags, hctx_idx);
+	return NULL;
+}
+
+/* Free the word array allocated by blk_mq_alloc_bitmap(). */
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+	kfree(bitmap->map);
+}
+
+/*
+ * Allocate a zeroed ctx bitmap on @node with 8 bits per word, sized to
+ * cover nr_cpu_ids bits. Returns 0 on success or -ENOMEM.
+ */
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+	unsigned int bpw = 8, total, num_maps, i;
+
+	bitmap->bits_per_word = bpw;
+
+	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
+	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
+					GFP_KERNEL, node);
+	if (!bitmap->map)
+		return -ENOMEM;
+
+	bitmap->map_size = num_maps;
+
+	total = nr_cpu_ids;
+	for (i = 0; i < num_maps;
i++) {
+		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
+		total -= bitmap->map[i].depth;
+	}
+
+	return 0;
+}
+
+/*
+ * @cpu went away: splice any requests still sitting on its software queue
+ * onto the software queue of the CPU we are running on, mark that queue
+ * pending on the hardware queue it maps to and run that hardware queue
+ * asynchronously.
+ */
+static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_ctx *ctx;
+	LIST_HEAD(tmp);
+
+	/*
+	 * Move ctx entries to new CPU, if this one is going away.
+	 */
+	ctx = __blk_mq_get_ctx(q, cpu);
+
+	spin_lock(&ctx->lock);
+	if (!list_empty(&ctx->rq_list)) {
+		list_splice_init(&ctx->rq_list, &tmp);
+		blk_mq_hctx_clear_pending(hctx, ctx);
+	}
+	spin_unlock(&ctx->lock);
+
+	if (list_empty(&tmp))
+		return NOTIFY_OK;
+
+	ctx = blk_mq_get_ctx(q);
+	spin_lock(&ctx->lock);
+
+	while (!list_empty(&tmp)) {
+		struct request *rq;
+
+		rq = list_first_entry(&tmp, struct request, queuelist);
+		rq->mq_ctx = ctx;
+		list_move_tail(&rq->queuelist, &ctx->rq_list);
+	}
+
+	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	blk_mq_hctx_mark_pending(hctx, ctx);
+
+	spin_unlock(&ctx->lock);
+
+	blk_mq_run_hw_queue(hctx, true);
+	blk_mq_put_ctx(ctx);
+	return NOTIFY_OK;
+}
+
+/*
+ * A CPU came online: make sure this hardware queue has a request map,
+ * allocating one if the tag set holds none for it. Returns NOTIFY_STOP if
+ * that allocation fails.
+ */
+static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	if (set->tags[hctx->queue_num])
+		return NOTIFY_OK;
+
+	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
+	if (!set->tags[hctx->queue_num])
+		return NOTIFY_STOP;
+
+	hctx->tags = set->tags[hctx->queue_num];
+	return NOTIFY_OK;
+}
+
+/* Per-hctx CPU notifier: dispatch dead/online events, ignore the rest. */
+static int blk_mq_hctx_notify(void *data, unsigned long action,
+			      unsigned int cpu)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+		return blk_mq_hctx_cpu_offline(hctx, cpu);
+	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+		return blk_mq_hctx_cpu_online(hctx, cpu);
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Undo blk_mq_init_hw_queues() for the first @nr_queue hardware contexts
+ * of @q (indices 0 .. nr_queue - 1).
+ */
+static void blk_mq_exit_hw_queues(struct request_queue *q,
+		struct blk_mq_tag_set *set, int nr_queue)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (i == nr_queue)
+			break;
+
+		blk_mq_tag_idle(hctx);
+
+		if (set->ops->exit_hctx)
+			set->ops->exit_hctx(hctx, i);
+
+		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+		kfree(hctx->ctxs);
+		blk_mq_free_bitmap(&hctx->ctx_map);
+	}
+
+}
+
+/* Free the cpumask and the structure of every hardware context of @q. */
+static void blk_mq_free_hw_queues(struct request_queue *q,
+		struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		free_cpumask_var(hctx->cpumask);
+		kfree(hctx);
+	}
+}
+
+/*
+ * Initialise every hardware context of @q from @set: work items, lock,
+ * dispatch list, CPU notifier, ctx array and ctx bitmap, then call the
+ * driver's ->init_hctx() if it has one.
+ *
+ * Returns 0 on success and 1 on failure. On failure the hardware context
+ * that failed is unwound here (it is not covered by the
+ * blk_mq_exit_hw_queues() call, which only handles the contexts set up
+ * before it), so its notifier does not stay registered and its ctx array
+ * and bitmap are not leaked.
+ */
+static int blk_mq_init_hw_queues(struct request_queue *q,
+		struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	/*
+	 * Initialize hardware queues
+	 */
+	queue_for_each_hw_ctx(q, hctx, i) {
+		int node;
+
+		node = hctx->numa_node;
+		if (node == NUMA_NO_NODE)
+			node = hctx->numa_node = set->numa_node;
+
+		INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+		INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+		spin_lock_init(&hctx->lock);
+		INIT_LIST_HEAD(&hctx->dispatch);
+		hctx->queue = q;
+		hctx->queue_num = i;
+		hctx->flags = set->flags;
+		hctx->cmd_size = set->cmd_size;
+
+		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
+						blk_mq_hctx_notify, hctx);
+		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+
+		hctx->tags = set->tags[i];
+
+		/*
+		 * Allocate space for all possible cpus to avoid allocation in
+		 * runtime
+		 */
+		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
+						GFP_KERNEL, node);
+		if (!hctx->ctxs)
+			goto unregister_cpu_notifier;
+
+		if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+			goto free_ctxs;
+
+		hctx->nr_ctx = 0;
+
+		if (set->ops->init_hctx &&
+		    set->ops->init_hctx(hctx, set->driver_data, i))
+			goto free_bitmap;
+	}
+
+	return 0;
+
+	/*
+	 * Init failed: @hctx / @i identify the context that failed. Release
+	 * what it had acquired, then tear down contexts [0, i).
+	 */
+free_bitmap:
+	blk_mq_free_bitmap(&hctx->ctx_map);
+free_ctxs:
+	kfree(hctx->ctxs);
+unregister_cpu_notifier:
+	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+
+	blk_mq_exit_hw_queues(q, set, i);
+
+	return 1;
+}
+
+/*
+ * Reset the per-CPU software queue of every possible CPU and, for the CPUs
+ * that are online, account them on the hardware context they map to.
+ */
+static void blk_mq_init_cpu_queues(struct request_queue *q,
+				   unsigned int nr_hw_queues)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i) {
+		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
+		struct blk_mq_hw_ctx *hctx;
+
+		memset(__ctx, 0, sizeof(*__ctx));
+		__ctx->cpu = i;
+		spin_lock_init(&__ctx->lock);
+		INIT_LIST_HEAD(&__ctx->rq_list);
+		__ctx->queue = q;
+
+		/* If the cpu isn't online, the cpu is mapped to first hctx */
+		if (!cpu_online(i))
+			continue;
+
+		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
+		hctx->nr_ctx++;
+
+		/*
+		 * Set local node, IFF we have more than one hw queue. If
+		 * not, we remain on the home node of the device
+		 */
+		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
+			hctx->numa_node = cpu_to_node(i);
+	}
+}
+
+/*
+ * Rebuild the software -> hardware queue mapping from scratch for the CPUs
+ * that are currently online, and release the request map of any hardware
+ * queue that ends up with no software queue.
+ */
+static void blk_mq_map_swqueue(struct request_queue *q)
+{
+	unsigned int i;
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		cpumask_clear(hctx->cpumask);
+		hctx->nr_ctx = 0;
+	}
+
+	/*
+	 * Map software to hardware queues
+	 */
+	queue_for_each_ctx(q, ctx, i) {
+		/* If the cpu isn't online, the cpu is mapped to first hctx */
+		if (!cpu_online(i))
+			continue;
+
+		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
+		ctx->index_hw = hctx->nr_ctx;
+		hctx->ctxs[hctx->nr_ctx++] = ctx;
+	}
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		/*
+		 * If no software queues are mapped to this hardware queue,
+		 * disable it and free the request entries
+		 */
+		if (!hctx->nr_ctx) {
+			struct blk_mq_tag_set *set = q->tag_set;
+
+			if (set->tags[i]) {
+				blk_mq_free_rq_map(set, set->tags[i], i);
+				set->tags[i] = NULL;
+				hctx->tags = NULL;
+			}
+			continue;
+		}
+
+		/*
+		 * Initialize batch roundrobin counts
+		 */
+		hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
+}
+
+/*
+ * Set or clear BLK_MQ_F_TAG_SHARED on every hardware context of every queue
+ * on set->tag_list, depending on whether the list holds more than one
+ * queue. Each queue is frozen while its flags are changed.
+ */
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q;
+	bool shared;
+	int i;
+
+	/* next == prev: the list is empty or has a single entry */
+	if (set->tag_list.next == set->tag_list.prev)
+		shared = false;
+	else
+		shared = true;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_freeze_queue(q);
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (shared)
+				hctx->flags |= BLK_MQ_F_TAG_SHARED;
+			else
+				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
+		blk_mq_unfreeze_queue(q);
+	}
+}
+
+/*
+ * Unlink @q from its tag set's queue list and refresh the shared-tag flag
+ * of the queues that remain. @q stays frozen for the duration.
+ */
+static void blk_mq_del_queue_tag_set(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	blk_mq_freeze_queue(q);
+
+	mutex_lock(&set->tag_list_lock);
+	list_del_init(&q->tag_set_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+
+	blk_mq_unfreeze_queue(q);
+}
+
+/*
+ * Attach @q to @set: record the set in the queue, link the queue on the
+ * set's list and refresh the shared-tag flag of every queue in the set.
+ */
+static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
+				     struct request_queue *q)
+{
+	q->tag_set = set;
+
+	mutex_lock(&set->tag_list_lock);
+	list_add_tail(&q->tag_set_list, &set->tag_list);
+	blk_mq_update_tag_set_depth(set);
+	mutex_unlock(&set->tag_list_lock);
+}
+
+/*
+ * Allocate and set up a multiqueue request queue that uses @set.
+ *
+ * Returns the new queue, or ERR_PTR(-ENOMEM) if any step fails.
+ */
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+{
+	struct blk_mq_hw_ctx **hctxs;
+	struct blk_mq_ctx __percpu *ctx;
+	struct request_queue *q;
+	unsigned int *map;
+	int i;
+
+	ctx = alloc_percpu(struct blk_mq_ctx);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Must be zeroed: the err_hctxs unwind loop stops at the first NULL
+	 * entry to find out how many contexts were allocated.
+	 */
+	hctxs = kzalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
+			set->numa_node);
+
+	if (!hctxs)
+		goto err_percpu;
+
+	map = blk_mq_make_queue_map(set);
+	if (!map)
+		goto err_map;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		int node = blk_mq_hw_queue_to_node(map, i);
+
+		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+					GFP_KERNEL, node);
+		if (!hctxs[i])
+			goto err_hctxs;
+
+		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+			goto err_hctxs;
+
+		atomic_set(&hctxs[i]->nr_active, 0);
+		hctxs[i]->numa_node = node;
+		hctxs[i]->queue_num = i;
+	}
+
+	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+	if (!q)
+		goto err_hctxs;
+
+	/*
+	 * From here on @q, @map and the hctxs[] entries all exist, so a
+	 * failure has to go through err_hw; jumping to err_map would leak
+	 * all three.
+	 */
+	if (percpu_counter_init(&q->mq_usage_counter, 0))
+		goto err_hw;
+
+	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+	blk_queue_rq_timeout(q, 30000);
+
+	q->nr_queues = nr_cpu_ids;
+	q->nr_hw_queues = set->nr_hw_queues;
+	q->mq_map = map;
+
+	q->queue_ctx = ctx;
+	q->queue_hw_ctx = hctxs;
+
+	q->mq_ops = set->ops;
+	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+
+	if (!(set->flags & BLK_MQ_F_SG_MERGE))
+		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
+
+	q->sg_reserved_size = INT_MAX;
+
+	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
+	INIT_LIST_HEAD(&q->requeue_list);
+	spin_lock_init(&q->requeue_lock);
+
+	if (q->nr_hw_queues > 1)
+		blk_queue_make_request(q, blk_mq_make_request);
+	else
+		blk_queue_make_request(q, blk_sq_make_request);
+
+	blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
+	if (set->timeout)
+		blk_queue_rq_timeout(q, set->timeout);
+
+	/*
+	 * Do this after blk_queue_make_request() overrides it...
+	 */
+	q->nr_requests = set->queue_depth;
+
+	if (set->ops->complete)
+		blk_queue_softirq_done(q, set->ops->complete);
+
+	blk_mq_init_flush(q);
+	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
+
+	q->flush_rq = kzalloc(round_up(sizeof(struct request) +
+				set->cmd_size, cache_line_size()),
+				GFP_KERNEL);
+	if (!q->flush_rq)
+		goto err_hw;
+
+	if (blk_mq_init_hw_queues(q, set))
+		goto err_flush_rq;
+
+	mutex_lock(&all_q_mutex);
+	list_add_tail(&q->all_q_node, &all_q_list);
+	mutex_unlock(&all_q_mutex);
+
+	blk_mq_add_queue_tag_set(set, q);
+
+	blk_mq_map_swqueue(q);
+
+	return q;
+
+	/*
+	 * NOTE(review): when err_flush_rq/err_hw are reached after q->mq_ops,
+	 * q->mq_map, q->queue_ctx and q->queue_hw_ctx have been set, confirm
+	 * that blk_cleanup_queue() does not already release them; if it does,
+	 * the labels below free them a second time.
+	 */
+err_flush_rq:
+	kfree(q->flush_rq);
+err_hw:
+	blk_cleanup_queue(q);
+err_hctxs:
+	kfree(map);
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		if (!hctxs[i])
+			break;
+		free_cpumask_var(hctxs[i]->cpumask);
+		kfree(hctxs[i]);
+	}
+err_map:
+	kfree(hctxs);
+err_percpu:
+	free_percpu(ctx);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(blk_mq_init_queue);
+
+/*
+ * Release everything blk_mq_init_queue() attached to @q: detach it from its
+ * tag set, tear down and free the hardware contexts, destroy the usage
+ * counter, free the per-CPU contexts and maps, and unlink the queue from
+ * all_q_list.
+ */
+void blk_mq_free_queue(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	blk_mq_del_queue_tag_set(q);
+
+	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
+	blk_mq_free_hw_queues(q, set);
+
+	percpu_counter_destroy(&q->mq_usage_counter);
+
+	free_percpu(q->queue_ctx);
+	kfree(q->queue_hw_ctx);
+	kfree(q->mq_map);
+
+	q->queue_ctx = NULL;
+	q->queue_hw_ctx = NULL;
+	q->mq_map = NULL;
+
+	mutex_lock(&all_q_mutex);
+	list_del_init(&q->all_q_node);
+	mutex_unlock(&all_q_mutex);
+}
+
+/* Basically redo blk_mq_init_queue with queue frozen */
+static void blk_mq_queue_reinit(struct request_queue *q)
+{
+	blk_mq_freeze_queue(q);
+
+	blk_mq_sysfs_unregister(q);
+
+	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
+
+	/*
+	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
+	 * we should change hctx numa_node according to new topology (this
+	 * involves free and re-allocate memory, worthy doing?)
+	 */
+
+	blk_mq_map_swqueue(q);
+
+	blk_mq_sysfs_register(q);
+
+	blk_mq_unfreeze_queue(q);
+}
+
+/*
+ * CPU hotplug notifier: on CPU dead/online events, re-initialise the
+ * mapping of every queue on all_q_list while holding all_q_mutex.
+ */
+static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
+				      unsigned long action, void *hcpu)
+{
+	struct request_queue *q;
+
+	/*
+	 * Before new mappings are established, hotadded cpu might already
+	 * start handling requests. This doesn't break anything as we map
+	 * offline CPUs to first hardware queue. We will re-init the queue
+	 * below to get optimal settings.
+	 */
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
+	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+		return NOTIFY_OK;
+
+	mutex_lock(&all_q_mutex);
+	list_for_each_entry(q, &all_q_list, all_q_node)
+		blk_mq_queue_reinit(q);
+	mutex_unlock(&all_q_mutex);
+	return NOTIFY_OK;
+}
+
+/*
+ * Alloc a tag set to be associated with one or more request queues.
+ * May fail with -EINVAL for invalid parameters or -ENOMEM if an
+ * allocation fails. May adjust the requested depth down, if it is
+ * too large. In that case, the set value will be stored in
+ * set->queue_depth.
+ */
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	if (!set->nr_hw_queues)
+		return -EINVAL;
+	if (!set->queue_depth)
+		return -EINVAL;
+	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
+		return -EINVAL;
+
+	/* ->queue_rq() and ->map_queue() are mandatory driver callbacks */
+	if (!set->ops->queue_rq || !set->ops->map_queue)
+		return -EINVAL;
+
+	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
+		pr_info("blk-mq: reduced tag depth to %u\n",
+			BLK_MQ_MAX_DEPTH);
+		set->queue_depth = BLK_MQ_MAX_DEPTH;
+	}
+
+	set->tags = kmalloc_node(set->nr_hw_queues *
+				 sizeof(struct blk_mq_tags *),
+				 GFP_KERNEL, set->numa_node);
+	if (!set->tags)
+		goto out;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		set->tags[i] = blk_mq_init_rq_map(set, i);
+		if (!set->tags[i])
+			goto out_unwind;
+	}
+
+	mutex_init(&set->tag_list_lock);
+	INIT_LIST_HEAD(&set->tag_list);
+
+	return 0;
+
+out_unwind:
+	/* free the request maps built so far, then the array itself */
+	while (--i >= 0)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+	kfree(set->tags);
+	set->tags = NULL;
+out:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+
+/*
+ * Free every request map still held by @set and the tags array itself.
+ */
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		if (set->tags[i])
+			blk_mq_free_rq_map(set, set->tags[i], i);
+	}
+
+	kfree(set->tags);
+	set->tags = NULL;
+}
+EXPORT_SYMBOL(blk_mq_free_tag_set);
+
+/*
+ * Change the tag depth of every hardware queue of @q to @nr.
+ *
+ * Returns -EINVAL if @q has no tag set or @nr exceeds the set's queue
+ * depth, otherwise the first error from blk_mq_tag_update_depth(), or 0.
+ * q->nr_requests is only updated when every hardware queue succeeded.
+ */
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int i, ret;
+
+	if (!set || nr > set->queue_depth)
+		return -EINVAL;
+
+	ret = 0;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		/*
+		 * Hardware queues without any mapped software queue have
+		 * had their tags released and hctx->tags set to NULL.
+		 */
+		if (!hctx->tags)
+			continue;
+
+		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		if (ret)
+			break;
+	}
+
+	if (!ret)
+		q->nr_requests = nr;
+
+	return ret;
+}
+
+/*
+ * Take all_q_mutex, the lock the queue re-init hotplug notifier runs
+ * under. Must be paired with blk_mq_enable_hotplug().
+ */
+void blk_mq_disable_hotplug(void)
+{
+	mutex_lock(&all_q_mutex);
+}
+
+/* Release the mutex taken by blk_mq_disable_hotplug(). */
+void blk_mq_enable_hotplug(void)
+{
+	mutex_unlock(&all_q_mutex);
+}
+
+static int __init blk_mq_init(void)
+{
+	blk_mq_cpu_init();
+
+	/* Must be called after percpu_counter_hotcpu_callback() */
+	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
+
+	return 0;
+}
+subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h new file mode 100644 index 00000000000..26460884c6c --- /dev/null +++ b/block/blk-mq.h @@ -0,0 +1,117 @@ +#ifndef INT_BLK_MQ_H +#define INT_BLK_MQ_H + +struct blk_mq_tag_set; + +struct blk_mq_ctx { + struct { + spinlock_t lock; + struct list_head rq_list; + } ____cacheline_aligned_in_smp; + + unsigned int cpu; + unsigned int index_hw; + + unsigned int last_tag ____cacheline_aligned_in_smp; + + /* incremented at dispatch time */ + unsigned long rq_dispatched[2]; + unsigned long rq_merged; + + /* incremented at completion time */ + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + + struct request_queue *queue; + struct kobject kobj; +} ____cacheline_aligned_in_smp; + +void __blk_mq_complete_request(struct request *rq); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_init_flush(struct request_queue *q); +void blk_mq_drain_queue(struct request_queue *q); +void blk_mq_free_queue(struct request_queue *q); +void blk_mq_clone_flush_request(struct request *flush_rq, + struct request *orig_rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); + +/* + * CPU hotplug helpers + */ +struct blk_mq_cpu_notifier; +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + int (*fn)(void *, unsigned long, unsigned int), + void *data); +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); +void blk_mq_cpu_init(void); +void blk_mq_enable_hotplug(void); +void blk_mq_disable_hotplug(void); + +/* + * CPU -> queue mappings + */ +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); + +/* + * sysfs helpers + */ +extern int blk_mq_sysfs_register(struct request_queue *q); +extern 
void blk_mq_sysfs_unregister(struct request_queue *q); + +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. + */ +struct blk_align_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; + +static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, + unsigned int cpu) +{ + return per_cpu_ptr(q->queue_ctx, cpu); +} + +/* + * This assumes per-cpu software queueing queues. They could be per-node + * as well, for instance. For now this is hardcoded as-is. Note that we don't + * care about preemption, since we know the ctx's are persistent. This does + * mean that we can't rely on ctx always matching the currently running CPU. + */ +static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) +{ + return __blk_mq_get_ctx(q, get_cpu()); +} + +static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) +{ + put_cpu(); +} + +struct blk_mq_alloc_data { + /* input parameter */ + struct request_queue *q; + gfp_t gfp; + bool reserved; + + /* input & output parameter */ + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; +}; + +static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, + struct request_queue *q, gfp_t gfp, bool reserved, + struct blk_mq_ctx *ctx, + struct blk_mq_hw_ctx *hctx) +{ + data->q = q; + data->gfp = gfp; + data->reserved = reserved; + data->ctx = ctx; + data->hctx = hctx; +} + +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 36c8c1f2af1..f1a1795a568 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -104,9 +104,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); * @lim: the queue_limits structure to reset * * Description: - * Returns a queue_limit struct to its default state. Can be used by - * stacking drivers like DM that stage table swaps and reuse an - * existing device queue. + * Returns a queue_limit struct to its default state. 
*/ void blk_set_default_limits(struct queue_limits *lim) { @@ -114,13 +112,14 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = BLK_DEF_MAX_SECTORS; - lim->max_hw_sectors = INT_MAX; + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->chunk_sectors = 0; + lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; - lim->discard_zeroes_data = -1; + lim->discard_zeroes_data = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -131,6 +130,28 @@ void blk_set_default_limits(struct queue_limits *lim) EXPORT_SYMBOL(blk_set_default_limits); /** + * blk_set_stacking_limits - set default limits for stacking devices + * @lim: the queue_limits structure to reset + * + * Description: + * Returns a queue_limit struct to its default state. Should be used + * by stacking drivers like DM that have no internal limits. 
+ */ +void blk_set_stacking_limits(struct queue_limits *lim) +{ + blk_set_default_limits(lim); + + /* Inherit limits from component devices */ + lim->discard_zeroes_data = 1; + lim->max_segments = USHRT_MAX; + lim->max_hw_sectors = UINT_MAX; + lim->max_segment_size = UINT_MAX; + lim->max_sectors = UINT_MAX; + lim->max_write_same_sectors = UINT_MAX; +} +EXPORT_SYMBOL(blk_set_stacking_limits); + +/** * blk_queue_make_request - define an alternate make_request function for a device * @q: the request queue for the device to be affected * @mfn: the alternate make_request function @@ -164,23 +185,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; - q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ - if (q->unplug_delay == 0) - q->unplug_delay = 1; - - q->unplug_timer.function = blk_unplug_timeout; - q->unplug_timer.data = (unsigned long)q; - blk_set_default_limits(&q->limits); - blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); - - /* - * If the caller didn't supply a lock, fall back to our embedded - * per-queue locks - */ - if (!q->queue_lock) - q->queue_lock = &q->__queue_lock; /* * by default assume old behaviour and bounce for any highmem page @@ -192,17 +197,17 @@ EXPORT_SYMBOL(blk_queue_make_request); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device - * @dma_mask: the maximum address the device can handle + * @max_addr: the maximum address the device can handle * * Description: * Different hardware can have different requirements as to what pages * it can do I/O directly to. A low level driver can call * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @dma_mask. + * buffers for doing I/O to pages residing above @max_addr. 
**/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) +void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) { - unsigned long b_pfn = dma_mask >> PAGE_SHIFT; + unsigned long b_pfn = max_addr >> PAGE_SHIFT; int dma = 0; q->bounce_gfp = GFP_NOIO; @@ -273,6 +278,26 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto EXPORT_SYMBOL(blk_queue_max_hw_sectors); /** + * blk_queue_chunk_sectors - set size of the chunk for this queue + * @q: the request queue for the device + * @chunk_sectors: chunk sectors in the usual 512b unit + * + * Description: + * If a driver doesn't want IOs to cross a given chunk size, it can set + * this limit and prevent merging across chunks. Note that the chunk size + * must currently be a power-of-2 in sectors. Also note that the block + * layer must accept a page worth of data at any offset. So if the + * crossing of chunks is a hard limitation in the driver, it must still be + * prepared to split single page bios. 
+ **/ +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +{ + BUG_ON(!is_power_of_2(chunk_sectors)); + q->limits.chunk_sectors = chunk_sectors; +} +EXPORT_SYMBOL(blk_queue_chunk_sectors); + +/** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device * @max_discard_sectors: maximum number of sectors to discard @@ -285,6 +310,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** + * blk_queue_max_write_same_sectors - set max sectors for a single write same + * @q: the request queue for the device + * @max_write_same_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_same_sectors(struct request_queue *q, + unsigned int max_write_same_sectors) +{ + q->limits.max_write_same_sectors = max_write_same_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_same_sectors); + +/** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments @@ -509,6 +546,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_write_same_sectors = min(t->max_write_same_sectors, + b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, @@ -574,6 +613,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + t->raid_partial_stripes_expensive = + max(t->raid_partial_stripes_expensive, + b->raid_partial_stripes_expensive); + /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) & (max(t->physical_block_size, t->io_min) - 1); @@ -594,7 +637,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits 
*b, bottom = b->discard_granularity + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) + if ((max(top, bottom) % min(top, bottom)) != 0) t->discard_misaligned = 1; } @@ -602,8 +645,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); - t->discard_alignment = lcm(t->discard_alignment, alignment) & - (t->discard_granularity - 1); + t->discard_alignment = lcm(t->discard_alignment, alignment) % + t->discard_granularity; } return ret; @@ -805,6 +848,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush) } EXPORT_SYMBOL_GPL(blk_queue_flush); +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + q->flush_not_queueable = !queueable; +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ee9c2160222..53b1737e978 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/cpu.h> +#include <linux/sched.h> #include "blk.h" @@ -22,20 +23,20 @@ static void blk_done_softirq(struct softirq_action *h) struct list_head *cpu_list, local_list; local_irq_disable(); - cpu_list = &__get_cpu_var(blk_cpu_done); + cpu_list = this_cpu_ptr(&blk_cpu_done); list_replace_init(cpu_list, &local_list); local_irq_enable(); while (!list_empty(&local_list)) { struct request *rq; - rq = list_entry(local_list.next, struct request, csd.list); - list_del_init(&rq->csd.list); + rq = list_entry(local_list.next, struct request, ipi_list); + list_del_init(&rq->ipi_list); rq->q->softirq_done_fn(rq); } } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP static void trigger_softirq(void *data) { struct request *rq = data; @@ -43,10 
+44,10 @@ static void trigger_softirq(void *data) struct list_head *list; local_irq_save(flags); - list = &__get_cpu_var(blk_cpu_done); - list_add_tail(&rq->csd.list, list); + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); - if (list->next == &rq->csd.list) + if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_restore(flags); @@ -64,21 +65,21 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data, 0); + smp_call_function_single_async(cpu, data); return 0; } return 1; } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */ static int raise_blk_irq(int cpu, struct request *rq) { return 1; } #endif -static int __cpuinit blk_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int blk_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { /* * If a CPU goes away, splice its entries to the current CPU @@ -89,7 +90,7 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_done, cpu), - &__get_cpu_var(blk_cpu_done)); + this_cpu_ptr(&blk_cpu_done)); raise_softirq_irqoff(BLOCK_SOFTIRQ); local_irq_enable(); } @@ -97,35 +98,45 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata blk_cpu_notifier = { +static struct notifier_block blk_cpu_notifier = { .notifier_call = blk_cpu_notify, }; void __blk_complete_request(struct request *req) { + int ccpu, cpu; struct request_queue *q = req->q; unsigned long flags; - int ccpu, cpu, group_cpu; + bool shared = false; BUG_ON(!q->softirq_done_fn); local_irq_save(flags); cpu = smp_processor_id(); - group_cpu = blk_cpu_to_group(cpu); /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) + if (req->cpu != -1) { ccpu = req->cpu; - else + if 
(!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) + shared = cpus_share_cache(cpu, ccpu); + } else ccpu = cpu; - if (ccpu == cpu || ccpu == group_cpu) { + /* + * If current CPU and requested CPU share a cache, run the softirq on + * the current CPU. One might concern this is just like + * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is + * running in interrupt handler, and currently I/O controller doesn't + * support multiple interrupts, so current CPU is unique actually. This + * avoids IPI sending from current CPU to the first CPU of a group. + */ + if (ccpu == cpu || shared) { struct list_head *list; do_local: - list = &__get_cpu_var(blk_cpu_done); - list_add_tail(&req->csd.list, list); + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&req->ipi_list, list); /* * if the list only contains our just added request, @@ -133,7 +144,7 @@ do_local: * entries there, someone already raised the irq but it * hasn't run yet. */ - if (list->next == &req->csd.list) + if (list->next == &req->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); } else if (raise_blk_irq(ccpu, req)) goto do_local; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 41fb69150b4..23321fbab29 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -7,8 +7,11 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> +#include <linux/blk-mq.h> #include "blk.h" +#include "blk-cgroup.h" +#include "blk-mq.h" struct queue_sysfs_entry { struct attribute attr; @@ -25,9 +28,15 @@ queue_var_show(unsigned long var, char *page) static ssize_t queue_var_store(unsigned long *var, const char *page, size_t count) { - char *p = (char *) page; + int err; + unsigned long v; + + err = kstrtoul(page, 10, &v); + if (err || v > UINT_MAX) + return -EINVAL; + + *var = v; - *var = simple_strtoul(p, &p, 10); return count; } @@ -39,45 +48,27 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, 
const char *page, size_t count) { - struct request_list *rl = &q->rq; unsigned long nr; - int ret; + int ret, err; - if (!q->request_fn) + if (!q->request_fn && !q->mq_ops) return -EINVAL; ret = queue_var_store(&nr, page, count); + if (ret < 0) + return ret; + if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); - - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_queue_full(q, BLK_RW_SYNC); - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { - blk_clear_queue_full(q, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } + if (q->request_fn) + err = blk_update_nr_requests(q, nr); + else + err = blk_mq_update_nr_requests(q, nr); + + if (err) + return err; - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_queue_full(q, BLK_RW_ASYNC); - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { - blk_clear_queue_full(q, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - spin_unlock_irq(q->queue_lock); return ret; } @@ -95,6 +86,9 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) unsigned long ra_kb; ssize_t ret = queue_var_store(&ra_kb, page, count); + if (ret < 0) + return ret; + q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); return ret; @@ -152,7 +146,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { - return 
queue_var_show(q->limits.max_discard_sectors << 9, page); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_discard_sectors << 9); } static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) @@ -160,6 +155,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag return queue_var_show(queue_discard_zeroes_data(q), page); } +static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_same_sectors << 9); +} + + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -168,6 +170,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) page_kb = 1 << (PAGE_CACHE_SHIFT - 10); ssize_t ret = queue_var_store(&max_sectors_kb, page, count); + if (ret < 0) + return ret; + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; @@ -199,6 +204,8 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \ unsigned long val; \ ssize_t ret; \ ret = queue_var_store(&val, page, count); \ + if (ret < 0) \ + return ret; \ if (neg) \ val = !val; \ \ @@ -228,6 +235,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, unsigned long nm; ssize_t ret = queue_var_store(&nm, page, count); + if (ret < 0) + return ret; + spin_lock_irq(q->queue_lock); queue_flag_clear(QUEUE_FLAG_NOMERGES, q); queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); @@ -243,23 +253,33 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); - return queue_var_show(set, page); + return queue_var_show(set << force, page); } static ssize_t queue_rq_affinity_store(struct request_queue *q, 
const char *page, size_t count) { ssize_t ret = -EINVAL; -#if defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP unsigned long val; ret = queue_var_store(&val, page, count); + if (ret < 0) + return ret; + spin_lock_irq(q->queue_lock); - if (val) + if (val == 2) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - else - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 1) { + queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 0) { + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } spin_unlock_irq(q->queue_lock); #endif return ret; @@ -349,6 +369,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { .show = queue_discard_zeroes_data_show, }; +static struct queue_sysfs_entry queue_write_same_max_entry = { + .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO }, + .show = queue_write_same_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, @@ -396,6 +421,7 @@ static struct attribute *default_attrs[] = { &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, @@ -417,7 +443,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (blk_queue_dying(q)) { mutex_unlock(&q->sysfs_lock); return -ENOENT; } @@ -439,7 +465,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (blk_queue_dying(q)) { mutex_unlock(&q->sysfs_lock); return 
-ENOENT; } @@ -448,12 +474,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, return res; } +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + struct request_queue *q = container_of(rcu_head, struct request_queue, + rcu_head); + kmem_cache_free(blk_requestq_cachep, q); +} + /** - * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed - * @kobj: the kobj belonging of the request queue to be released + * blk_release_queue: - release a &struct request_queue when it is no longer needed + * @kobj: the kobj belonging to the request queue to be released * * Description: - * blk_cleanup_queue is the pair to blk_init_queue() or + * blk_release_queue is the pair to blk_init_queue() or * blk_queue_make_request(). It should be called when a request queue is * being released; typically when a block device is being de-registered. * Currently, its primary task it to free all the &struct request @@ -467,22 +500,34 @@ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - struct request_list *rl = &q->rq; blk_sync_queue(q); - blk_throtl_exit(q); + blkcg_exit_queue(q); + + if (q->elevator) { + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); + elevator_exit(q->elevator); + } - if (rl->rq_pool) - mempool_destroy(rl->rq_pool); + blk_exit_rl(&q->root_rl); if (q->queue_tags) __blk_queue_free_tags(q); + if (q->mq_ops) + blk_mq_free_queue(q); + + kfree(q->flush_rq); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); - kmem_cache_free(blk_requestq_cachep, q); + + ida_simple_remove(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); } static const struct sysfs_ops queue_sysfs_ops = { @@ -500,22 +545,33 @@ int blk_register_queue(struct gendisk *disk) { int ret; struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; if (WARN_ON(!q)) return -ENXIO; + /* + * Initialization 
must be complete by now. Finish the initial + * bypass from queue allocation. + */ + blk_queue_bypass_end(q); + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + ret = blk_trace_init_sysfs(dev); if (ret) return ret; ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) + if (ret < 0) { + blk_trace_remove_sysfs(dev); return ret; + } kobject_uevent(&q->kobj, KOBJ_ADD); + if (q->mq_ops) + blk_mq_register_disk(disk); + if (!q->request_fn) return 0; @@ -523,7 +579,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) { kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); - blk_trace_remove_sysfs(disk_to_dev(disk)); + blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } @@ -538,6 +594,9 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + if (q->mq_ops) + blk_mq_unregister_disk(disk); + if (q->request_fn) elv_unregister_queue(q); diff --git a/block/blk-tag.c b/block/blk-tag.c index ece65fc4c79..a185b86741e 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -27,18 +27,15 @@ struct request *blk_queue_find_tag(struct request_queue *q, int tag) EXPORT_SYMBOL(blk_queue_find_tag); /** - * __blk_free_tags - release a given set of tag maintenance info + * blk_free_tags - release a given set of tag maintenance info * @bqt: the tag map to free * - * Tries to free the specified @bqt. Returns true if it was - * actually freed and false if there are still references using it + * Drop the reference count on @bqt and frees it when the last reference + * is dropped. 
*/ -static int __blk_free_tags(struct blk_queue_tag *bqt) +void blk_free_tags(struct blk_queue_tag *bqt) { - int retval; - - retval = atomic_dec_and_test(&bqt->refcnt); - if (retval) { + if (atomic_dec_and_test(&bqt->refcnt)) { BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) < bqt->max_depth); @@ -50,9 +47,8 @@ static int __blk_free_tags(struct blk_queue_tag *bqt) kfree(bqt); } - - return retval; } +EXPORT_SYMBOL(blk_free_tags); /** * __blk_queue_free_tags - release tag maintenance info @@ -69,28 +65,13 @@ void __blk_queue_free_tags(struct request_queue *q) if (!bqt) return; - __blk_free_tags(bqt); + blk_free_tags(bqt); q->queue_tags = NULL; queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); } /** - * blk_free_tags - release a given set of tag maintenance info - * @bqt: the tag map to free - * - * For externally managed @bqt frees the map. Callers of this - * function must guarantee to have released all the queues that - * might have been using this tag map. - */ -void blk_free_tags(struct blk_queue_tag *bqt) -{ - if (unlikely(!__blk_free_tags(bqt))) - BUG(); -} -EXPORT_SYMBOL(blk_free_tags); - -/** * blk_queue_free_tags - release tag maintenance info * @q: the request queue for the device * @@ -186,7 +167,8 @@ int blk_queue_init_tags(struct request_queue *q, int depth, tags = __blk_queue_init_tags(q, depth); if (!tags) - goto fail; + return -ENOMEM; + } else if (q->queue_tags) { rc = blk_queue_resize_tags(q, depth); if (rc) @@ -203,9 +185,6 @@ int blk_queue_init_tags(struct request_queue *q, int depth, queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); INIT_LIST_HEAD(&q->tag_busy_list); return 0; -fail: - kfree(tags); - return -ENOMEM; } EXPORT_SYMBOL(blk_queue_init_tags); @@ -282,16 +261,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags); void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - int tag = rq->tag; + unsigned tag = rq->tag; /* negative tags invalid */ - BUG_ON(tag == -1); - - if (unlikely(tag >= 
bqt->real_max_depth)) - /* - * This can happen after tag depth has been reduced. - * FIXME: how about a warning or info message here? - */ - return; + BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; @@ -357,9 +329,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) */ max_depth = bqt->max_depth; if (!rq_is_sync(rq) && max_depth > 1) { - max_depth -= 2; - if (!max_depth) + switch (max_depth) { + case 2: max_depth = 1; + break; + case 3: + max_depth = 2; + break; + default: + max_depth -= 2; + } if (q->in_flight[BLK_RW_ASYNC] > max_depth) return 1; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e36cc10a346..3fdb21a390c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,6 +10,7 @@ #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk-cgroup.h" +#include "blk.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -20,30 +21,100 @@ static int throtl_quantum = 32; /* Throttling is performed over 100ms slice and after that slice is renewed */ static unsigned long throtl_slice = HZ/10; /* 100 ms */ +static struct blkcg_policy blkcg_policy_throtl; + /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -static void throtl_schedule_delayed_work(struct throtl_data *td, - unsigned long delay); - -struct throtl_rb_root { - struct rb_root rb; - struct rb_node *left; - unsigned int count; - unsigned long min_disptime; + +/* + * To implement hierarchical throttling, throtl_grps form a tree and bios + * are dispatched upwards level by level until they reach the top and get + * issued. When dispatching bios from the children and local group at each + * level, if the bios are dispatched into a single bio_list, there's a risk + * of a local or child group which can queue many bios at once filling up + * the list starving others. 
+ * + * To avoid such starvation, dispatched bios are queued separately + * according to where they came from. When they are again dispatched to + * the parent, they're popped in round-robin order so that no single source + * hogs the dispatch window. + * + * throtl_qnode is used to keep the queued bios separated by their sources. + * Bios are queued to throtl_qnode which in turn is queued to + * throtl_service_queue and then dispatched in round-robin order. + * + * It's also used to track the reference counts on blkg's. A qnode always + * belongs to a throtl_grp and gets queued on itself or the parent, so + * incrementing the reference of the associated throtl_grp when a qnode is + * queued and decrementing when dequeued is enough to keep the whole blkg + * tree pinned while bios are in flight. + */ +struct throtl_qnode { + struct list_head node; /* service_queue->queued[] */ + struct bio_list bios; /* queued bios */ + struct throtl_grp *tg; /* tg this qnode belongs to */ +}; + +struct throtl_service_queue { + struct throtl_service_queue *parent_sq; /* the parent service_queue */ + + /* + * Bios queued directly to this service_queue or dispatched from + * children throtl_grp's. + */ + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ + unsigned int nr_queued[2]; /* number of queued bios */ + + /* + * RB tree of active children throtl_grp's, which are sorted by + * their ->disptime. 
+ */ + struct rb_root pending_tree; /* RB tree of active tgs */ + struct rb_node *first_pending; /* first node in the tree */ + unsigned int nr_pending; /* # queued in the tree */ + unsigned long first_pending_disptime; /* disptime of the first tg */ + struct timer_list pending_timer; /* fires on first_pending_disptime */ }; -#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ - .count = 0, .min_disptime = 0} +enum tg_state_flags { + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ +}; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) +/* Per-cpu group stats */ +struct tg_stats_cpu { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; +}; + struct throtl_grp { - /* List of throtl groups on the request queue*/ - struct hlist_node tg_node; + /* must be the first member */ + struct blkg_policy_data pd; - /* active throtl group service_tree member */ + /* active throtl group service_queue member */ struct rb_node rb_node; + /* throtl_data this group belongs to */ + struct throtl_data *td; + + /* this group's service queue */ + struct throtl_service_queue service_queue; + + /* + * qnode_on_self is used when bios are directly queued to this + * throtl_grp so that local bios compete fairly with bios + * dispatched from children. qnode_on_parent is used when bios are + * dispatched from this throtl_grp into its parent and will compete + * with the sibling qnode_on_parents and the parent's + * qnode_on_self. + */ + struct throtl_qnode qnode_on_self[2]; + struct throtl_qnode qnode_on_parent[2]; + /* * Dispatch time in jiffies. This is the estimated time when group * will unthrottle and is ready to dispatch more bio. 
It is used as @@ -51,15 +122,10 @@ struct throtl_grp { */ unsigned long disptime; - struct blkio_group blkg; - atomic_t ref; unsigned int flags; - /* Two lists for READ and WRITE */ - struct bio_list bio_lists[2]; - - /* Number of queued bios on READ and WRITE lists */ - unsigned int nr_queued[2]; + /* are there any throtl rules between this group and td? */ + bool has_rules[2]; /* bytes per second rate limits */ uint64_t bps[2]; @@ -76,19 +142,18 @@ struct throtl_grp { unsigned long slice_start[2]; unsigned long slice_end[2]; - /* Some throttle limits got updated for the group */ - bool limits_changed; + /* Per cpu stats pointer */ + struct tg_stats_cpu __percpu *stats_cpu; + + /* List of tgs waiting for per cpu stats memory to be allocated */ + struct list_head stats_alloc_node; }; struct throtl_data { - /* List of throtl groups */ - struct hlist_head tg_list; - /* service tree for active throtl groups */ - struct throtl_rb_root tg_service_tree; + struct throtl_service_queue service_queue; - struct throtl_grp root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -100,157 +165,402 @@ struct throtl_data unsigned int nr_undestroyed_grps; /* Work for dispatching throttled bios */ - struct delayed_work throtl_work; - - atomic_t limits_changed; + struct work_struct dispatch_work; }; -enum tg_state_flags { - THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ -}; +/* list and work item to allocate percpu group stats */ +static DEFINE_SPINLOCK(tg_stats_alloc_lock); +static LIST_HEAD(tg_stats_alloc_list); + +static void tg_stats_alloc_fn(struct work_struct *); +static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); -#define THROTL_TG_FNS(name) \ -static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ -{ \ - (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ -} \ -static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ -{ \ - (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ -} \ -static inline 
int throtl_tg_##name(const struct throtl_grp *tg) \ -{ \ - return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +static void throtl_pending_timer_fn(unsigned long arg); + +static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct throtl_grp, pd) : NULL; } -THROTL_TG_FNS(on_rr); +static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) +{ + return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); +} -#define throtl_log_tg(td, tg, fmt, args...) \ - blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ - blkg_path(&(tg)->blkg), ##args); \ +static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) +{ + return pd_to_blkg(&tg->pd); +} -#define throtl_log(td, fmt, args...) \ - blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) +static inline struct throtl_grp *td_root_tg(struct throtl_data *td) +{ + return blkg_to_tg(td->queue->root_blkg); +} -static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +/** + * sq_to_tg - return the throl_grp the specified service queue belongs to + * @sq: the throtl_service_queue of interest + * + * Return the throtl_grp @sq belongs to. If @sq is the top-level one + * embedded in throtl_data, %NULL is returned. + */ +static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) { - if (blkg) - return container_of(blkg, struct throtl_grp, blkg); + if (sq && sq->parent_sq) + return container_of(sq, struct throtl_grp, service_queue); + else + return NULL; +} - return NULL; +/** + * sq_to_td - return throtl_data the specified service queue belongs to + * @sq: the throtl_service_queue of interest + * + * A service_queue can be embeded in either a throtl_grp or throtl_data. + * Determine the associated throtl_data accordingly and return it. 
+ */ +static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) +{ + struct throtl_grp *tg = sq_to_tg(sq); + + if (tg) + return tg->td; + else + return container_of(sq, struct throtl_data, service_queue); } -static inline int total_nr_queued(struct throtl_data *td) +/** + * throtl_log - log debug message via blktrace + * @sq: the service_queue being reported + * @fmt: printf format string + * @args: printf args + * + * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a + * throtl_grp; otherwise, just "throtl". + * + * TODO: this should be made a function and name formatting should happen + * after testing whether blktrace is enabled. + */ +#define throtl_log(sq, fmt, args...) do { \ + struct throtl_grp *__tg = sq_to_tg((sq)); \ + struct throtl_data *__td = sq_to_td((sq)); \ + \ + (void)__td; \ + if ((__tg)) { \ + char __pbuf[128]; \ + \ + blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ + } else { \ + blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ + } \ +} while (0) + +static void tg_stats_init(struct tg_stats_cpu *tg_stats) { - return (td->nr_queued[0] + td->nr_queued[1]); + blkg_rwstat_init(&tg_stats->service_bytes); + blkg_rwstat_init(&tg_stats->serviced); } -static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +/* + * Worker for allocating per cpu stat for tgs. This is scheduled on the + * system_wq once there are some groups on the alloc_list waiting for + * allocation. 
+ */ +static void tg_stats_alloc_fn(struct work_struct *work) { - atomic_inc(&tg->ref); - return tg; + static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ + struct delayed_work *dwork = to_delayed_work(work); + bool empty = false; + +alloc_stats: + if (!stats_cpu) { + int cpu; + + stats_cpu = alloc_percpu(struct tg_stats_cpu); + if (!stats_cpu) { + /* allocation failed, try again after some time */ + schedule_delayed_work(dwork, msecs_to_jiffies(10)); + return; + } + for_each_possible_cpu(cpu) + tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); + } + + spin_lock_irq(&tg_stats_alloc_lock); + + if (!list_empty(&tg_stats_alloc_list)) { + struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, + struct throtl_grp, + stats_alloc_node); + swap(tg->stats_cpu, stats_cpu); + list_del_init(&tg->stats_alloc_node); + } + + empty = list_empty(&tg_stats_alloc_list); + spin_unlock_irq(&tg_stats_alloc_lock); + if (!empty) + goto alloc_stats; } -static void throtl_put_tg(struct throtl_grp *tg) +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { - BUG_ON(atomic_read(&tg->ref) <= 0); - if (!atomic_dec_and_test(&tg->ref)) - return; - kfree(tg); + INIT_LIST_HEAD(&qn->node); + bio_list_init(&qn->bios); + qn->tg = tg; } -static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct cgroup *cgroup) +/** + * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it + * @bio: bio being added + * @qn: qnode to add bio to + * @queued: the service_queue->queued[] list @qn belongs to + * + * Add @bio to @qn and put @qn on @queued if it's not already on. + * @qn->tg's reference count is bumped when @qn is activated. See the + * comment on top of throtl_qnode definition for details. 
+ */ +static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, + struct list_head *queued) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - struct throtl_grp *tg = NULL; - void *key = td; - struct backing_dev_info *bdi = &td->queue->backing_dev_info; - unsigned int major, minor; + bio_list_add(&qn->bios, bio); + if (list_empty(&qn->node)) { + list_add_tail(&qn->node, queued); + blkg_get(tg_to_blkg(qn->tg)); + } +} + +/** + * throtl_peek_queued - peek the first bio on a qnode list + * @queued: the qnode list to peek + */ +static struct bio *throtl_peek_queued(struct list_head *queued) +{ + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct bio *bio; + + if (list_empty(queued)) + return NULL; + + bio = bio_list_peek(&qn->bios); + WARN_ON_ONCE(!bio); + return bio; +} + +/** + * throtl_pop_queued - pop the first bio form a qnode list + * @queued: the qnode list to pop a bio from + * @tg_to_put: optional out argument for throtl_grp to put + * + * Pop the first bio from the qnode list @queued. After popping, the first + * qnode is removed from @queued if empty or moved to the end of @queued so + * that the popping order is round-robin. + * + * When the first qnode is removed, its associated throtl_grp should be put + * too. If @tg_to_put is NULL, this function automatically puts it; + * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is + * responsible for putting it. 
+ */ +static struct bio *throtl_pop_queued(struct list_head *queued, + struct throtl_grp **tg_to_put) +{ + struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct bio *bio; + + if (list_empty(queued)) + return NULL; + + bio = bio_list_pop(&qn->bios); + WARN_ON_ONCE(!bio); + + if (bio_list_empty(&qn->bios)) { + list_del_init(&qn->node); + if (tg_to_put) + *tg_to_put = qn->tg; + else + blkg_put(tg_to_blkg(qn->tg)); + } else { + list_move_tail(&qn->node, queued); + } + + return bio; +} + +/* init a service_queue, assumes the caller zeroed it */ +static void throtl_service_queue_init(struct throtl_service_queue *sq, + struct throtl_service_queue *parent_sq) +{ + INIT_LIST_HEAD(&sq->queued[0]); + INIT_LIST_HEAD(&sq->queued[1]); + sq->pending_tree = RB_ROOT; + sq->parent_sq = parent_sq; + setup_timer(&sq->pending_timer, throtl_pending_timer_fn, + (unsigned long)sq); +} + +static void throtl_service_queue_exit(struct throtl_service_queue *sq) +{ + del_timer_sync(&sq->pending_timer); +} + +static void throtl_pd_init(struct blkcg_gq *blkg) +{ + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_data *td = blkg->q->td; + struct throtl_service_queue *parent_sq; + unsigned long flags; + int rw; /* - * TODO: Speed up blkiocg_lookup_group() by maintaining a radix - * tree of blkg (instead of traversing through hash list all - * the time. + * If sane_hierarchy is enabled, we switch to properly hierarchical + * behavior where limits on a given throtl_grp are applied to the + * whole subtree rather than just the group itself. e.g. If 16M + * read_bps limit is set on the root group, the whole system can't + * exceed 16M for the device. + * + * If sane_hierarchy is not enabled, the broken flat hierarchy + * behavior is retained where all throtl_grps are treated as if + * they're all separate root groups right below throtl_data. 
+ * Limits of a group don't interact with limits of other groups + * regardless of the position of the group in the hierarchy. */ + parent_sq = &td->service_queue; - /* - * This is the common case when there are no blkio cgroups. - * Avoid lookup in this case - */ - if (blkcg == &blkio_root_cgroup) - tg = &td->root_tg; - else - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) + parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - /* Fill in device details for root group */ - if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - tg->blkg.dev = MKDEV(major, minor); - goto done; + throtl_service_queue_init(&tg->service_queue, parent_sq); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); } - if (tg) - goto done; + RB_CLEAR_NODE(&tg->rb_node); + tg->td = td; - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); - if (!tg) - goto done; + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; - INIT_HLIST_NODE(&tg->tg_node); - RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); + /* + * Ugh... We need to perform per-cpu allocation for tg->stats_cpu + * but percpu allocator can't be called from IO path. Queue tg on + * tg_stats_alloc_list and allocate from work item. + */ + spin_lock_irqsave(&tg_stats_alloc_lock, flags); + list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); + schedule_delayed_work(&tg_stats_alloc_work, 0); + spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); +} + +/* + * Set has_rules[] if @tg or any of its parents have limits configured. + * This doesn't require walking up to the top of the hierarchy as the + * parent's has_rules[] is guaranteed to be correct. 
+ */ +static void tg_update_has_rules(struct throtl_grp *tg) +{ + struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); + int rw; + for (rw = READ; rw <= WRITE; rw++) + tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + (tg->bps[rw] != -1 || tg->iops[rw] != -1); +} + +static void throtl_pd_online(struct blkcg_gq *blkg) +{ /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * request queue which will be dropped by either request queue - * exit or cgroup deletion path depending on who is exiting first. + * We don't want new groups to escape the limits of its ancestors. + * Update has_rules[] after a new group is brought online. */ - atomic_set(&tg->ref, 1); + tg_update_has_rules(blkg_to_tg(blkg)); +} + +static void throtl_pd_exit(struct blkcg_gq *blkg) +{ + struct throtl_grp *tg = blkg_to_tg(blkg); + unsigned long flags; - /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - MKDEV(major, minor), BLKIO_POLICY_THROTL); + spin_lock_irqsave(&tg_stats_alloc_lock, flags); + list_del_init(&tg->stats_alloc_node); + spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); - tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); - tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); - tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + free_percpu(tg->stats_cpu); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; -done: - return tg; + throtl_service_queue_exit(&tg->service_queue); +} + +static void throtl_pd_reset_stats(struct blkcg_gq *blkg) +{ + struct throtl_grp *tg = blkg_to_tg(blkg); + int cpu; + + if (tg->stats_cpu == NULL) + return; + + for_each_possible_cpu(cpu) { + struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); + + blkg_rwstat_reset(&sc->service_bytes); + 
blkg_rwstat_reset(&sc->serviced); + } +} + +static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, + struct blkcg *blkcg) +{ + /* + * This is the common case when there are no blkcgs. Avoid lookup + * in this case + */ + if (blkcg == &blkcg_root) + return td_root_tg(td); + + return blkg_to_tg(blkg_lookup(blkcg, td->queue)); } -static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, + struct blkcg *blkcg) { - struct cgroup *cgroup; + struct request_queue *q = td->queue; struct throtl_grp *tg = NULL; - rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - tg = throtl_find_alloc_tg(td, cgroup); - if (!tg) - tg = &td->root_tg; - rcu_read_unlock(); + /* + * This is the common case when there are no blkcgs. Avoid lookup + * in this case + */ + if (blkcg == &blkcg_root) { + tg = td_root_tg(td); + } else { + struct blkcg_gq *blkg; + + blkg = blkg_lookup_create(blkcg, q); + + /* if %NULL and @q is alive, fall back to root_tg */ + if (!IS_ERR(blkg)) + tg = blkg_to_tg(blkg); + else if (!blk_queue_dying(q)) + tg = td_root_tg(td); + } + return tg; } -static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +static struct throtl_grp * +throtl_rb_first(struct throtl_service_queue *parent_sq) { /* Service tree is empty */ - if (!root->count) + if (!parent_sq->nr_pending) return NULL; - if (!root->left) - root->left = rb_first(&root->rb); + if (!parent_sq->first_pending) + parent_sq->first_pending = rb_first(&parent_sq->pending_tree); - if (root->left) - return rb_entry_tg(root->left); + if (parent_sq->first_pending) + return rb_entry_tg(parent_sq->first_pending); return NULL; } @@ -261,29 +571,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root) RB_CLEAR_NODE(n); } -static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +static void throtl_rb_erase(struct rb_node *n, + struct throtl_service_queue *parent_sq) { - if 
(root->left == n) - root->left = NULL; - rb_erase_init(n, &root->rb); - --root->count; + if (parent_sq->first_pending == n) + parent_sq->first_pending = NULL; + rb_erase_init(n, &parent_sq->pending_tree); + --parent_sq->nr_pending; } -static void update_min_dispatch_time(struct throtl_rb_root *st) +static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; - tg = throtl_rb_first(st); + tg = throtl_rb_first(parent_sq); if (!tg) return; - st->min_disptime = tg->disptime; + parent_sq->first_pending_disptime = tg->disptime; } -static void -tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +static void tg_service_queue_add(struct throtl_grp *tg) { - struct rb_node **node = &st->rb.rb_node; + struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; + struct rb_node **node = &parent_sq->pending_tree.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; @@ -302,99 +613,144 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) } if (left) - st->left = &tg->rb_node; + parent_sq->first_pending = &tg->rb_node; rb_link_node(&tg->rb_node, parent, node); - rb_insert_color(&tg->rb_node, &st->rb); + rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); } -static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +static void __throtl_enqueue_tg(struct throtl_grp *tg) { - struct throtl_rb_root *st = &td->tg_service_tree; + tg_service_queue_add(tg); + tg->flags |= THROTL_TG_PENDING; + tg->service_queue.parent_sq->nr_pending++; +} - tg_service_tree_add(st, tg); - throtl_mark_tg_on_rr(tg); - st->count++; +static void throtl_enqueue_tg(struct throtl_grp *tg) +{ + if (!(tg->flags & THROTL_TG_PENDING)) + __throtl_enqueue_tg(tg); } -static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +static void __throtl_dequeue_tg(struct throtl_grp *tg) { - if (!throtl_tg_on_rr(tg)) - __throtl_enqueue_tg(td, tg); + 
throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + tg->flags &= ~THROTL_TG_PENDING; } -static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +static void throtl_dequeue_tg(struct throtl_grp *tg) { - throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); - throtl_clear_tg_on_rr(tg); + if (tg->flags & THROTL_TG_PENDING) + __throtl_dequeue_tg(tg); } -static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +/* Call with queue lock held */ +static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, + unsigned long expires) { - if (throtl_tg_on_rr(tg)) - __throtl_dequeue_tg(td, tg); + mod_timer(&sq->pending_timer, expires); + throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", + expires - jiffies, jiffies); } -static void throtl_schedule_next_dispatch(struct throtl_data *td) +/** + * throtl_schedule_next_dispatch - schedule the next dispatch cycle + * @sq: the service_queue to schedule dispatch for + * @force: force scheduling + * + * Arm @sq->pending_timer so that the next dispatch cycle starts on the + * dispatch time of the first pending child. Returns %true if either timer + * is armed or there's no pending child left. %false if the current + * dispatch window is still open and the caller should continue + * dispatching. + * + * If @force is %true, the dispatch timer is always scheduled and this + * function is guaranteed to return %true. This is to be used when the + * caller can't dispatch itself and needs to invoke pending_timer + * unconditionally. Note that forced scheduling is likely to induce short + * delay before dispatch starts even if @sq->first_pending_disptime is not + * in the future and thus shouldn't be used in hot paths. + */ +static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, + bool force) { - struct throtl_rb_root *st = &td->tg_service_tree; + /* any pending children left? 
*/ + if (!sq->nr_pending) + return true; - /* - * If there are more bios pending, schedule more work. - */ - if (!total_nr_queued(td)) - return; + update_min_dispatch_time(sq); - BUG_ON(!st->count); + /* is the next dispatch time in the future? */ + if (force || time_after(sq->first_pending_disptime, jiffies)) { + throtl_schedule_pending_timer(sq, sq->first_pending_disptime); + return true; + } - update_min_dispatch_time(st); + /* tell the caller to continue dispatching */ + return false; +} - if (time_before_eq(st->min_disptime, jiffies)) - throtl_schedule_delayed_work(td, 0); - else - throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); +static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, + bool rw, unsigned long start) +{ + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + + /* + * Previous slice has expired. We must have trimmed it after last + * bio dispatch. That means since start of last slice, we never used + * that bandwidth. Do try to make use of that bandwidth while giving + * credit. + */ + if (time_after_eq(start, tg->slice_start[rw])) + tg->slice_start[rw] = start; + + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log(&tg->service_queue, + "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); } -static inline void -throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + throtl_slice; - throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', tg->slice_start[rw], - tg->slice_end[rw], jiffies); + throtl_log(&tg->service_queue, + "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 
'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); } -static inline void throtl_set_slice_end(struct throtl_data *td, - struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, + unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); } -static inline void throtl_extend_slice(struct throtl_data *td, - struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, + unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); - throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', tg->slice_start[rw], - tg->slice_end[rw], jiffies); + throtl_log(&tg->service_queue, + "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ -static bool -throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) - return 0; + return false; return 1; } /* Trim the used slices and adjust slice start accordingly */ -static inline void -throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long nr_slices, time_elapsed, io_trim; u64 bytes_trim, tmp; @@ -406,7 +762,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. 
*/ - if (throtl_slice_used(td, tg, rw)) + if (throtl_slice_used(tg, rw)) return; /* @@ -417,7 +773,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); + throtl_set_slice_end(tg, rw, jiffies + throtl_slice); time_elapsed = jiffies - tg->slice_start[rw]; @@ -446,14 +802,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" - " start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, - tg->slice_start[rw], tg->slice_end[rw], jiffies); + throtl_log(&tg->service_queue, + "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); } -static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned int io_allowed; @@ -486,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -502,8 +858,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, return 0; } -static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes, tmp; @@ -521,14 +877,14 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, do_div(tmp, HZ); 
bytes_allowed = tmp; - if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ - extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); if (!jiffy_wait) @@ -548,8 +904,8 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched */ -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; @@ -560,13 +916,14 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * this function with a different bio if there are other bios * queued. */ - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + BUG_ON(tg->service_queue.nr_queued[rw] && + bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (wait) *wait = 0; - return 1; + return true; } /* @@ -574,15 +931,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * existing slice to make sure it is at least throtl_slice interval * long since now. 
*/ - if (throtl_slice_used(td, tg, rw)) - throtl_start_new_slice(td, tg, rw); + if (throtl_slice_used(tg, rw)) + throtl_start_new_slice(tg, rw); else { if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + throtl_extend_slice(tg, rw, jiffies + throtl_slice); } - if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) - && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (tg_with_in_bps_limit(tg, bio, &bps_wait) && + tg_with_in_iops_limit(tg, bio, &iops_wait)) { if (wait) *wait = 0; return 1; @@ -594,83 +951,175 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, *wait = max_wait; if (time_before(tg->slice_end[rw], jiffies + max_wait)) - throtl_extend_slice(td, tg, rw, jiffies + max_wait); + throtl_extend_slice(tg, rw, jiffies + max_wait); return 0; } +static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, + int rw) +{ + struct throtl_grp *tg = blkg_to_tg(blkg); + struct tg_stats_cpu *stats_cpu; + unsigned long flags; + + /* If per cpu stats are not allocated yet, don't do any accounting. */ + if (tg->stats_cpu == NULL) + return; + + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. + */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(tg->stats_cpu); + + blkg_rwstat_add(&stats_cpu->serviced, rw, 1); + blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); + + local_irq_restore(flags); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - bool sync = bio->bi_rw & REQ_SYNC; /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio->bi_size; + tg->bytes_disp[rw] += bio->bi_iter.bi_size; tg->io_disp[rw]++; /* - * TODO: This will take blkg->stats_lock. Figure out a way - * to avoid this cost. 
+ * REQ_THROTTLED is used to prevent the same bio to be throttled + * more than once as a throttled bio will go through blk-throtl the + * second time when it eventually gets issued. Set it when a bio + * is being charged to a tg. + * + * Dispatch stats aren't recursive and each @bio should only be + * accounted by the @tg it was originally associated with. Let's + * update the stats when setting REQ_THROTTLED for the first time + * which is guaranteed to be for the @bio's original tg. */ - blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + if (!(bio->bi_rw & REQ_THROTTLED)) { + bio->bi_rw |= REQ_THROTTLED; + throtl_update_dispatch_stats(tg_to_blkg(tg), + bio->bi_iter.bi_size, bio->bi_rw); + } } -static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio) +/** + * throtl_add_bio_tg - add a bio to the specified throtl_grp + * @bio: bio to add + * @qn: qnode to use + * @tg: the target throtl_grp + * + * Add @bio to @tg's service_queue using @qn. If @qn is not specified, + * tg->qnode_on_self[] is used. + */ +static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, + struct throtl_grp *tg) { + struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio); - bio_list_add(&tg->bio_lists[rw], bio); - /* Take a bio reference on tg */ - throtl_ref_get_tg(tg); - tg->nr_queued[rw]++; - td->nr_queued[rw]++; - throtl_enqueue_tg(td, tg); + if (!qn) + qn = &tg->qnode_on_self[rw]; + + /* + * If @tg doesn't currently have any bios queued in the same + * direction, queueing @bio can change when @tg should be + * dispatched. Mark that @tg was empty. This is automatically + * cleaered on the next tg_update_disptime(). 
+ */ + if (!sq->nr_queued[rw]) + tg->flags |= THROTL_TG_WAS_EMPTY; + + throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); + + sq->nr_queued[rw]++; + throtl_enqueue_tg(tg); } -static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +static void tg_update_disptime(struct throtl_grp *tg) { + struct throtl_service_queue *sq = &tg->service_queue; unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; - if ((bio = bio_list_peek(&tg->bio_lists[READ]))) - tg_may_dispatch(td, tg, bio, &read_wait); + if ((bio = throtl_peek_queued(&sq->queued[READ]))) + tg_may_dispatch(tg, bio, &read_wait); - if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) - tg_may_dispatch(td, tg, bio, &write_wait); + if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; /* Update dispatch time */ - throtl_dequeue_tg(td, tg); + throtl_dequeue_tg(tg); tg->disptime = disptime; - throtl_enqueue_tg(td, tg); + throtl_enqueue_tg(tg); + + /* see throtl_add_bio_tg() */ + tg->flags &= ~THROTL_TG_WAS_EMPTY; } -static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, - bool rw, struct bio_list *bl) +static void start_parent_slice_with_credit(struct throtl_grp *child_tg, + struct throtl_grp *parent_tg, bool rw) { - struct bio *bio; + if (throtl_slice_used(parent_tg, rw)) { + throtl_start_new_slice_with_credit(parent_tg, rw, + child_tg->slice_start[rw]); + } - bio = bio_list_pop(&tg->bio_lists[rw]); - tg->nr_queued[rw]--; - /* Drop bio reference on tg */ - throtl_put_tg(tg); +} + +static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) +{ + struct throtl_service_queue *sq = &tg->service_queue; + struct throtl_service_queue *parent_sq = sq->parent_sq; + struct throtl_grp *parent_tg = sq_to_tg(parent_sq); + struct throtl_grp *tg_to_put = NULL; + struct bio *bio; - BUG_ON(td->nr_queued[rw] <= 0); - td->nr_queued[rw]--; + /* + * @bio 
is being transferred from @tg to @parent_sq. Popping a bio + * from @tg may put its reference and @parent_sq might end up + * getting released prematurely. Remember the tg to put and put it + * after @bio is transferred to @parent_sq. + */ + bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); + sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); - bio_list_add(bl, bio); - bio->bi_rw |= REQ_THROTTLED; - throtl_trim_slice(td, tg, rw); + /* + * If our parent is another tg, we just need to transfer @bio to + * the parent using throtl_add_bio_tg(). If our parent is + * @td->service_queue, @bio is ready to be issued. Put it on its + * bio_lists[] and decrease total number queued. The caller is + * responsible for issuing these bios. + */ + if (parent_tg) { + throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); + start_parent_slice_with_credit(tg, parent_tg, rw); + } else { + throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], + &parent_sq->queued[rw]); + BUG_ON(tg->td->nr_queued[rw] <= 0); + tg->td->nr_queued[rw]--; + } + + throtl_trim_slice(tg, rw); + + if (tg_to_put) + blkg_put(tg_to_blkg(tg_to_put)); } -static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, - struct bio_list *bl) +static int throtl_dispatch_tg(struct throtl_grp *tg) { + struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = throtl_grp_quantum*3/4; unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; @@ -678,20 +1127,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, /* Try to dispatch 75% READS and 25% WRITES */ - while ((bio = bio_list_peek(&tg->bio_lists[READ])) - && tg_may_dispatch(td, tg, bio, NULL)) { + while ((bio = throtl_peek_queued(&sq->queued[READ])) && + tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + tg_dispatch_one_bio(tg, bio_data_dir(bio)); nr_reads++; if (nr_reads >= max_nr_reads) break; } - while ((bio = 
bio_list_peek(&tg->bio_lists[WRITE])) - && tg_may_dispatch(td, tg, bio, NULL)) { + while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && + tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + tg_dispatch_one_bio(tg, bio_data_dir(bio)); nr_writes++; if (nr_writes >= max_nr_writes) @@ -701,14 +1150,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, return nr_reads + nr_writes; } -static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; - struct throtl_grp *tg; - struct throtl_rb_root *st = &td->tg_service_tree; while (1) { - tg = throtl_rb_first(st); + struct throtl_grp *tg = throtl_rb_first(parent_sq); + struct throtl_service_queue *sq = &tg->service_queue; if (!tg) break; @@ -716,14 +1164,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) if (time_before(jiffies, tg->disptime)) break; - throtl_dequeue_tg(td, tg); + throtl_dequeue_tg(tg); - nr_disp += throtl_dispatch_tg(td, tg, bl); + nr_disp += throtl_dispatch_tg(tg); - if (tg->nr_queued[0] || tg->nr_queued[1]) { - tg_update_disptime(td, tg); - throtl_enqueue_tg(td, tg); - } + if (sq->nr_queued[0] || sq->nr_queued[1]) + tg_update_disptime(tg); if (nr_disp >= throtl_quantum) break; @@ -732,408 +1178,510 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) return nr_disp; } -static void throtl_process_limit_change(struct throtl_data *td) +/** + * throtl_pending_timer_fn - timer function for service_queue->pending_timer + * @arg: the throtl_service_queue being serviced + * + * This timer is armed when a child throtl_grp with active bio's become + * pending and queued on the service_queue's pending_tree and expires when + * the first child throtl_grp should be dispatched. 
This function + * dispatches bio's from the children throtl_grps to the parent + * service_queue. + * + * If the parent's parent is another throtl_grp, dispatching is propagated + * by either arming its pending_timer or repeating dispatch directly. If + * the top-level service_tree is reached, throtl_data->dispatch_work is + * kicked so that the ready bio's are issued. + */ +static void throtl_pending_timer_fn(unsigned long arg) { - struct throtl_grp *tg; - struct hlist_node *pos, *n; + struct throtl_service_queue *sq = (void *)arg; + struct throtl_grp *tg = sq_to_tg(sq); + struct throtl_data *td = sq_to_td(sq); + struct request_queue *q = td->queue; + struct throtl_service_queue *parent_sq; + bool dispatched; + int ret; - if (!atomic_read(&td->limits_changed)) - return; + spin_lock_irq(q->queue_lock); +again: + parent_sq = sq->parent_sq; + dispatched = false; + + while (true) { + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", + sq->nr_queued[READ] + sq->nr_queued[WRITE], + sq->nr_queued[READ], sq->nr_queued[WRITE]); + + ret = throtl_select_dispatch(sq); + if (ret) { + throtl_log(sq, "bios disp=%u", ret); + dispatched = true; + } - throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + if (throtl_schedule_next_dispatch(sq, false)) + break; - /* - * Make sure updates from throtl_update_blkio_group_read_bps() group - * of functions to tg->limits_changed are visible. We do not - * want update td->limits_changed to be visible but update to - * tg->limits_changed not being visible yet on this cpu. Hence - * the read barrier. 
- */ - smp_rmb(); - - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - if (throtl_tg_on_rr(tg) && tg->limits_changed) { - throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" - " riops=%u wiops=%u", tg->bps[READ], - tg->bps[WRITE], tg->iops[READ], - tg->iops[WRITE]); - tg_update_disptime(td, tg); - tg->limits_changed = false; - } + /* this dispatch windows is still open, relax and repeat */ + spin_unlock_irq(q->queue_lock); + cpu_relax(); + spin_lock_irq(q->queue_lock); } - smp_mb__before_atomic_dec(); - atomic_dec(&td->limits_changed); - smp_mb__after_atomic_dec(); + if (!dispatched) + goto out_unlock; + + if (parent_sq) { + /* @parent_sq is another throl_grp, propagate dispatch */ + if (tg->flags & THROTL_TG_WAS_EMPTY) { + tg_update_disptime(tg); + if (!throtl_schedule_next_dispatch(parent_sq, false)) { + /* window is already open, repeat dispatching */ + sq = parent_sq; + tg = sq_to_tg(sq); + goto again; + } + } + } else { + /* reached the top-level, queue issueing */ + queue_work(kthrotld_workqueue, &td->dispatch_work); + } +out_unlock: + spin_unlock_irq(q->queue_lock); } -/* Dispatch throttled bios. Should be called without queue lock held. */ -static int throtl_dispatch(struct request_queue *q) +/** + * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work + * @work: work item being executed + * + * This function is queued for execution when bio's reach the bio_lists[] + * of throtl_data->service_queue. Those bio's are ready and issued by this + * function. 
+ */ +static void blk_throtl_dispatch_work_fn(struct work_struct *work) { - struct throtl_data *td = q->td; - unsigned int nr_disp = 0; + struct throtl_data *td = container_of(work, struct throtl_data, + dispatch_work); + struct throtl_service_queue *td_sq = &td->service_queue; + struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; - - spin_lock_irq(q->queue_lock); - - throtl_process_limit_change(td); - - if (!total_nr_queued(td)) - goto out; + struct blk_plug plug; + int rw; bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", - total_nr_queued(td), td->nr_queued[READ], - td->nr_queued[WRITE]); - - nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); - - if (nr_disp) - throtl_log(td, "bios disp=%u", nr_disp); - - throtl_schedule_next_dispatch(td); -out: + spin_lock_irq(q->queue_lock); + for (rw = READ; rw <= WRITE; rw++) + while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) + bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(q->queue_lock); - /* - * If we dispatched some requests, unplug the queue to make sure - * immediate dispatch - */ - if (nr_disp) { + if (!bio_list_empty(&bio_list_on_stack)) { + blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); - blk_unplug(q); + blk_finish_plug(&plug); } - return nr_disp; -} - -void blk_throtl_work(struct work_struct *work) -{ - struct throtl_data *td = container_of(work, struct throtl_data, - throtl_work.work); - struct request_queue *q = td->queue; - - throtl_dispatch(q); } -/* Call with queue lock held */ -static void -throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) +static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { + struct throtl_grp *tg = pd_to_tg(pd); + struct blkg_rwstat rwstat = { }, tmp; + int i, cpu; - struct delayed_work *dwork = &td->throtl_work; + for_each_possible_cpu(cpu) { + struct tg_stats_cpu 
*sc = per_cpu_ptr(tg->stats_cpu, cpu); - if (total_nr_queued(td) > 0) { - /* - * We might have a work scheduled to be executed in future. - * Cancel that and schedule a new one. - */ - __cancel_delayed_work(dwork); - queue_delayed_work(kthrotld_workqueue, dwork, delay); - throtl_log(td, "schedule work. delay=%lu jiffies=%lu", - delay, jiffies); + tmp = blkg_rwstat_read((void *)sc + off); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + rwstat.cnt[i] += tmp.cnt[i]; } + + return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static void -throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) { - /* Something wrong if we are trying to remove same group twice */ - BUG_ON(hlist_unhashed(&tg->tg_node)); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + &blkcg_policy_throtl, seq_cft(sf)->private, true); + return 0; +} - hlist_del_init(&tg->tg_node); +static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + u64 v = *(u64 *)((void *)tg + off); - /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. - */ - throtl_put_tg(tg); - td->nr_undestroyed_grps--; + if (v == -1) + return 0; + return __blkg_prfill_u64(sf, pd, v); } -static void throtl_release_tgs(struct throtl_data *td) +static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, + int off) { - struct hlist_node *pos, *n; - struct throtl_grp *tg; + struct throtl_grp *tg = pd_to_tg(pd); + unsigned int v = *(unsigned int *)((void *)tg + off); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - /* - * If cgroup removal path got to blk_group first and removed - * it from cgroup list, then it will take care of destroying - * cfqg also. 
- */ - if (!blkiocg_del_blkio_group(&tg->blkg)) - throtl_destroy_tg(td, tg); - } + if (v == -1) + return 0; + return __blkg_prfill_u64(sf, pd, v); } -static void throtl_td_free(struct throtl_data *td) +static int tg_print_conf_u64(struct seq_file *sf, void *v) { - kfree(td); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; } -/* - * Blk cgroup controller notification saying that blkio_group object is being - * delinked as associated cgroup object is going away. That also means that - * no new IO will come in this group. So get rid of this group as soon as - * any pending IO in the group is finished. - * - * This function is called under rcu_read_lock(). key is the rcu protected - * pointer. That means "key" is a valid throtl_data pointer as long as we are - * rcu read lock. - * - * "key" was fetched from blkio_group under blkio_cgroup->lock. That means - * it should not be NULL as even if queue was going away, cgroup deltion - * path got to it first. - */ -void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +static int tg_print_conf_uint(struct seq_file *sf, void *v) { - unsigned long flags; - struct throtl_data *td = key; - - spin_lock_irqsave(td->queue->queue_lock, flags); - throtl_destroy_tg(td, tg_of_blkg(blkg)); - spin_unlock_irqrestore(td->queue->queue_lock, flags); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; } -/* - * For all update functions, key should be a valid pointer because these - * update functions are called under blkcg_lock, that means, blkg is - * valid and in turn key is valid. queue exit path can not race becuase - * of blkcg_lock - * - * Can not take queue lock in update functions as queue lock under blkcg_lock - * is not allowed. Under other paths we take blkcg_lock under queue_lock. 
- */ -static void throtl_update_blkio_group_read_bps(void *key, - struct blkio_group *blkg, u64 read_bps) +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) { - struct throtl_data *td = key; + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + struct throtl_service_queue *sq; + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; + int ret; - tg_of_blkg(blkg)->bps[READ] = read_bps; - /* Make sure read_bps is updated before setting limits_changed */ - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; - /* Make sure tg->limits_changed is updated before td->limits_changed */ - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); + tg = blkg_to_tg(ctx.blkg); + sq = &tg->service_queue; - /* Schedule a work now to process the limit change */ - throtl_schedule_delayed_work(td, 0); -} + if (!ctx.v) + ctx.v = -1; -static void throtl_update_blkio_group_write_bps(void *key, - struct blkio_group *blkg, u64 write_bps) -{ - struct throtl_data *td = key; + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; - tg_of_blkg(blkg)->bps[WRITE] = write_bps; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); -} + throtl_log(&tg->service_queue, + "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", + tg->bps[READ], tg->bps[WRITE], + tg->iops[READ], tg->iops[WRITE]); -static void throtl_update_blkio_group_read_iops(void *key, - struct blkio_group *blkg, unsigned int read_iops) -{ - struct throtl_data *td = key; + /* + * Update has_rules[] flags for the updated tg's subtree. 
A tg is + * considered to have rules if either the tg itself or any of its + * ancestors has rules. This identifies groups without any + * restrictions in the whole hierarchy and allows them to bypass + * blk-throttle. + */ + blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + tg_update_has_rules(blkg_to_tg(blkg)); + + /* + * We're already holding queue_lock and know @tg is valid. Let's + * apply the new config directly. + * + * Restart the slices for both READ and WRITES. It might happen + * that a group's limit are dropped suddenly and we don't want to + * account recently dispatched IO with new low rate. + */ + throtl_start_new_slice(tg, 0); + throtl_start_new_slice(tg, 1); + + if (tg->flags & THROTL_TG_PENDING) { + tg_update_disptime(tg); + throtl_schedule_next_dispatch(sq->parent_sq, true); + } - tg_of_blkg(blkg)->iops[READ] = read_iops; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + blkg_conf_finish(&ctx); + return nbytes; } -static void throtl_update_blkio_group_write_iops(void *key, - struct blkio_group *blkg, unsigned int write_iops) +static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct throtl_data *td = key; + return tg_set_conf(of, buf, nbytes, off, true); +} - tg_of_blkg(blkg)->iops[WRITE] = write_iops; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); +static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return tg_set_conf(of, buf, nbytes, off, false); } -void throtl_shutdown_timer_wq(struct request_queue *q) +static struct cftype throtl_files[] = { + { + .name = "throttle.read_bps_device", + .private = offsetof(struct throtl_grp, bps[READ]), + .seq_show = 
tg_print_conf_u64, + .write = tg_set_conf_u64, + }, + { + .name = "throttle.write_bps_device", + .private = offsetof(struct throtl_grp, bps[WRITE]), + .seq_show = tg_print_conf_u64, + .write = tg_set_conf_u64, + }, + { + .name = "throttle.read_iops_device", + .private = offsetof(struct throtl_grp, iops[READ]), + .seq_show = tg_print_conf_uint, + .write = tg_set_conf_uint, + }, + { + .name = "throttle.write_iops_device", + .private = offsetof(struct throtl_grp, iops[WRITE]), + .seq_show = tg_print_conf_uint, + .write = tg_set_conf_uint, + }, + { + .name = "throttle.io_service_bytes", + .private = offsetof(struct tg_stats_cpu, service_bytes), + .seq_show = tg_print_cpu_rwstat, + }, + { + .name = "throttle.io_serviced", + .private = offsetof(struct tg_stats_cpu, serviced), + .seq_show = tg_print_cpu_rwstat, + }, + { } /* terminate */ +}; + +static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; - cancel_delayed_work_sync(&td->throtl_work); + cancel_work_sync(&td->dispatch_work); } -static struct blkio_policy_type blkio_policy_throtl = { - .ops = { - .blkio_unlink_group_fn = throtl_unlink_blkio_group, - .blkio_update_group_read_bps_fn = - throtl_update_blkio_group_read_bps, - .blkio_update_group_write_bps_fn = - throtl_update_blkio_group_write_bps, - .blkio_update_group_read_iops_fn = - throtl_update_blkio_group_read_iops, - .blkio_update_group_write_iops_fn = - throtl_update_blkio_group_write_iops, - }, - .plid = BLKIO_POLICY_THROTL, +static struct blkcg_policy blkcg_policy_throtl = { + .pd_size = sizeof(struct throtl_grp), + .cftypes = throtl_files, + + .pd_init_fn = throtl_pd_init, + .pd_online_fn = throtl_pd_online, + .pd_exit_fn = throtl_pd_exit, + .pd_reset_stats_fn = throtl_pd_reset_stats, }; -int blk_throtl_bio(struct request_queue *q, struct bio **biop) +bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { struct throtl_data *td = q->td; + struct throtl_qnode *qn = NULL; struct throtl_grp *tg; - struct bio *bio = 
*biop; - bool rw = bio_data_dir(bio), update_disptime = true; + struct throtl_service_queue *sq; + bool rw = bio_data_dir(bio); + struct blkcg *blkcg; + bool throttled = false; - if (bio->bi_rw & REQ_THROTTLED) { - bio->bi_rw &= ~REQ_THROTTLED; - return 0; + /* see throtl_charge_bio() */ + if (bio->bi_rw & REQ_THROTTLED) + goto out; + + /* + * A throtl_grp pointer retrieved under rcu can be used to access + * basic fields like stats and io rates. If a group has no rules, + * just update the dispatch stats in lockless manner and return. + */ + rcu_read_lock(); + blkcg = bio_blkcg(bio); + tg = throtl_lookup_tg(td, blkcg); + if (tg) { + if (!tg->has_rules[rw]) { + throtl_update_dispatch_stats(tg_to_blkg(tg), + bio->bi_iter.bi_size, bio->bi_rw); + goto out_unlock_rcu; + } } + /* + * Either group has not been allocated yet or it is not an unlimited + * IO group + */ spin_lock_irq(q->queue_lock); - tg = throtl_get_tg(td); + tg = throtl_lookup_create_tg(td, blkcg); + if (unlikely(!tg)) + goto out_unlock; - if (tg->nr_queued[rw]) { - /* - * There is already another bio queued in same dir. No - * need to update dispatch time. - * Still update the disptime if rate limits on this group - * were changed. - */ - if (!tg->limits_changed) - update_disptime = false; - else - tg->limits_changed = false; + sq = &tg->service_queue; - goto queue_bio; - } + while (true) { + /* throtl is FIFO - if bios are already queued, should queue */ + if (sq->nr_queued[rw]) + break; + + /* if above limits, break to queue */ + if (!tg_may_dispatch(tg, bio, NULL)) + break; - /* Bio is with-in rate limit of group */ - if (tg_may_dispatch(td, tg, bio, NULL)) { + /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); - goto out; + + /* + * We need to trim slice even when bios are not being queued + * otherwise it might happen that a bio is not queued for + * a long time and slice keeps on extending and trim is not + * called for a long time. 
Now if limits are reduced suddenly + * we take into account all the IO dispatched so far at new + * low rate and * newly queued IO gets a really long dispatch + * time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + + /* + * @bio passed through this layer without being throttled. + * Climb up the ladder. If we''re already at the top, it + * can be executed directly. + */ + qn = &tg->qnode_on_parent[rw]; + sq = sq->parent_sq; + tg = sq_to_tg(sq); + if (!tg) + goto out_unlock; } -queue_bio: - throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" - " iodisp=%u iops=%u queued=%d/%d", - rw == READ ? 'R' : 'W', - tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], - tg->io_disp[rw], tg->iops[rw], - tg->nr_queued[READ], tg->nr_queued[WRITE]); + /* out-of-limit, queue to @tg */ + throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], + sq->nr_queued[READ], sq->nr_queued[WRITE]); - throtl_add_bio_tg(q->td, tg, bio); - *biop = NULL; + bio_associate_current(bio); + tg->td->nr_queued[rw]++; + throtl_add_bio_tg(bio, qn, tg); + throttled = true; - if (update_disptime) { - tg_update_disptime(td, tg); - throtl_schedule_next_dispatch(td); + /* + * Update @tg's dispatch time and force schedule dispatch if @tg + * was empty before @bio. The forced scheduling isn't likely to + * cause undue delay as @bio is likely to be dispatched directly if + * its @tg's disptime is not in the future. + */ + if (tg->flags & THROTL_TG_WAS_EMPTY) { + tg_update_disptime(tg); + throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } -out: +out_unlock: spin_unlock_irq(q->queue_lock); - return 0; +out_unlock_rcu: + rcu_read_unlock(); +out: + /* + * As multiple blk-throtls may stack in the same issue path, we + * don't want bios to leave with the flag set. Clear the flag if + * being issued. 
+ */ + if (!throttled) + bio->bi_rw &= ~REQ_THROTTLED; + return throttled; } -int blk_throtl_init(struct request_queue *q) +/* + * Dispatch all bios from all children tg's queued on @parent_sq. On + * return, @parent_sq is guaranteed to not have any active children tg's + * and all bios from previously active tg's are on @parent_sq->bio_lists[]. + */ +static void tg_drain_bios(struct throtl_service_queue *parent_sq) { - struct throtl_data *td; struct throtl_grp *tg; - td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); - if (!td) - return -ENOMEM; + while ((tg = throtl_rb_first(parent_sq))) { + struct throtl_service_queue *sq = &tg->service_queue; + struct bio *bio; - INIT_HLIST_HEAD(&td->tg_list); - td->tg_service_tree = THROTL_RB_ROOT; - atomic_set(&td->limits_changed, 0); + throtl_dequeue_tg(tg); - /* Init root group */ - tg = &td->root_tg; - INIT_HLIST_NODE(&tg->tg_node); - RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); + while ((bio = throtl_peek_queued(&sq->queued[READ]))) + tg_dispatch_one_bio(tg, bio_data_dir(bio)); + while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + tg_dispatch_one_bio(tg, bio_data_dir(bio)); + } +} + +/** + * blk_throtl_drain - drain throttled bios + * @q: request_queue to drain throttled bios for + * + * Dispatch all currently throttled bios on @q through ->make_request_fn(). + */ +void blk_throtl_drain(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct throtl_data *td = q->td; + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; + struct bio *bio; + int rw; - /* Practically unlimited BW */ - tg->bps[0] = tg->bps[1] = -1; - tg->iops[0] = tg->iops[1] = -1; + queue_lockdep_assert_held(q); + rcu_read_lock(); /* - * Set root group reference to 2. One reference will be dropped when - * all groups on tg_list are being deleted during queue exit. 
Other - * reference will remain there as we don't want to delete this group - * as it is statically allocated and gets destroyed when throtl_data - * goes away. + * Drain each tg while doing post-order walk on the blkg tree, so + * that all bios are propagated to td->service_queue. It'd be + * better to walk service_queue tree directly but blkg walk is + * easier. */ - atomic_set(&tg->ref, 2); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) + tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + /* finally, transfer bios from top-level tg's into the td */ + tg_drain_bios(&td->service_queue); - rcu_read_lock(); - blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, - 0, BLKIO_POLICY_THROTL); rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); - /* Attach throtl data to request queue */ - td->queue = q; - q->td = td; - return 0; + /* all bios now should be in td->service_queue, issue them */ + for (rw = READ; rw <= WRITE; rw++) + while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], + NULL))) + generic_make_request(bio); + + spin_lock_irq(q->queue_lock); } -void blk_throtl_exit(struct request_queue *q) +int blk_throtl_init(struct request_queue *q) { - struct throtl_data *td = q->td; - bool wait = false; - - BUG_ON(!td); - - throtl_shutdown_timer_wq(q); + struct throtl_data *td; + int ret; - spin_lock_irq(q->queue_lock); - throtl_release_tgs(td); + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; - /* If there are other groups */ - if (td->nr_undestroyed_grps > 0) - wait = true; + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue, NULL); - spin_unlock_irq(q->queue_lock); + q->td = td; + td->queue = q; - /* - * Wait for tg->blkg->key accessors to exit their grace periods. 
- * Do this wait only if there are other undestroyed groups out - * there (other than root group). This can happen if cgroup deletion - * path claimed the responsibility of cleaning up a group before - * queue cleanup code get to the group. - * - * Do not call synchronize_rcu() unconditionally as there are drivers - * which create/delete request queue hundreds of times during scan/boot - * and synchronize_rcu() can take significant time and slow down boot. - */ - if (wait) - synchronize_rcu(); + /* activate policy */ + ret = blkcg_activate_policy(q, &blkcg_policy_throtl); + if (ret) + kfree(td); + return ret; +} - /* - * Just being safe to make sure after previous flush if some body did - * update limits through cgroup and another work got queued, cancel - * it. - */ - throtl_shutdown_timer_wq(q); - throtl_td_free(td); +void blk_throtl_exit(struct request_queue *q) +{ + BUG_ON(!q->td); + throtl_shutdown_wq(q); + blkcg_deactivate_policy(q, &blkcg_policy_throtl); + kfree(q->td); } static int __init throtl_init(void) @@ -1142,8 +1690,7 @@ static int __init throtl_init(void) if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); - blkio_policy_register(&blkio_policy_throtl); - return 0; + return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4f0c06c7a33..95a09590ccf 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -7,6 +7,7 @@ #include <linux/fault-inject.h> #include "blk.h" +#include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -28,7 +29,10 @@ int blk_should_fake_timeout(struct request_queue *q) static int __init fail_io_timeout_debugfs(void) { - return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout"); + struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", + NULL, &fail_io_timeout); + + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); @@ -79,16 +83,21 @@ void blk_delete_timer(struct request *req) static void 
blk_rq_timed_out(struct request *req) { struct request_queue *q = req->q; - enum blk_eh_timer_return ret; + enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - ret = q->rq_timed_out_fn(req); + if (q->rq_timed_out_fn) + ret = q->rq_timed_out_fn(req); switch (ret) { case BLK_EH_HANDLED: - __blk_complete_request(req); + /* Can we use req->errors here? */ + if (q->mq_ops) + __blk_mq_complete_request(req); + else + __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - blk_clear_rq_complete(req); blk_add_timer(req); + blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: /* @@ -104,6 +113,23 @@ static void blk_rq_timed_out(struct request *req) } } +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, + unsigned int *next_set) +{ + if (time_after_eq(jiffies, rq->deadline)) { + list_del_init(&rq->timeout_list); + + /* + * Check if we raced with end io completion + */ + if (!blk_mark_rq_complete(rq)) + blk_rq_timed_out(rq); + } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { + *next_timeout = rq->deadline; + *next_set = 1; + } +} + void blk_rq_timed_out_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -113,21 +139,8 @@ void blk_rq_timed_out_timer(unsigned long data) spin_lock_irqsave(q->queue_lock, flags); - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { - if (time_after_eq(jiffies, rq->deadline)) { - list_del_init(&rq->timeout_list); - - /* - * Check if we raced with end io completion - */ - if (blk_mark_rq_complete(rq)) - continue; - blk_rq_timed_out(rq); - } else if (!next_set || time_after(next, rq->deadline)) { - next = rq->deadline; - next_set = 1; - } - } + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) + blk_rq_check_expired(rq, &next, &next_set); if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); @@ -153,6 +166,17 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); +unsigned long 
blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + /** * blk_add_timer - Start timeout timer for a single request * @req: request that is about to start running. @@ -170,7 +194,6 @@ void blk_add_timer(struct request *req) return; BUG_ON(!list_empty(&req->timeout_list)); - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); /* * Some LLDs, like scsi, peek at the timeout to prevent a @@ -180,58 +203,29 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - list_add_tail(&req->timeout_list, &q->timeout_list); + if (!q->mq_ops) + list_add_tail(&req->timeout_list, &req->q->timeout_list); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = round_jiffies_up(req->deadline); + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); if (!timer_pending(&q->timeout) || - time_before(expiry, q->timeout.expires)) - mod_timer(&q->timeout, expiry); -} + time_before(expiry, q->timeout.expires)) { + unsigned long diff = q->timeout.expires - expiry; -/** - * blk_abort_queue -- Abort all request on given queue - * @queue: pointer to queue - * - */ -void blk_abort_queue(struct request_queue *q) -{ - unsigned long flags; - struct request *rq, *tmp; - LIST_HEAD(list); - - /* - * Not a request based block device, nothing to abort - */ - if (!q->request_fn) - return; - - spin_lock_irqsave(q->queue_lock, flags); - - elv_abort_queue(q); - - /* - * Splice entries to local list, to avoid deadlocking if entries - * get readded to the timeout list by error handling - */ - list_splice_init(&q->timeout_list, &list); - - list_for_each_entry_safe(rq, tmp, &list, timeout_list) - blk_abort_request(rq); - - /* - * Occasionally, blk_abort_request() will return without - * deleting the element 
from the list. Make sure we add those back - * instead of leaving them on the local stack list. - */ - list_splice(&list, &q->timeout_list); - - spin_unlock_irqrestore(q->queue_lock, flags); + /* + * Due to added timer slack to group timers, the timer + * will often be a little in front of what we asked for. + * So apply some tolerance here too, otherwise we keep + * modifying the timer because expires for value X + * will be X + something. + */ + if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) + mod_timer(&q->timeout, expiry); + } } -EXPORT_SYMBOL_GPL(blk_abort_queue); diff --git a/block/blk.h b/block/blk.h index 2db8f32838e..6748c4f8d7a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -1,40 +1,72 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H +#include <linux/idr.h> + /* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) /* Number of requests a "batching" process may submit */ #define BLK_BATCH_REQ 32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; +extern struct ida blk_queue_ida; + +static inline void __blk_get_queue(struct request_queue *q) +{ + kobject_get(&q->kobj); +} +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask); +void blk_exit_rl(struct request_list *rl); void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); +void blk_queue_bypass_start(struct request_queue *q); +void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); +bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes); -void 
blk_unplug_work(struct work_struct *work); -void blk_unplug_timeout(unsigned long data); void blk_rq_timed_out_timer(unsigned long data); +void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, + unsigned int *next_set); +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); -void __generic_unplug_device(struct request_queue *); + + +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio); +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio); +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count); + +void blk_account_io_start(struct request *req, bool new_io); +void blk_account_io_completion(struct request *req, unsigned int bytes); +void blk_account_io_done(struct request *req); /* * Internal atomic flags for request handling */ enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, + REQ_ATOM_STARTED, }; /* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them suceeds + * sure that only one of them succeeds */ static inline int blk_mark_rq_complete(struct request *rq) { @@ -49,26 +81,42 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) -struct request *blk_do_flush(struct request_queue *q, struct request *rq); +void blk_insert_flush(struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; while (1) { - while (!list_empty(&q->queue_head)) { + if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || - rq == &q->flush_rq) - return rq; - rq = blk_do_flush(q, rq); - if (rq) - 
return rq; + return rq; } - if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (q->flush_pending_idx != q->flush_running_idx && + !queue_flush_queueable(q)) { + q->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_bypass(q)) || + !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) return NULL; } } @@ -77,16 +125,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_activate_req_fn) - e->ops->elevator_activate_req_fn(q, rq); + if (e->type->ops.elevator_activate_req_fn) + e->type->ops.elevator_activate_req_fn(q, rq); } static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_deactivate_req_fn) - e->ops->elevator_deactivate_req_fn(q, rq); + if (e->type->ops.elevator_deactivate_req_fn) + e->type->ops.elevator_deactivate_req_fn(q, rq); } #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -101,23 +149,24 @@ static inline int blk_should_fake_timeout(struct request_queue *q) } #endif -struct io_context *current_io_context(gfp_t gfp_flags, int node); - int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); int ll_front_merge_fn(struct 
request_queue *q, struct request *req, struct bio *bio); int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); +bool blk_rq_merge_ok(struct request *rq, struct bio *bio); +int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -int blk_dev_init(void); +void __blk_run_queue_uncond(struct request_queue *q); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); +int blk_dev_init(void); /* @@ -138,35 +187,69 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -static inline int blk_cpu_to_group(int cpu) -{ - int group = NR_CPUS; -#ifdef CONFIG_SCHED_MC - const struct cpumask *mask = cpu_coregroup_mask(cpu); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - group = cpumask_first(topology_thread_cpumask(cpu)); -#else - return cpu; -#endif - if (likely(group < NR_CPUS)) - return group; - return cpu; -} +extern int blk_update_nr_requests(struct request_queue *, unsigned int); /* * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request or a discard request + * c) it's a file system request */ static inline int blk_do_io_stat(struct request *rq) { return rq->rq_disk && (rq->cmd_flags & REQ_IO_STAT) && - (rq->cmd_type == REQ_TYPE_FS || - (rq->cmd_flags & REQ_DISCARD)); + (rq->cmd_type == REQ_TYPE_FS); } -#endif +/* + * Internal io_context interface + */ +void get_io_context(struct io_context *ioc); +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); +struct io_cq 
*ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask); +void ioc_clear_queue(struct request_queue *q); + +int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); + +/** + * create_io_context - try to create task->io_context + * @gfp_mask: allocation mask + * @node: allocation node + * + * If %current->io_context is %NULL, allocate a new io_context and install + * it. Returns the current %current->io_context which may be %NULL if + * allocation failed. + * + * Note that this function can't be called with IRQ disabled because + * task_lock which protects %current->io_context is IRQ-unsafe. + */ +static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) +{ + WARN_ON_ONCE(irqs_disabled()); + if (unlikely(!current->io_context)) + create_task_io_context(current, gfp_mask, node); + return current->io_context; +} + +/* + * Internal throttling interface + */ +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); +extern void blk_throtl_drain(struct request_queue *q); +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +{ + return false; +} +static inline void blk_throtl_drain(struct request_queue *q) { } +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline void blk_throtl_exit(struct request_queue *q) { } +#endif /* CONFIG_BLK_DEV_THROTTLING */ + +#endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c new file mode 100644 index 00000000000..ab21ba203d5 --- /dev/null +++ b/block/bounce.c @@ -0,0 +1,290 @@ +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/export.h> +#include <linux/swap.h> +#include <linux/gfp.h> 
+#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/printk.h> +#include <asm/tlbflush.h> + +#include <trace/events/block.h> + +#define POOL_SIZE 64 +#define ISA_POOL_SIZE 16 + +static mempool_t *page_pool, *isa_page_pool; + +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) +static __init int init_emergency_pool(void) +{ +#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + if (max_pfn <= max_low_pfn) + return 0; +#endif + + page_pool = mempool_create_page_pool(POOL_SIZE, 0); + BUG_ON(!page_pool); + pr_info("pool size: %d pages\n", POOL_SIZE); + + return 0; +} + +__initcall(init_emergency_pool); +#endif + +#ifdef CONFIG_HIGHMEM +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned long flags; + unsigned char *vto; + + local_irq_save(flags); + vto = kmap_atomic(to->bv_page); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto); + local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. 
+ */ +int init_emergency_isa_pool(void) +{ + if (isa_page_pool) + return 0; + + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(!isa_page_pool); + + pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bvec_iter iter; + + bio_for_each_segment(tovec, to, iter) { + if (tovec.bv_page != fromvec->bv_page) { + /* + * fromvec->bv_offset and fromvec->bv_len might have + * been modified by the block layer, so use the original + * copy, bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + + tovec.bv_offset; + + bounce_copy_vec(&tovec, vfrom); + flush_dcache_page(tovec.bv_page); + } + + fromvec++; + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, *org_vec; + int i; + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + + /* + * free up bounce indirect pages used + */ + bio_for_each_segment_all(bvec, bio, i) { + org_vec = bio_orig->bi_io_vec + i; + if (bvec->bv_page == org_vec->bv_page) + continue; + + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + + bio_endio(bio_orig, err); + bio_put(bio); +} + +static void bounce_end_io_write(struct bio *bio, int err) +{ + bounce_end_io(bio, page_pool, err); +} + +static void bounce_end_io_write_isa(struct bio *bio, int err) +{ + + bounce_end_io(bio, isa_page_pool, err); +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + + 
if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool, err); +} + +static void bounce_end_io_read(struct bio *bio, int err) +{ + __bounce_end_io_read(bio, page_pool, err); +} + +static void bounce_end_io_read_isa(struct bio *bio, int err) +{ + __bounce_end_io_read(bio, isa_page_pool, err); +} + +#ifdef CONFIG_NEED_BOUNCE_POOL +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + if (bio_data_dir(bio) != WRITE) + return 0; + + if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) + return 0; + + return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); +} +#else +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + return 0; +} +#endif /* CONFIG_NEED_BOUNCE_POOL */ + +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, + mempool_t *pool, int force) +{ + struct bio *bio; + int rw = bio_data_dir(*bio_orig); + struct bio_vec *to, from; + struct bvec_iter iter; + unsigned i; + + if (force) + goto bounce; + bio_for_each_segment(from, *bio_orig, iter) + if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) + goto bounce; + + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); + + bio_for_each_segment_all(to, bio, i) { + struct page *page = to->bv_page; + + if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + continue; + + inc_zone_page_state(to->bv_page, NR_BOUNCE); + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(page); + + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap_atomic(vfrom); + } + } + + trace_block_bio_bounce(q, *bio_orig); + + bio->bi_flags |= (1 << BIO_BOUNCED); + + if (pool == page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = 
bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) +{ + int must_bounce; + mempool_t *pool; + + /* + * Data-less bio, nothing to bounce + */ + if (!bio_has_data(*bio_orig)) + return; + + must_bounce = must_snapshot_stable_pages(q, *bio_orig); + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) + return; + pool = page_pool; + } else { + BUG_ON(!isa_page_pool); + pool = isa_page_pool; + } + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool, must_bounce); +} + +EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/bsg-lib.c b/block/bsg-lib.c new file mode 100644 index 00000000000..650f427d915 --- /dev/null +++ b/block/bsg-lib.c @@ -0,0 +1,232 @@ +/* + * BSG helper library + * + * Copyright (C) 2008 James Smart, Emulex Corporation + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Copyright (C) 2011 Mike Christie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/scatterlist.h> +#include <linux/bsg-lib.h> +#include <linux/export.h> +#include <scsi/scsi_cmnd.h> + +/** + * bsg_destroy_job - routine to teardown/delete a bsg job + * @job: bsg_job that is to be torn down + */ +static void bsg_destroy_job(struct bsg_job *job) +{ + put_device(job->dev); /* release reference for the request */ + + kfree(job->request_payload.sg_list); + kfree(job->reply_payload.sg_list); + kfree(job); +} + +/** + * bsg_job_done - completion routine for bsg requests + * @job: bsg_job that is complete + * @result: job reply result + * @reply_payload_rcv_len: length of payload recvd + * + * The LLD should call this when the bsg job has completed. + */ +void bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + struct request *req = job->req; + struct request *rsp = req->next_rq; + int err; + + err = job->req->errors = result; + if (err < 0) + /* we're only returning the result field in the reply */ + job->req->sense_len = sizeof(u32); + else + job->req->sense_len = job->reply_len; + /* we assume all request payload was transferred, residual == 0 */ + req->resid_len = 0; + + if (rsp) { + WARN_ON(reply_payload_rcv_len > rsp->resid_len); + + /* set reply (bidi) residual */ + rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len); + } + blk_complete_request(req); +} +EXPORT_SYMBOL_GPL(bsg_job_done); + +/** + * bsg_softirq_done - softirq done routine for destroying the bsg requests + * @rq: BSG request that holds the job to be destroyed + */ +static void bsg_softirq_done(struct request *rq) +{ + struct bsg_job *job = rq->special; + + blk_end_request_all(rq, rq->errors); + bsg_destroy_job(job); +} + 
+static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + BUG_ON(!req->nr_phys_segments); + + buf->sg_list = kzalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +/** + * bsg_create_job - create the bsg_job structure for the bsg request + * @dev: device that is being sent the bsg request + * @req: BSG request that needs a job structure + */ +static int bsg_create_job(struct device *dev, struct request *req) +{ + struct request *rsp = req->next_rq; + struct request_queue *q = req->q; + struct bsg_job *job; + int ret; + + BUG_ON(req->special); + + job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL); + if (!job) + return -ENOMEM; + + req->special = job; + job->req = req; + if (q->bsg_job_size) + job->dd_data = (void *)&job[1]; + job->request = req->cmd; + job->request_len = req->cmd_len; + job->reply = req->sense; + job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer + * allocated */ + if (req->bio) { + ret = bsg_map_buffer(&job->request_payload, req); + if (ret) + goto failjob_rls_job; + } + if (rsp && rsp->bio) { + ret = bsg_map_buffer(&job->reply_payload, rsp); + if (ret) + goto failjob_rls_rqst_payload; + } + job->dev = dev; + /* take a reference for the request */ + get_device(job->dev); + return 0; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + kfree(job); + return -ENOMEM; +} + +/** + * bsg_request_fn - generic handler for bsg requests + * @q: request queue to manage + * + * On error the create_bsg_job function should return a -Exyz error value + * that will be set to the req->errors. + * + * Drivers/subsys should pass this to the queue init function. 
+ */ +void bsg_request_fn(struct request_queue *q) +{ + struct device *dev = q->queuedata; + struct request *req; + struct bsg_job *job; + int ret; + + if (!get_device(dev)) + return; + + while (1) { + req = blk_fetch_request(q); + if (!req) + break; + spin_unlock_irq(q->queue_lock); + + ret = bsg_create_job(dev, req); + if (ret) { + req->errors = ret; + blk_end_request_all(req, ret); + spin_lock_irq(q->queue_lock); + continue; + } + + job = req->special; + ret = q->bsg_job_fn(job); + spin_lock_irq(q->queue_lock); + if (ret) + break; + } + + spin_unlock_irq(q->queue_lock); + put_device(dev); + spin_lock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(bsg_request_fn); + +/** + * bsg_setup_queue - Create and add the bsg hooks so we can receive requests + * @dev: device to attach bsg device to + * @q: request queue setup by caller + * @name: device to give bsg device + * @job_fn: bsg job handler + * @dd_job_size: size of LLD data needed for each job + * + * The caller should have setup the reuqest queue with bsg_request_fn + * as the request_fn. 
+ */ +int bsg_setup_queue(struct device *dev, struct request_queue *q, + char *name, bsg_job_fn *job_fn, int dd_job_size) +{ + int ret; + + q->queuedata = dev; + q->bsg_job_size = dd_job_size; + q->bsg_job_fn = job_fn; + queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + blk_queue_softirq_done(q, bsg_softirq_done); + blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); + + ret = bsg_register_queue(q, dev, name, NULL); + if (ret) { + printk(KERN_ERR "%s: bsg interface failed to " + "initialize - register queue\n", dev->kobj.name); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bsg_setup_queue); diff --git a/block/bsg.c b/block/bsg.c index 0c8b64a1648..ff46addde5d 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -182,7 +182,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -ENOMEM; } - if (copy_from_user(rq->cmd, (void *)(unsigned long)hdr->request, + if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request, hdr->request_len)) return -EFAULT; @@ -196,7 +196,6 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->request_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -249,7 +248,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, struct request *rq, *next_rq = NULL; int ret, rw; unsigned int dxfer_len; - void *dxferp = NULL; + void __user *dxferp = NULL; struct bsg_class_device *bcd = &q->bsg_dev; /* if the LLD has been removed then the bsg_unregister_queue will @@ -273,6 +272,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); + blk_rq_set_block_pc(rq); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; @@ -291,7 +292,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, rq->next_rq 
= next_rq; next_rq->cmd_type = rq->cmd_type; - dxferp = (void*)(unsigned long)hdr->din_xferp; + dxferp = (void __user *)(unsigned long)hdr->din_xferp; ret = blk_rq_map_user(q, next_rq, NULL, dxferp, hdr->din_xfer_len, GFP_KERNEL); if (ret) @@ -300,10 +301,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, if (hdr->dout_xfer_len) { dxfer_len = hdr->dout_xfer_len; - dxferp = (void*)(unsigned long)hdr->dout_xferp; + dxferp = (void __user *)(unsigned long)hdr->dout_xferp; } else if (hdr->din_xfer_len) { dxfer_len = hdr->din_xfer_len; - dxferp = (void*)(unsigned long)hdr->din_xferp; + dxferp = (void __user *)(unsigned long)hdr->din_xferp; } else dxfer_len = 0; @@ -445,7 +446,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, int len = min_t(unsigned int, hdr->max_response_len, rq->sense_len); - ret = copy_to_user((void*)(unsigned long)hdr->response, + ret = copy_to_user((void __user *)(unsigned long)hdr->response, rq->sense, len); if (!ret) hdr->response_len = len; @@ -606,7 +607,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) ret = __bsg_read(buf, count, bd, NULL, &bytes_read); *ppos = bytes_read; - if (!bytes_read || (bytes_read && err_block_err(ret))) + if (!bytes_read || err_block_err(ret)) bytes_read = ret; return bytes_read; @@ -686,7 +687,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) /* * return bytes written on non-fatal errors */ - if (!bytes_written || (bytes_written && err_block_err(ret))) + if (!bytes_written || err_block_err(ret)) bytes_written = ret; dprintk("%s: returning %Zd\n", bd->name, bytes_written); @@ -769,12 +770,10 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct file *file) { struct bsg_device *bd; - int ret; #ifdef BSG_DEBUG unsigned char buf[32]; #endif - ret = blk_get_queue(rq); - if (ret) + if (!blk_get_queue(rq)) return ERR_PTR(-ENXIO); bd = bsg_alloc_device(); @@ -802,11 +801,10 
@@ static struct bsg_device *bsg_add_device(struct inode *inode, static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) { struct bsg_device *bd; - struct hlist_node *entry; mutex_lock(&bsg_mutex); - hlist_for_each_entry(bd, entry, bsg_dev_idx_hash(minor), dev_list) { + hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { if (bd->queue == q) { atomic_inc(&bd->ref_count); goto found; @@ -878,7 +876,7 @@ static unsigned int bsg_poll(struct file *file, poll_table *wait) spin_lock_irq(&bd->lock); if (!list_empty(&bd->done_list)) mask |= POLLIN | POLLRDNORM; - if (bd->queued_cmds >= bd->max_queue) + if (bd->queued_cmds < bd->max_queue) mask |= POLLOUT; spin_unlock_irq(&bd->lock); @@ -985,7 +983,8 @@ void bsg_unregister_queue(struct request_queue *q) mutex_lock(&bsg_mutex); idr_remove(&bsg_minor_idr, bcd->minor); - sysfs_remove_link(&q->kobj, "bsg"); + if (q->kobj.sd) + sysfs_remove_link(&q->kobj, "bsg"); device_unregister(bcd->class_dev); bcd->class_dev = NULL; kref_put(&bcd->ref, bsg_kref_release_function); @@ -998,7 +997,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, { struct bsg_class_device *bcd; dev_t dev; - int ret, minor; + int ret; struct device *class_dev = NULL; const char *devname; @@ -1010,7 +1009,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, /* * we need a proper transport to send commands, not a stacked device */ - if (!q->request_fn) + if (!queue_is_rq_based(q)) return 0; bcd = &q->bsg_dev; @@ -1018,23 +1017,16 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, mutex_lock(&bsg_mutex); - ret = idr_pre_get(&bsg_minor_idr, GFP_KERNEL); - if (!ret) { - ret = -ENOMEM; - goto unlock; - } - - ret = idr_get_new(&bsg_minor_idr, bcd, &minor); - if (ret < 0) + ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL); + if (ret < 0) { + if (ret == -ENOSPC) { + printk(KERN_ERR "bsg: too many bsg devices\n"); + ret = -EINVAL; + } goto unlock; - - 
if (minor >= BSG_MAX_DEVS) { - printk(KERN_ERR "bsg: too many bsg devices\n"); - ret = -EINVAL; - goto remove_idr; } - bcd->minor = minor; + bcd->minor = ret; bcd->queue = q; bcd->parent = get_device(parent); bcd->release = release; @@ -1060,8 +1052,7 @@ unregister_class_dev: device_unregister(class_dev); put_dev: put_device(parent); -remove_idr: - idr_remove(&bsg_minor_idr, minor); + idr_remove(&bsg_minor_idr, bcd->minor); unlock: mutex_unlock(&bsg_mutex); return ret; @@ -1070,7 +1061,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue); static struct cdev bsg_cdev; -static char *bsg_devnode(struct device *dev, mode_t *mode) +static char *bsg_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea83a4f0c27..cadc3784174 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,7 +14,8 @@ #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> -#include "cfq.h" +#include "blk.h" +#include "blk-cgroup.h" /* * tunables @@ -53,20 +54,11 @@ static const int cfq_hist_divisor = 4; #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) -#define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) +#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) static struct kmem_cache *cfq_pool; -static struct kmem_cache *cfq_ioc_pool; - -static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); -static struct completion *ioc_gone; -static DEFINE_SPINLOCK(ioc_gone_lock); - -static DEFINE_SPINLOCK(cic_index_lock); -static DEFINE_IDA(cic_index_ida); #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == 
IOPRIO_CLASS_IDLE) @@ -75,6 +67,14 @@ static DEFINE_IDA(cic_index_ida); #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) +struct cfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + /* * Most of our rbtree usage is for sorting with min extraction, so * if we cache the leftmost node we don't have to walk down the tree @@ -85,11 +85,11 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; - unsigned total_weight; u64 min_vdisktime; + struct cfq_ttime ttime; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ - .count = 0, .min_vdisktime = 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ + .ttime = {.last_end_request = jiffies,},} /* * Per process-grouping structure @@ -129,14 +129,14 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; - /* pending metadata requests */ - int meta_pending; + /* pending priority requests */ + int prio_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; /* io prio of this group */ unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short ioprio_class; pid_t pid; @@ -146,7 +146,6 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; - struct cfq_group *orig_cfqg; /* Number of sectors dispatched from queue in single dispatch round */ unsigned long nr_sectors; }; @@ -155,7 +154,7 @@ struct cfq_queue { * First index in the service_trees. 
* IDLE is handled separately, so it has negative index */ -enum wl_prio_t { +enum wl_class_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, @@ -171,20 +170,102 @@ enum wl_type_t { SYNC_WORKLOAD = 2 }; +struct cfqg_stats { +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_stat sectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* time not charged to this cgroup */ + struct blkg_stat unaccounted_time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. 
*/ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +#endif /* CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + /* group service_tree member */ struct rb_node rb_node; /* group service_tree key */ u64 vdisktime; + + /* + * The number of active cfqgs and sum of their weights under this + * cfqg. This covers this cfqg's leaf_weight and all children's + * weights, but does not cover weights of further descendants. + * + * If a cfqg is on the service tree, it's active. An active cfqg + * also activates its parent and contributes to the children_weight + * of the parent. + */ + int nr_active; + unsigned int children_weight; + + /* + * vfraction is the fraction of vdisktime that the tasks in this + * cfqg are entitled to. This is determined by compounding the + * ratios walking up from this cfqg to the root. + * + * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all + * vfractions on a service tree is approximately 1. The sum may + * deviate a bit due to rounding errors and fluctuations caused by + * cfqgs entering and leaving the service tree. + */ + unsigned int vfraction; + + /* + * There are two weights - (internal) weight is the weight of this + * cfqg against the sibling cfqgs. leaf_weight is the wight of + * this cfqg against the child cfqgs. For the root cfqg, both + * weights are kept in sync for backward compatibility. 
+ */ unsigned int weight; + unsigned int new_weight; + unsigned int dev_weight; + + unsigned int leaf_weight; + unsigned int new_leaf_weight; + unsigned int dev_leaf_weight; /* number of cfqq currently on this group */ int nr_cfqq; /* - * Per group busy queus average. Useful for workload slice calc. We + * Per group busy queues average. Useful for workload slice calc. We * create the array for each prio class but at run time it is used * only for RT and BE class and slot for IDLE class remains unused. * This is primarily done to avoid confusion and a gcc warning. @@ -201,16 +282,25 @@ struct cfq_group { struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; - unsigned long saved_workload_slice; - enum wl_type_t saved_workload; - enum wl_prio_t saved_serving_prio; - struct blkio_group blkg; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - struct hlist_node cfqd_node; - int ref; -#endif + unsigned long saved_wl_slice; + enum wl_type_t saved_wl_type; + enum wl_class_t saved_wl_class; + /* number of requests that are on the dispatch list or inside driver */ int dispatched; + struct cfq_ttime ttime; + struct cfqg_stats stats; /* stats for this cfqg */ + struct cfqg_stats dead_stats; /* stats pushed from dead children */ +}; + +struct cfq_io_cq { + struct io_cq icq; /* must be the first member */ + struct cfq_queue *cfqq[2]; + struct cfq_ttime ttime; + int ioprio; /* the current ioprio */ +#ifdef CONFIG_CFQ_GROUP_IOSCHED + uint64_t blkcg_id; /* the current blkcg ID */ +#endif }; /* @@ -220,13 +310,13 @@ struct cfq_data { struct request_queue *queue; /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; - struct cfq_group root_group; + struct cfq_group *root_group; /* * The priority currently being served */ - enum wl_prio_t serving_prio; - enum wl_type_t serving_type; + enum wl_class_t serving_wl_class; + enum wl_type_t serving_wl_type; unsigned long workload_expires; struct cfq_group *serving_group; @@ -238,6 +328,7 @@ struct cfq_data { 
struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; + unsigned int busy_sync_queues; int rq_in_driver; int rq_in_flight[2]; @@ -263,7 +354,7 @@ struct cfq_data { struct work_struct unplug_work; struct cfq_queue *active_queue; - struct cfq_io_context *active_cic; + struct cfq_io_cq *active_cic; /* * async queue for each priority case @@ -285,10 +376,7 @@ struct cfq_data { unsigned int cfq_slice_idle; unsigned int cfq_group_idle; unsigned int cfq_latency; - unsigned int cfq_group_isolation; - - unsigned int cic_index; - struct list_head cic_list; + unsigned int cfq_target_latency; /* * Fallback dummy cfqq for extreme OOM conditions @@ -296,25 +384,21 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_delayed_sync; - - /* List of cfq groups being managed on this device*/ - struct hlist_head cfqg_list; - struct rcu_head rcu; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); -static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, - enum wl_prio_t prio, +static struct cfq_rb_root *st_for(struct cfq_group *cfqg, + enum wl_class_t class, enum wl_type_t type) { if (!cfqg) return NULL; - if (prio == IDLE_WORKLOAD) + if (class == IDLE_WORKLOAD) return &cfqg->service_tree_idle; - return &cfqg->service_trees[prio][type]; + return &cfqg->service_trees[class][type]; } enum cfqq_state_flags { @@ -362,21 +446,337 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? 
container_of(pd, struct cfq_group, pd) : NULL; +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + +#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) + +/* cfqg stats flags */ +enum cfqg_stats_flags { + CFQG_stats_waiting = 0, + CFQG_stats_idling, + CFQG_stats_empty, +}; + +#define CFQG_FLAG_FNS(name) \ +static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ +{ \ + stats->flags |= (1 << CFQG_stats_##name); \ +} \ +static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << CFQG_stats_##name); \ +} \ +static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ +} \ + +CFQG_FLAG_FNS(waiting) +CFQG_FLAG_FNS(idling) +CFQG_FLAG_FNS(empty) +#undef CFQG_FLAG_FNS + +/* This should be called with the queue_lock held. */ +static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) +{ + unsigned long long now; + + if (!cfqg_stats_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + blkg_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + cfqg_stats_clear_waiting(stats); +} + +/* This should be called with the queue_lock held. */ +static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, + struct cfq_group *curr_cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + if (cfqg_stats_waiting(stats)) + return; + if (cfqg == curr_cfqg) + return; + stats->start_group_wait_time = sched_clock(); + cfqg_stats_mark_waiting(stats); +} + +/* This should be called with the queue_lock held. 
*/ +static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) +{ + unsigned long long now; + + if (!cfqg_stats_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + blkg_stat_add(&stats->empty_time, + now - stats->start_empty_time); + cfqg_stats_clear_empty(stats); +} + +static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) +{ + blkg_stat_add(&cfqg->stats.dequeue, 1); +} + +static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + if (blkg_rwstat_total(&stats->queued)) + return; + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if (cfqg_stats_empty(stats)) + return; + + stats->start_empty_time = sched_clock(); + cfqg_stats_mark_empty(stats); +} + +static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + if (cfqg_stats_idling(stats)) { + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) + blkg_stat_add(&stats->idle_time, + now - stats->start_idle_time); + cfqg_stats_clear_idling(stats); + } +} + +static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + BUG_ON(cfqg_stats_idling(stats)); + + stats->start_idle_time = sched_clock(); + cfqg_stats_mark_idling(stats); +} + +static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + blkg_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_total(&stats->queued)); + blkg_stat_add(&stats->avg_queue_size_samples, 1); + cfqg_stats_update_group_wait_time(stats); +} + +#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + +static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } 
+static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } +static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } +static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } +static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } +static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } +static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } + +#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + #ifdef CONFIG_CFQ_GROUP_IOSCHED -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - blkg_path(&(cfqq)->cfqg->blkg), ##args); -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ - blkg_path(&(cfqg)->blkg), ##args); \ +static struct blkcg_policy blkcg_policy_cfq; -#else -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) +{ + return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); +} + +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) +{ + struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; + + return pblkg ? blkg_to_cfqg(pblkg) : NULL; +} + +static inline void cfqg_get(struct cfq_group *cfqg) +{ + return blkg_get(cfqg_to_blkg(cfqg)); +} + +static inline void cfqg_put(struct cfq_group *cfqg) +{ + return blkg_put(cfqg_to_blkg(cfqg)); +} + +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 
'N' : ' ',\ + __pbuf, ##args); \ +} while (0) + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, + struct cfq_group *curr_cfqg, int rw) +{ + blkg_rwstat_add(&cfqg->stats.queued, rw, 1); + cfqg_stats_end_empty_time(&cfqg->stats); + cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); +} + +static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, + unsigned long time, unsigned long unaccounted_time) +{ + blkg_stat_add(&cfqg->stats.time, time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); +#endif +} + +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) +{ + blkg_rwstat_add(&cfqg->stats.queued, rw, -1); +} + +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) +{ + blkg_rwstat_add(&cfqg->stats.merged, rw, 1); +} + +static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, + uint64_t bytes, int rw) +{ + blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); + blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); + blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); +} + +static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, + uint64_t start_time, uint64_t io_start_time, int rw) +{ + struct cfqg_stats *stats = &cfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, rw, + io_start_time - start_time); +} + +/* @stats = 0 */ +static void cfqg_stats_reset(struct cfqg_stats *stats) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->service_bytes); + 
blkg_rwstat_reset(&stats->serviced); + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_reset(&stats->unaccounted_time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); +#endif +} + +/* @to += @from */ +static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); + blkg_rwstat_merge(&to->serviced, &from->serviced); + blkg_rwstat_merge(&to->merged, &from->merged); + blkg_rwstat_merge(&to->service_time, &from->service_time); + blkg_rwstat_merge(&to->wait_time, &from->wait_time); + blkg_stat_merge(&from->time, &from->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_merge(&to->dequeue, &from->dequeue); + blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); + blkg_stat_merge(&to->idle_time, &from->idle_time); + blkg_stat_merge(&to->empty_time, &from->empty_time); #endif +} + +/* + * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * recursive stats can still account for the amount used by this cfqg after + * it's gone. 
+ */ +static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) +{ + struct cfq_group *parent = cfqg_parent(cfqg); + + lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); + cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_reset(&cfqg->stats); + cfqg_stats_reset(&cfqg->dead_stats); +} + +#else /* CONFIG_CFQ_GROUP_IOSCHED */ + +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } +static inline void cfqg_get(struct cfq_group *cfqg) { } +static inline void cfqg_put(struct cfq_group *cfqg) { } + +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ + ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) + +static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, + struct cfq_group *curr_cfqg, int rw) { } +static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, + unsigned long time, unsigned long unaccounted_time) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } +static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, + uint64_t bytes, int rw) { } +static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, + uint64_t start_time, uint64_t io_start_time, int rw) { } + +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -390,6 +790,18 @@ CFQ_CFQQ_FNS(wait_busy); j++, st = i < IDLE_WORKLOAD ? 
\ &cfqg->service_trees[i][j]: NULL) \ +static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, + struct cfq_ttime *ttime, bool group_idle) +{ + unsigned long slice; + if (!sample_valid(ttime->ttime_samples)) + return false; + if (group_idle) + slice = cfqd->cfq_group_idle; + else + slice = cfqd->cfq_slice_idle; + return ttime->ttime_mean > slice; +} static inline bool iops_mode(struct cfq_data *cfqd) { @@ -406,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd) return false; } -static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) return IDLE_WORKLOAD; @@ -425,59 +837,58 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) return SYNC_WORKLOAD; } -static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, +static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, struct cfq_data *cfqd, struct cfq_group *cfqg) { - if (wl == IDLE_WORKLOAD) + if (wl_class == IDLE_WORKLOAD) return cfqg->service_tree_idle.count; - return cfqg->service_trees[wl][ASYNC_WORKLOAD].count - + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count - + cfqg->service_trees[wl][SYNC_WORKLOAD].count; + return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; } static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, struct cfq_group *cfqg) { - return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count - + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, - struct io_context *, gfp_t); -static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, - struct 
io_context *); +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, + struct cfq_io_cq *cic, struct bio *bio, + gfp_t gfp_mask); -static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, - bool is_sync) +static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { - return cic->cfqq[is_sync]; + /* cic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct cfq_io_cq, icq); } -static inline void cic_set_cfqq(struct cfq_io_context *cic, - struct cfq_queue *cfqq, bool is_sync) +static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, + struct io_context *ioc) { - cic->cfqq[is_sync] = cfqq; + if (ioc) + return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); + return NULL; } -#define CIC_DEAD_KEY 1ul -#define CIC_DEAD_INDEX_SHIFT 1 - -static inline void *cfqd_dead_key(struct cfq_data *cfqd) +static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) { - return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); + return cic->cfqq[is_sync]; } -static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) +static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, + bool is_sync) { - struct cfq_data *cfqd = cic->key; - - if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) - return NULL; + cic->cfqq[is_sync] = cfqq; +} - return cfqd; +static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) +{ + return cic->icq.q->elevator->elevator_data; } /* @@ -497,17 +908,10 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { if (cfqd->busy_queues) { cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); + kblockd_schedule_work(&cfqd->unplug_work); } } -static int cfq_queue_empty(struct request_queue *q) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - return !cfqd->rq_queued; -} - /* * Scale schedule slice based on io priority. 
Use the sync time slice only * if a queue is marked sync and has sync io queued. A sync queue with async @@ -529,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +/** + * cfqg_scale_charge - scale disk time charge according to cfqg weight + * @charge: disk time being charged + * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT + * + * Scale @charge according to @vfraction, which is in range (0, 1]. The + * scaling is inversely proportional. + * + * scaled = charge / vfraction + * + * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. + */ +static inline u64 cfqg_scale_charge(unsigned long charge, + unsigned int vfraction) { - u64 d = delta << CFQ_SERVICE_SHIFT; + /* widen to u64 before shifting so a 32-bit unsigned long can't lose high bits */ + u64 c = (u64)charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ - d = d * BLKIO_WEIGHT_DEFAULT; - do_div(d, cfqg->weight); - return d; + /* charge / vfraction */ + c <<= CFQ_SERVICE_SHIFT; + do_div(c, vfraction); + return c; } static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) @@ -558,15 +976,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) static void update_min_vdisktime(struct cfq_rb_root *st) { - u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; if (st->left) { cfqg = rb_entry_cfqg(st->left); - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + st->min_vdisktime = max_vdisktime(st->min_vdisktime, + cfqg->vdisktime); } - - st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); } /* @@ -593,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, static inline unsigned cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) { - struct cfq_rb_root *st = &cfqd->grp_service_tree; - - return cfq_target_latency * cfqg->weight / st->total_weight; + return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; 
} static inline unsigned @@ -673,15 +1087,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq2 == NULL) return rq1; - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && - !(rq1->cmd_flags & REQ_META)) - return rq2; + if (rq_is_sync(rq1) != rq_is_sync(rq2)) + return rq_is_sync(rq1) ? rq1 : rq2; + + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) + return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -863,7 +1273,68 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) } static void -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +cfq_update_group_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + + if (cfqg->new_weight) { + cfqg->weight = cfqg->new_weight; + cfqg->new_weight = 0; + } + + if (cfqg->new_leaf_weight) { + cfqg->leaf_weight = cfqg->new_leaf_weight; + cfqg->new_leaf_weight = 0; + } +} + +static void +cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ + struct cfq_group *pos = cfqg; + struct cfq_group *parent; + bool propagate; + + /* add to the service tree */ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + + cfq_update_group_weight(cfqg); + __cfq_group_service_tree_add(st, cfqg); + + /* + * Activate @cfqg and calculate the portion of vfraction @cfqg is + * entitled to. vfraction is calculated by walking the tree + * towards the root calculating the fraction it has at each level. + * The compounded ratio is how much vfraction @cfqg owns. + * + * Start with the proportion tasks in this cfqg has against active + * children cfqgs - its leaf_weight against children_weight. 
+ */ + propagate = !pos->nr_active++; + pos->children_weight += pos->leaf_weight; + vfr = vfr * pos->leaf_weight / pos->children_weight; + + /* + * Compound ->weight walking up the tree. Both activation and + * vfraction calculation are done in the same loop. Propagation + * stops once an already activated node is met. vfraction + * calculation should always continue to the root. + */ + while ((parent = cfqg_parent(pos))) { + if (propagate) { + propagate = !parent->nr_active++; + parent->children_weight += pos->weight; + } + vfr = vfr * pos->weight / parent->children_weight; + pos = parent; + } + + cfqg->vfraction = max_t(unsigned, vfr, 1); +} + +static void +cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; struct cfq_group *__cfqg; @@ -876,7 +1347,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) /* * Currently put the group at the end. Later implement something * so that groups get lesser vtime based on their weights, so that - * if group does not loose all if it was not continously backlogged. + * if group does not loose all if it was not continuously backlogged. */ n = rb_last(&st->rb); if (n) { @@ -884,13 +1355,44 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; } else cfqg->vdisktime = st->min_vdisktime; + cfq_group_service_tree_add(st, cfqg); +} - __cfq_group_service_tree_add(st, cfqg); - st->total_weight += cfqg->weight; +static void +cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct cfq_group *pos = cfqg; + bool propagate; + + /* + * Undo activation from cfq_group_service_tree_add(). Deactivate + * @cfqg and propagate deactivation upwards. 
+ */ + propagate = !--pos->nr_active; + pos->children_weight -= pos->leaf_weight; + + while (propagate) { + struct cfq_group *parent = cfqg_parent(pos); + + /* @pos has 0 nr_active at this point */ + WARN_ON_ONCE(pos->children_weight); + pos->vfraction = 0; + + if (!parent) + break; + + propagate = !--parent->nr_active; + parent->children_weight -= pos->weight; + pos = parent; + } + + /* remove from the service tree */ + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); } static void -cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; @@ -902,14 +1404,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - st->total_weight -= cfqg->weight; - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - cfq_rb_erase(&cfqg->rb_node, st); - cfqg->saved_workload_slice = 0; - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); + cfq_group_service_tree_del(st, cfqg); + cfqg->saved_wl_slice = 0; + cfqg_stats_update_dequeue(cfqg); } -static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, + unsigned int *unaccounted_time) { unsigned int slice_used; @@ -928,8 +1429,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 1); } else { slice_used = jiffies - cfqq->slice_start; - if (slice_used > cfqq->allocated_slice) + if (slice_used > cfqq->allocated_slice) { + *unaccounted_time = slice_used - cfqq->allocated_slice; slice_used = cfqq->allocated_slice; + } + if (time_after(cfqq->slice_start, cfqq->dispatch_start)) + *unaccounted_time += cfqq->slice_start - + cfqq->dispatch_start; } return slice_used; @@ -939,135 +1445,169 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root 
*st = &cfqd->grp_service_tree; - unsigned int used_sl, charge; + unsigned int used_sl, charge, unaccounted_sl = 0; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; + unsigned int vfr; BUG_ON(nr_sync < 0); - used_sl = charge = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); if (iops_mode(cfqd)) charge = cfqq->slice_dispatch; else if (!cfq_cfqq_sync(cfqq) && !nr_sync) charge = cfqq->allocated_slice; - /* Can't update vdisktime while group is on service tree */ - cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); - __cfq_group_service_tree_add(st, cfqg); + /* + * Can't update vdisktime while on service tree and cfqg->vfraction + * is valid only while on it. Cache vfr, leave the service tree, + * update vdisktime and go back on. The re-addition to the tree + * will also update the weights as necessary. + */ + vfr = cfqg->vfraction; + cfq_group_service_tree_del(st, cfqg); + cfqg->vdisktime += cfqg_scale_charge(charge, vfr); + cfq_group_service_tree_add(st, cfqg); /* This group is being expired. 
Save the context */ if (time_after(cfqd->workload_expires, jiffies)) { - cfqg->saved_workload_slice = cfqd->workload_expires + cfqg->saved_wl_slice = cfqd->workload_expires - jiffies; - cfqg->saved_workload = cfqd->serving_type; - cfqg->saved_serving_prio = cfqd->serving_prio; + cfqg->saved_wl_type = cfqd->serving_wl_type; + cfqg->saved_wl_class = cfqd->serving_wl_class; } else - cfqg->saved_workload_slice = 0; + cfqg->saved_wl_slice = 0; cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" - " sect=%u", used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); - cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); - cfq_blkiocg_set_start_empty_time(&cfqg->blkg); + cfq_log_cfqq(cfqq->cfqd, cfqq, + "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", + used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); + cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); + cfqg_stats_set_start_empty_time(cfqg); } -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +/** + * cfq_init_cfqg_base - initialize base part of a cfq_group + * @cfqg: cfq_group to initialize + * + * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED + * is enabled or not. 
+ */ +static void cfq_init_cfqg_base(struct cfq_group *cfqg) { - if (blkg) - return container_of(blkg, struct cfq_group, blkg); - return NULL; + struct cfq_rb_root *st; + int i, j; + + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + cfqg->ttime.last_end_request = jiffies; } -void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, - unsigned int weight) -{ - cfqg_of_blkg(blkg)->weight = weight; +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void cfqg_stats_init(struct cfqg_stats *stats) +{ + blkg_rwstat_init(&stats->service_bytes); + blkg_rwstat_init(&stats->serviced); + blkg_rwstat_init(&stats->merged); + blkg_rwstat_init(&stats->service_time); + blkg_rwstat_init(&stats->wait_time); + blkg_rwstat_init(&stats->queued); + + blkg_stat_init(&stats->sectors); + blkg_stat_init(&stats->time); + +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_init(&stats->unaccounted_time); + blkg_stat_init(&stats->avg_queue_size_sum); + blkg_stat_init(&stats->avg_queue_size_samples); + blkg_stat_init(&stats->dequeue); + blkg_stat_init(&stats->group_wait_time); + blkg_stat_init(&stats->idle_time); + blkg_stat_init(&stats->empty_time); +#endif } -static struct cfq_group * -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +static void cfq_pd_init(struct blkcg_gq *blkg) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - struct cfq_group *cfqg = NULL; - void *key = cfqd; - int i, j; - struct cfq_rb_root *st; - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; - unsigned int major, minor; - - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfqg->blkg.dev = MKDEV(major, minor); - goto done; - } - if (cfqg || !create) - goto done; - - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); - if (!cfqg) - goto done; + struct cfq_group *cfqg = 
blkg_to_cfqg(blkg); - for_each_cfqg_st(cfqg, i, j, st) - *st = CFQ_RB_ROOT; - RB_CLEAR_NODE(&cfqg->rb_node); + cfq_init_cfqg_base(cfqg); + cfqg->weight = blkg->blkcg->cfq_weight; + cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; + cfqg_stats_init(&cfqg->stats); + cfqg_stats_init(&cfqg->dead_stats); +} +static void cfq_pd_offline(struct blkcg_gq *blkg) +{ /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * elevator which will be dropped by either elevator exit - * or cgroup deletion path depending on who is exiting first. + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... */ - cfqg->ref = 1; + cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); +} - /* - * Add group onto cgroup list. It might happen that bdi->dev is - * not initialized yet. Initialize this new group without major - * and minor info and this info will be filled in once a new thread - * comes for IO. See code above. 
- */ - if (bdi->dev) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - MKDEV(major, minor)); - } else - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - 0); +/* offset delta from cfqg->stats to cfqg->dead_stats */ +static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - + offsetof(struct cfq_group, stats); - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); +/* to be used by recursive prfill, sums live and dead stats recursively */ +static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +{ + u64 sum = 0; - /* Add group on cfqd list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + sum += blkg_stat_recursive_sum(pd, off); + sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); + return sum; +} -done: - return cfqg; +/* to be used by recursive prfill, sums live and dead rwstats recursively */ +static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat a, b; + + a = blkg_rwstat_recursive_sum(pd, off); + b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); + blkg_rwstat_merge(&a, &b); + return a; +} + +static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +{ + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + + cfqg_stats_reset(&cfqg->stats); + cfqg_stats_reset(&cfqg->dead_stats); } /* - * Search for the cfq group current task belongs to. If create = 1, then also - * create the cfq group if it does not exist. request_queue lock must be held. + * Search for the cfq group current task belongs to. request_queue lock must + * be held. 
*/ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct cgroup *cgroup; + struct request_queue *q = cfqd->queue; struct cfq_group *cfqg = NULL; - rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); - if (!cfqg && create) - cfqg = &cfqd->root_group; - rcu_read_unlock(); - return cfqg; -} + /* avoid lookup for the common case where there's no blkcg */ + if (blkcg == &blkcg_root) { + cfqg = cfqd->root_group; + } else { + struct blkcg_gq *blkg; + + blkg = blkg_lookup_create(blkcg, q); + if (!IS_ERR(blkg)) + cfqg = blkg_to_cfqg(blkg); + } -static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) -{ - cfqg->ref++; return cfqg; } @@ -1075,90 +1615,383 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ if (!cfq_cfqq_sync(cfqq)) - cfqg = &cfqq->cfqd->root_group; + cfqg = cfqq->cfqd->root_group; cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - cfqq->cfqg->ref++; + cfqg_get(cfqg); } -static void cfq_put_cfqg(struct cfq_group *cfqg) +static u64 cfqg_prfill_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - struct cfq_rb_root *st; - int i, j; + struct cfq_group *cfqg = pd_to_cfqg(pd); - BUG_ON(cfqg->ref <= 0); - cfqg->ref--; - if (cfqg->ref) - return; - for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb)); - kfree(cfqg); + if (!cfqg->dev_weight) + return 0; + return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +static int cfqg_print_weight_device(struct seq_file *sf, void *v) { - /* Something wrong if we are trying to remove same group twice */ - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + 
cfqg_prfill_weight_device, &blkcg_policy_cfq, + 0, false); + return 0; +} - hlist_del_init(&cfqg->cfqd_node); +static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); - /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. - */ - cfq_put_cfqg(cfqg); + if (!cfqg->dev_leaf_weight) + return 0; + return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static void cfq_release_cfq_groups(struct cfq_data *cfqd) +static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) { - struct hlist_node *pos, *n; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, + 0, false); + return 0; +} + +static int cfq_print_weight(struct seq_file *sf, void *v) +{ + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); + return 0; +} + +static int cfq_print_leaf_weight(struct seq_file *sf, void *v) +{ + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); + return 0; +} + +static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool is_leaf_weight) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; struct cfq_group *cfqg; + int ret; - hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { - /* - * If cgroup removal path got to blk_group first and removed - * it from cgroup list, then it will take care of destroying - * cfqg also. 
- */ - if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) - cfq_destroy_cfqg(cfqd, cfqg); + ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + cfqg = blkg_to_cfqg(ctx.blkg); + if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + if (!is_leaf_weight) { + cfqg->dev_weight = ctx.v; + cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + } else { + cfqg->dev_leaf_weight = ctx.v; + cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + } + ret = 0; } + + blkg_conf_finish(&ctx); + return ret ?: nbytes; } -/* - * Blk cgroup controller notification saying that blkio_group object is being - * delinked as associated cgroup object is going away. That also means that - * no new IO will come in this group. So get rid of this group as soon as - * any pending IO in the group is finished. - * - * This function is called under rcu_read_lock(). key is the rcu protected - * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu - * read lock. - * - * "key" was fetched from blkio_group under blkio_cgroup->lock. That means - * it should not be NULL as even if elevator was exiting, cgroup deltion - * path got to it first. 
- */ -void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - unsigned long flags; - struct cfq_data *cfqd = key; + return __cfqg_set_weight_device(of, buf, nbytes, off, false); +} - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cfqg_set_weight_device(of, buf, nbytes, off, true); } -#else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val, bool is_leaf_weight) { - return &cfqd->root_group; + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) + return -EINVAL; + + spin_lock_irq(&blkcg->lock); + + if (!is_leaf_weight) + blkcg->cfq_weight = val; + else + blkcg->cfq_leaf_weight = val; + + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + + if (!cfqg) + continue; + + if (!is_leaf_weight) { + if (!cfqg->dev_weight) + cfqg->new_weight = blkcg->cfq_weight; + } else { + if (!cfqg->dev_leaf_weight) + cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + } + } + + spin_unlock_irq(&blkcg->lock); + return 0; } -static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - return cfqg; + return __cfq_set_weight(css, cft, val, false); +} + +static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return __cfq_set_weight(css, cft, val, true); +} + +static int cfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, 
css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_cfq, seq_cft(sf)->private, false); + return 0; +} + +static int cfqg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_cfq, seq_cft(sf)->private, true); + return 0; +} + +static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = cfqg_stat_pd_recursive_sum(pd, off); + + return __blkg_prfill_u64(sf, pd, sum); +} + +static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); + + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_stat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, false); + return 0; +} + +static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, true); + return 0; +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); + u64 v = 0; + + if (samples) { + v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); + v = div64_u64(v, samples); + } + __blkg_prfill_u64(sf, pd, v); + return 0; +} + +/* print avg_queue_size */ +static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, + 0, false); + return 0; +} +#endif /* CONFIG_DEBUG_BLK_CGROUP */ + +static struct cftype cfq_blkcg_files[] = { + /* on root, weight is mapped to leaf_weight */ + { + .name = "weight_device", + .flags = 
CFTYPE_ONLY_ON_ROOT, + .seq_show = cfqg_print_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, + }, + { + .name = "weight", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cfq_print_leaf_weight, + .write_u64 = cfq_set_leaf_weight, + }, + + /* no such mapping necessary for !roots */ + { + .name = "weight_device", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfqg_print_weight_device, + .write = cfqg_set_weight_device, + }, + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight, + .write_u64 = cfq_set_weight, + }, + + { + .name = "leaf_weight_device", + .seq_show = cfqg_print_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, + }, + { + .name = "leaf_weight", + .seq_show = cfq_print_leaf_weight, + .write_u64 = cfq_set_leaf_weight, + }, + + /* statistics, covers only the tasks in the cfqg */ + { + .name = "time", + .private = offsetof(struct cfq_group, stats.time), + .seq_show = cfqg_print_stat, + }, + { + .name = "sectors", + .private = offsetof(struct cfq_group, stats.sectors), + .seq_show = cfqg_print_stat, + }, + { + .name = "io_service_bytes", + .private = offsetof(struct cfq_group, stats.service_bytes), + .seq_show = cfqg_print_rwstat, + }, + { + .name = "io_serviced", + .private = offsetof(struct cfq_group, stats.serviced), + .seq_show = cfqg_print_rwstat, + }, + { + .name = "io_service_time", + .private = offsetof(struct cfq_group, stats.service_time), + .seq_show = cfqg_print_rwstat, + }, + { + .name = "io_wait_time", + .private = offsetof(struct cfq_group, stats.wait_time), + .seq_show = cfqg_print_rwstat, + }, + { + .name = "io_merged", + .private = offsetof(struct cfq_group, stats.merged), + .seq_show = cfqg_print_rwstat, + }, + { + .name = "io_queued", + .private = offsetof(struct cfq_group, stats.queued), + .seq_show = cfqg_print_rwstat, + }, + + /* the same statictics which cover the cfqg and its descendants */ + { + .name = "time_recursive", + .private = offsetof(struct cfq_group, stats.time), + .seq_show 
= cfqg_print_stat_recursive, + }, + { + .name = "sectors_recursive", + .private = offsetof(struct cfq_group, stats.sectors), + .seq_show = cfqg_print_stat_recursive, + }, + { + .name = "io_service_bytes_recursive", + .private = offsetof(struct cfq_group, stats.service_bytes), + .seq_show = cfqg_print_rwstat_recursive, + }, + { + .name = "io_serviced_recursive", + .private = offsetof(struct cfq_group, stats.serviced), + .seq_show = cfqg_print_rwstat_recursive, + }, + { + .name = "io_service_time_recursive", + .private = offsetof(struct cfq_group, stats.service_time), + .seq_show = cfqg_print_rwstat_recursive, + }, + { + .name = "io_wait_time_recursive", + .private = offsetof(struct cfq_group, stats.wait_time), + .seq_show = cfqg_print_rwstat_recursive, + }, + { + .name = "io_merged_recursive", + .private = offsetof(struct cfq_group, stats.merged), + .seq_show = cfqg_print_rwstat_recursive, + }, + { + .name = "io_queued_recursive", + .private = offsetof(struct cfq_group, stats.queued), + .seq_show = cfqg_print_rwstat_recursive, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "avg_queue_size", + .seq_show = cfqg_print_avg_queue_size, + }, + { + .name = "group_wait_time", + .private = offsetof(struct cfq_group, stats.group_wait_time), + .seq_show = cfqg_print_stat, + }, + { + .name = "idle_time", + .private = offsetof(struct cfq_group, stats.idle_time), + .seq_show = cfqg_print_stat, + }, + { + .name = "empty_time", + .private = offsetof(struct cfq_group, stats.empty_time), + .seq_show = cfqg_print_stat, + }, + { + .name = "dequeue", + .private = offsetof(struct cfq_group, stats.dequeue), + .seq_show = cfqg_print_stat, + }, + { + .name = "unaccounted_time", + .private = offsetof(struct cfq_group, stats.unaccounted_time), + .seq_show = cfqg_print_stat, + }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ + { } /* terminate */ +}; +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) +{ + return 
cfqd->root_group; } static inline void @@ -1166,9 +1999,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; } -static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} -static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} - #endif /* GROUP_IOSCHED */ /* @@ -1182,42 +2012,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; - struct cfq_rb_root *service_tree; + struct cfq_rb_root *st; int left; int new_cfqq = 1; - int group_changed = 0; - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD - && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { - /* Move this cfq to root group */ - cfq_log_cfqq(cfqd, cfqq, "moving to root group"); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfqq->orig_cfqg = cfqq->cfqg; - cfqq->cfqg = &cfqd->root_group; - cfqd->root_group.ref++; - group_changed = 1; - } else if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { - /* cfqq is sequential now needs to go to its original group */ - BUG_ON(cfqq->cfqg != &cfqd->root_group); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfq_put_cfqg(cfqq->cfqg); - cfqq->cfqg = cfqq->orig_cfqg; - cfqq->orig_cfqg = NULL; - group_changed = 1; - cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); - } -#endif - service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), - cfqq_type(cfqq)); + st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&service_tree->rb); + parent = rb_last(&st->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -1235,7 +2037,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct 
cfq_queue *cfqq, cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(service_tree); + __cfqq = cfq_rb_first(st); rb_key += __cfqq ? __cfqq->rb_key : jiffies; } @@ -1244,8 +2046,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key && - cfqq->service_tree == service_tree) + if (rb_key == cfqq->rb_key && cfqq->service_tree == st) return; cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); @@ -1254,11 +2055,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, left = 1; parent = NULL; - cfqq->service_tree = service_tree; - p = &service_tree->rb.rb_node; + cfqq->service_tree = st; + p = &st->rb.rb_node; while (*p) { - struct rb_node **n; - parent = *p; __cfqq = rb_entry(parent, struct cfq_queue, rb_node); @@ -1266,25 +2065,23 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, * sort by key, that represents service time. 
*/ if (time_before(rb_key, __cfqq->rb_key)) - n = &(*p)->rb_left; + p = &parent->rb_left; else { - n = &(*p)->rb_right; + p = &parent->rb_right; left = 0; } - - p = n; } if (left) - service_tree->left = &cfqq->rb_node; + st->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &service_tree->rb); - service_tree->count++; - if ((add_front || !new_cfqq) && !group_changed) + rb_insert_color(&cfqq->rb_node, &st->rb); + st->count++; + if (add_front || !new_cfqq) return; - cfq_group_service_tree_add(cfqd, cfqq->cfqg); + cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -1372,6 +2169,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + if (cfq_cfqq_sync(cfqq)) + cfqd->busy_sync_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -1395,9 +2194,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } - cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_group_notify_queue_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_cfqq_sync(cfqq)) + cfqd->busy_sync_queues--; } /* @@ -1430,16 +2231,11 @@ static void cfq_add_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - struct request *__alias, *prev; + struct request *prev; cfqq->queued[rq_is_sync(rq)]++; - /* - * looks a little odd, but the first insert might return an alias. 
- * if that happens, put the alias on the dispatch list - */ - while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) - cfq_dispatch_insert(cfqd->queue, __alias); + elv_rb_add(&cfqq->sort_list, rq); if (!cfq_cfqq_on_rr(cfqq)) cfq_add_cfqq_rr(cfqd, cfqq); @@ -1463,19 +2259,17 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); - cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, - &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, + rq->cmd_flags); } static struct request * cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) { struct task_struct *tsk = current; - struct cfq_io_context *cic; + struct cfq_io_cq *cic; struct cfq_queue *cfqq; cic = cfq_cic_lookup(cfqd, tsk->io_context); @@ -1483,11 +2277,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) return NULL; cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); - if (cfqq) { - sector_t sector = bio->bi_sector + bio_sectors(bio); - - return elv_rb_find(&cfqq->sort_list, sector); - } + if (cfqq) + return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); return NULL; } @@ -1524,11 +2315,10 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(rq), rq_is_sync(rq)); - if (rq->cmd_flags & REQ_META) { - WARN_ON(!cfqq->meta_pending); - cfqq->meta_pending--; + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); + if (rq->cmd_flags & REQ_PRIO) { + WARN_ON(!cfqq->prio_pending); + cfqq->prio_pending--; } } @@ -1560,8 +2350,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void 
cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, - bio_data_dir(bio), cfq_bio_sync(bio)); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); } static void @@ -1569,27 +2358,39 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { struct cfq_queue *cfqq = RQ_CFQQ(rq); + struct cfq_data *cfqd = q->elevator->elevator_data; + /* * reposition in fifo if next is older than rq */ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - time_before(rq_fifo_time(next), rq_fifo_time(rq))) { + time_before(next->fifo_time, rq->fifo_time) && + cfqq == RQ_CFQQ(next)) { list_move(&rq->queuelist, &next->queuelist); - rq_set_fifo_time(rq, rq_fifo_time(next)); + rq->fifo_time = next->fifo_time; } if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(next), rq_is_sync(next)); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); + + cfqq = RQ_CFQQ(next); + /* + * all requests of this queue are merged to other queues, delete it + * from the service tree. If it's the active_queue, + * cfq_dispatch_requests() will choose to expire it or do idle + */ + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && + cfqq != cfqd->active_queue) + cfq_del_cfqq_rr(cfqd, cfqq); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; + struct cfq_io_cq *cic; struct cfq_queue *cfqq; /* @@ -1599,7 +2400,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, return false; /* - * Lookup the cfqq that this bio will be queued with. Allow + * Lookup the cfqq that this bio will be queued with and allow * merge only if rq is queued there. 
*/ cic = cfq_cic_lookup(cfqd, current->io_context); @@ -1613,16 +2414,16 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { del_timer(&cfqd->idle_slice_timer); - cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); + cfqg_stats_update_idle_time(cfqq->cfqg); } static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", - cfqd->serving_prio, cfqd->serving_type); - cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", + cfqd->serving_wl_class, cfqd->serving_wl_type); + cfqg_stats_update_avg_queue_size(cfqq->cfqg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -1688,7 +2489,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->ioc); + put_io_context(cfqd->active_cic->icq.ioc); cfqd->active_cic = NULL; } } @@ -1707,19 +2508,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_group, cfqd->serving_prio, - cfqd->serving_type); + struct cfq_rb_root *st = st_for(cfqd->serving_group, + cfqd->serving_wl_class, cfqd->serving_wl_type); if (!cfqd->rq_queued) return NULL; /* There is nothing to dispatch */ - if (!service_tree) + if (!st) return NULL; - if (RB_EMPTY_ROOT(&service_tree->rb)) + if (RB_EMPTY_ROOT(&st->rb)) return NULL; - return cfq_rb_first(service_tree); + return cfq_rb_first(st); } static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) @@ -1875,17 +2675,17 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, static bool cfq_should_idle(struct 
cfq_data *cfqd, struct cfq_queue *cfqq) { - enum wl_prio_t prio = cfqq_prio(cfqq); - struct cfq_rb_root *service_tree = cfqq->service_tree; + enum wl_class_t wl_class = cfqq_class(cfqq); + struct cfq_rb_root *st = cfqq->service_tree; - BUG_ON(!service_tree); - BUG_ON(!service_tree->count); + BUG_ON(!st); + BUG_ON(!st->count); if (!cfqd->cfq_slice_idle) return false; /* We never do for idle class queues. */ - if (prio == IDLE_WORKLOAD) + if (wl_class == IDLE_WORKLOAD) return false; /* We do for queues that were marked with idle window flag. */ @@ -1897,17 +2697,17 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + if (st->count == 1 && cfq_cfqq_sync(cfqq) && + !cfq_io_thinktime_big(cfqd, &st->ttime, false)) return true; - cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", - service_tree->count); + cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); return false; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; - struct cfq_io_context *cic; + struct cfq_io_cq *cic; unsigned long sl, group_idle = 0; /* @@ -1942,7 +2742,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * task has exited, don't wait */ cic = cfqd->active_cic; - if (!cic || !atomic_read(&cic->ioc->nr_tasks)) + if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) return; /* @@ -1950,10 +2750,10 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * slice, then don't idle. This avoids overrunning the allotted * time slice. */ - if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", - cic->ttime_mean); + if (sample_valid(cic->ttime.ttime_samples) && + (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) { + cfq_log_cfqq(cfqd, cfqq, "Not idling. 
think_time:%lu", + cic->ttime.ttime_mean); return; } @@ -1969,7 +2769,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); + cfqg_stats_set_start_idle_time(cfqq->cfqg); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, group_idle ? 1 : 0); } @@ -1992,8 +2792,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -2012,7 +2811,7 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) return NULL; rq = rq_entry_fifo(cfqq->fifo.next); - if (time_before(jiffies, rq_fifo_time(rq))) + if (time_before(jiffies, rq->fifo_time)) rq = NULL; cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); @@ -2026,7 +2825,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); + return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); } /* @@ -2084,8 +2883,8 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } -static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, - struct cfq_group *cfqg, enum wl_prio_t prio) +static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_class_t wl_class) { struct cfq_queue *queue; int i; @@ -2095,7 +2894,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, for (i = 0; i <= SYNC_WORKLOAD; ++i) { /* select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); + queue = cfq_rb_first(st_for(cfqg, wl_class, i)); if (queue && (!key_valid || time_before(queue->rb_key, 
lowest_key))) { lowest_key = queue->rb_key; @@ -2107,26 +2906,27 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, return cur_best; } -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) +static void +choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) { unsigned slice; unsigned count; struct cfq_rb_root *st; unsigned group_slice; - enum wl_prio_t original_prio = cfqd->serving_prio; + enum wl_class_t original_class = cfqd->serving_wl_class; /* Choose next priority. RT > BE > IDLE */ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) - cfqd->serving_prio = RT_WORKLOAD; + cfqd->serving_wl_class = RT_WORKLOAD; else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) - cfqd->serving_prio = BE_WORKLOAD; + cfqd->serving_wl_class = BE_WORKLOAD; else { - cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->serving_wl_class = IDLE_WORKLOAD; cfqd->workload_expires = jiffies + 1; return; } - if (original_prio != cfqd->serving_prio) + if (original_class != cfqd->serving_wl_class) goto new_workload; /* @@ -2134,7 +2934,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload * expiration time */ - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); count = st->count; /* @@ -2145,9 +2945,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) new_workload: /* otherwise select new workload type */ - cfqd->serving_type = - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, + cfqd->serving_wl_class); + st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); count = st->count; /* @@ -2158,10 +2958,11 @@ new_workload: group_slice = cfq_group_slice(cfqd, cfqg); slice = group_slice * count / - 
max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], - cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], + cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, + cfqg)); - if (cfqd->serving_type == ASYNC_WORKLOAD) { + if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { unsigned int tmp; /* @@ -2171,7 +2972,8 @@ new_workload: * to have higher weight. A more accurate thing would be to * calculate system wide asnc/sync ratio. */ - tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); + tmp = cfqd->cfq_target_latency * + cfqg_busy_async_queues(cfqd, cfqg); tmp = tmp/cfqd->busy_queues; slice = min_t(unsigned, slice, tmp); @@ -2206,14 +3008,14 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) cfqd->serving_group = cfqg; /* Restore the workload type data */ - if (cfqg->saved_workload_slice) { - cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; - cfqd->serving_type = cfqg->saved_workload; - cfqd->serving_prio = cfqg->saved_serving_prio; + if (cfqg->saved_wl_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_wl_slice; + cfqd->serving_wl_type = cfqg->saved_wl_type; + cfqd->serving_wl_class = cfqg->saved_wl_class; } else cfqd->workload_expires = jiffies - 1; - choose_service_tree(cfqd, cfqg); + choose_wl_class_and_type(cfqd, cfqg); } /* @@ -2309,8 +3111,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * this group, wait for requests to complete. */ check_group_idle: - if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 - && cfqq->cfqg->dispatched) { + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && + cfqq->cfqg->dispatched && + !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { cfqq = NULL; goto keep_queue; } @@ -2405,6 +3208,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Does this cfqq already have too much IO in flight? 
*/ if (cfqq->dispatched >= max_dispatch) { + bool promote_sync = false; /* * idle queue must always only have a single IO in flight */ @@ -2412,15 +3216,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* + * If there is only one sync queue + * we can ignore async queue here and give the sync + * queue no dispatch limit. The reason is a sync queue can + * preempt async queue, limiting the sync queue doesn't make + * sense. This is useful for aiostress test. + */ + if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) + promote_sync = true; + + /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && + !promote_sync) return false; /* * Sole queue user, no limit */ - if (cfqd->busy_queues == 1) + if (cfqd->busy_queues == 1 || promote_sync) max_dispatch = -1; else /* @@ -2480,9 +3295,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_dispatch_insert(cfqd->queue, rq); if (!cfqd->active_cic) { - struct cfq_io_context *cic = RQ_CIC(rq); + struct cfq_io_cq *cic = RQ_CIC(rq); - atomic_long_inc(&cic->ioc->refcount); + atomic_long_inc(&cic->icq.ioc->refcount); cfqd->active_cic = cic; } @@ -2542,7 +3357,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg, *orig_cfqg; + struct cfq_group *cfqg; BUG_ON(cfqq->ref <= 0); @@ -2554,7 +3369,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); cfqg = cfqq->cfqg; - orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2563,95 +3377,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); 
kmem_cache_free(cfq_pool, cfqq); - cfq_put_cfqg(cfqg); - if (orig_cfqg) - cfq_put_cfqg(orig_cfqg); -} - -/* - * Must always be called with the rcu_read_lock() held - */ -static void -__call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) -{ - struct cfq_io_context *cic; - struct hlist_node *n; - - hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) - func(ioc, cic); -} - -/* - * Call func for each cic attached to this ioc. - */ -static void -call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) -{ - rcu_read_lock(); - __call_for_each_cic(ioc, func); - rcu_read_unlock(); -} - -static void cfq_cic_free_rcu(struct rcu_head *head) -{ - struct cfq_io_context *cic; - - cic = container_of(head, struct cfq_io_context, rcu_head); - - kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(cfq_ioc_count); - - if (ioc_gone) { - /* - * CFQ scheduler is exiting, grab exit lock and check - * the pending io context count. If it hits zero, - * complete ioc_gone and set it back to NULL - */ - spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { - complete(ioc_gone); - ioc_gone = NULL; - } - spin_unlock(&ioc_gone_lock); - } -} - -static void cfq_cic_free(struct cfq_io_context *cic) -{ - call_rcu(&cic->rcu_head, cfq_cic_free_rcu); -} - -static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) -{ - unsigned long flags; - unsigned long dead_key = (unsigned long) cic->key; - - BUG_ON(!(dead_key & CIC_DEAD_KEY)); - - spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); - hlist_del_rcu(&cic->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - cfq_cic_free(cic); -} - -/* - * Must be called with rcu_read_lock() held or preemption otherwise disabled. 
- * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), - * and ->trim() which is called with the task lock held - */ -static void cfq_free_io_context(struct io_context *ioc) -{ - /* - * ioc->refcount is zero here, or we are called from elv_unregister(), - * so no more cic's are allowed to be linked into this ioc. So it - * should be ok to iterate over the known list, we will see all cic's - * since no new ones are added. - */ - __call_for_each_cic(ioc, cic_free_func); + cfqg_put(cfqg); } static void cfq_put_cooperator(struct cfq_queue *cfqq) @@ -2687,21 +3413,17 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_put_queue(cfqq); } -static void __cfq_exit_single_io_context(struct cfq_data *cfqd, - struct cfq_io_context *cic) +static void cfq_init_icq(struct io_cq *icq) { - struct io_context *ioc = cic->ioc; - - list_del_init(&cic->queue_list); + struct cfq_io_cq *cic = icq_to_cic(icq); - /* - * Make sure dead mark is seen for dead queues - */ - smp_wmb(); - cic->key = cfqd_dead_key(cfqd); + cic->ttime.last_end_request = jiffies; +} - if (ioc->ioc_data == cic) - rcu_assign_pointer(ioc->ioc_data, NULL); +static void cfq_exit_icq(struct io_cq *icq) +{ + struct cfq_io_cq *cic = icq_to_cic(icq); + struct cfq_data *cfqd = cic_to_cfqd(cic); if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); @@ -2714,58 +3436,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, } } -static void cfq_exit_single_io_context(struct io_context *ioc, - struct cfq_io_context *cic) -{ - struct cfq_data *cfqd = cic_to_cfqd(cic); - - if (cfqd) { - struct request_queue *q = cfqd->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * Ensure we get a fresh copy of the ->key to prevent - * race between exiting task and queue - */ - smp_read_barrier_depends(); - if (cic->key == cfqd) - __cfq_exit_single_io_context(cfqd, cic); - - spin_unlock_irqrestore(q->queue_lock, flags); - 
} -} - -/* - * The process that ioc belongs to has exited, we need to clean up - * and put the internal structures we have that belongs to that process. - */ -static void cfq_exit_io_context(struct io_context *ioc) -{ - call_for_each_cic(ioc, cfq_exit_single_io_context); -} - -static struct cfq_io_context * -cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) -{ - struct cfq_io_context *cic; - - cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, - cfqd->queue->node); - if (cic) { - cic->last_end_request = jiffies; - INIT_LIST_HEAD(&cic->queue_list); - INIT_HLIST_NODE(&cic->cic_list); - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(cfq_ioc_count); - } - - return cic; -} - -static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) +static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) { struct task_struct *tsk = current; int ioprio_class; @@ -2773,7 +3444,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) if (!cfq_cfqq_prio_changed(cfqq)) return; - ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); @@ -2785,11 +3456,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfqq->ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(ioc); + cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cfqq->ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(ioc); + cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: @@ -2804,26 +3475,27 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) * elevate the priority of this queue */ cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; 
cfq_clear_cfqq_prio_changed(cfqq); } -static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) +static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) { + int ioprio = cic->icq.ioc->ioprio; struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; - unsigned long flags; - if (unlikely(!cfqd)) + /* + * Check whether ioprio has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - cfqq = cic->cfqq[BLK_RW_ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, - GFP_ATOMIC); + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, + GFP_ATOMIC); if (new_cfqq) { cic->cfqq[BLK_RW_ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -2834,13 +3506,7 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -} - -static void cfq_ioc_set_ioprio(struct io_context *ioc) -{ - call_for_each_cic(ioc, changed_ioprio); - ioc->ioprio_changed = 0; + cic->ioprio = ioprio; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -2864,20 +3530,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { - struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); struct cfq_data *cfqd = cic_to_cfqd(cic); - unsigned long flags; - struct request_queue *q; + struct cfq_queue *sync_cfqq; + uint64_t id; - if (unlikely(!cfqd)) - return; - - q = cfqd->queue; + rcu_read_lock(); + id = bio_blkcg(bio)->id; + rcu_read_unlock(); - spin_lock_irqsave(q->queue_lock, flags); + /* + * Check whether blkcg has 
changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) + return; + sync_cfqq = cic_to_cfqq(cic, 1); if (sync_cfqq) { /* * Drop reference to sync queue. A new sync queue will be @@ -2888,28 +3558,25 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) cfq_put_queue(sync_cfqq); } - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void cfq_ioc_set_cgroup(struct io_context *ioc) -{ - call_for_each_cic(ioc, changed_cgroup); - ioc->cgroup_changed = 0; + cic->blkcg_id = id; } +#else +static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, - struct io_context *ioc, gfp_t gfp_mask) +cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, + struct bio *bio, gfp_t gfp_mask) { + struct blkcg *blkcg; struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_io_context *cic; struct cfq_group *cfqg; retry: - cfqg = cfq_get_cfqg(cfqd, 1); - cic = cfq_cic_lookup(cfqd, ioc); - /* cic always exists here */ + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); cfqq = cic_to_cfqq(cic, is_sync); /* @@ -2922,6 +3589,7 @@ retry: cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { + rcu_read_unlock(); spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask | __GFP_ZERO, @@ -2929,6 +3597,8 @@ retry: spin_lock_irq(cfqd->queue->queue_lock); if (new_cfqq) goto retry; + else + return &cfqd->oom_cfqq; } else { cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask | __GFP_ZERO, @@ -2937,7 +3607,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, ioc); + cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else 
@@ -2947,6 +3617,7 @@ retry: if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); + rcu_read_unlock(); return cfqq; } @@ -2956,6 +3627,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) switch (ioprio_class) { case IOPRIO_CLASS_RT: return &cfqd->async_cfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ case IOPRIO_CLASS_BE: return &cfqd->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: @@ -2966,11 +3640,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, - gfp_t gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, + struct bio *bio, gfp_t gfp_mask) { - const int ioprio = task_ioprio(ioc); - const int ioprio_class = task_ioprio_class(ioc); + const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; @@ -2980,7 +3654,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, } if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -2994,161 +3668,29 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, return cfqq; } -/* - * We drop cfq io contexts lazily, so we may find a dead one. 
- */ static void -cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, - struct cfq_io_context *cic) +__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) { - unsigned long flags; - - WARN_ON(!list_empty(&cic->queue_list)); - BUG_ON(cic->key != cfqd_dead_key(cfqd)); - - spin_lock_irqsave(&ioc->lock, flags); - - BUG_ON(ioc->ioc_data == cic); + unsigned long elapsed = jiffies - ttime->last_end_request; + elapsed = min(elapsed, 2UL * slice_idle); - radix_tree_delete(&ioc->radix_root, cfqd->cic_index); - hlist_del_rcu(&cic->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - cfq_cic_free(cic); + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; + ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8; + ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples; } -static struct cfq_io_context * -cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) -{ - struct cfq_io_context *cic; - unsigned long flags; - - if (unlikely(!ioc)) - return NULL; - - rcu_read_lock(); - - /* - * we maintain a last-hit cache, to avoid browsing over the tree - */ - cic = rcu_dereference(ioc->ioc_data); - if (cic && cic->key == cfqd) { - rcu_read_unlock(); - return cic; - } - - do { - cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); - rcu_read_unlock(); - if (!cic) - break; - if (unlikely(cic->key != cfqd)) { - cfq_drop_dead_cic(cfqd, ioc, cic); - rcu_read_lock(); - continue; - } - - spin_lock_irqsave(&ioc->lock, flags); - rcu_assign_pointer(ioc->ioc_data, cic); - spin_unlock_irqrestore(&ioc->lock, flags); - break; - } while (1); - - return cic; -} - -/* - * Add cic into ioc, using cfqd as the search key. This enables us to lookup - * the process specific cfq io context when entered from the block layer. - * Also adds the cic to a per-cfqd list, used when this queue is removed. 
- */ -static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, - struct cfq_io_context *cic, gfp_t gfp_mask) +static void +cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_io_cq *cic) { - unsigned long flags; - int ret; - - ret = radix_tree_preload(gfp_mask); - if (!ret) { - cic->ioc = ioc; - cic->key = cfqd; - - spin_lock_irqsave(&ioc->lock, flags); - ret = radix_tree_insert(&ioc->radix_root, - cfqd->cic_index, cic); - if (!ret) - hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); - spin_unlock_irqrestore(&ioc->lock, flags); - - radix_tree_preload_end(); - - if (!ret) { - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - list_add(&cic->queue_list, &cfqd->cic_list); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); - } + if (cfq_cfqq_sync(cfqq)) { + __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); + __cfq_update_io_thinktime(&cfqq->service_tree->ttime, + cfqd->cfq_slice_idle); } - - if (ret) - printk(KERN_ERR "cfq: cic link failed!\n"); - - return ret; -} - -/* - * Setup general io context and cfq io context. There can be several cfq - * io contexts per general io context, if this process is doing io to more - * than one device managed by cfq. 
- */ -static struct cfq_io_context * -cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) -{ - struct io_context *ioc = NULL; - struct cfq_io_context *cic; - - might_sleep_if(gfp_mask & __GFP_WAIT); - - ioc = get_io_context(gfp_mask, cfqd->queue->node); - if (!ioc) - return NULL; - - cic = cfq_cic_lookup(cfqd, ioc); - if (cic) - goto out; - - cic = cfq_alloc_io_context(cfqd, gfp_mask); - if (cic == NULL) - goto err; - - if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) - goto err_free; - -out: - smp_read_barrier_depends(); - if (unlikely(ioc->ioprio_changed)) - cfq_ioc_set_ioprio(ioc); - #ifdef CONFIG_CFQ_GROUP_IOSCHED - if (unlikely(ioc->cgroup_changed)) - cfq_ioc_set_cgroup(ioc); + __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); #endif - return cic; -err_free: - cfq_cic_free(cic); -err: - put_io_context(ioc); - return NULL; -} - -static void -cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) -{ - unsigned long elapsed = jiffies - cic->last_end_request; - unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); - - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; } static void @@ -3177,7 +3719,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, */ static void cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_io_context *cic) + struct cfq_io_cq *cic) { int old_idle, enable_idle; @@ -3194,11 +3736,12 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) enable_idle = 0; - else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) + else if (!atomic_read(&cic->icq.ioc->active_ref) || + !cfqd->cfq_slice_idle || + (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; - else if 
(sample_valid(cic->ttime_samples)) { - if (cic->ttime_mean > cfqd->cfq_slice_idle) + else if (sample_valid(cic->ttime.ttime_samples)) { + if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -3253,7 +3796,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, return true; /* Allow preemption only if we are idling on sync-noidle tree */ - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && new_cfqq->service_tree->count == 2 && RB_EMPTY_ROOT(&cfqq->sort_list)) @@ -3263,7 +3806,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. */ - if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) + if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) return true; /* @@ -3295,7 +3838,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_queue *old_cfqq = cfqd->active_queue; + enum wl_type_t old_type = cfqq_type(cfqd->active_queue); cfq_log_cfqq(cfqd, cfqq, "preempt"); cfq_slice_expired(cfqd, 1); @@ -3304,8 +3847,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) * workload type is changed, don't save slice, otherwise preempt * doesn't happen */ - if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) - cfqq->cfqg->saved_workload_slice = 0; + if (old_type != cfqq_type(cfqq)) + cfqq->cfqg->saved_wl_slice = 0; /* * Put the new queue at the front of the of the current list, @@ -3327,13 +3870,13 @@ static void cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { - struct cfq_io_context *cic = RQ_CIC(rq); + struct cfq_io_cq *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq->cmd_flags & REQ_META) - 
cfqq->meta_pending++; + if (rq->cmd_flags & REQ_PRIO) + cfqq->prio_pending++; - cfq_update_io_thinktime(cfqd, cic); + cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); cfq_update_idle_window(cfqd, cfqq, cic); @@ -3355,10 +3898,9 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->busy_queues > 1) { cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); } else { - cfq_blkiocg_update_idle_time_stats( - &cfqq->cfqg->blkg); + cfqg_stats_update_idle_time(cfqq->cfqg); cfq_mark_cfqq_must_dispatch(cfqq); } } @@ -3370,7 +3912,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); } } @@ -3380,14 +3922,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_log_cfqq(cfqd, cfqq, "insert_request"); - cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); + cfq_init_prio_data(cfqq, RQ_CIC(rq)); - rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); + rq->fifo_time = jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, - &cfqd->serving_group->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, + rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -3430,7 +3971,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_io_context *cic = cfqd->active_cic; + struct cfq_io_cq *cic = cfqd->active_cic; /* If the queue already has requests, don't wait */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) @@ -3440,12 +3981,16 @@ static bool cfq_should_wait_busy(struct cfq_data 
*cfqd, struct cfq_queue *cfqq) if (cfqq->cfqg->nr_cfqq > 1) return false; + /* the only queue in the group, but think time is big */ + if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) + return false; + if (cfq_slice_used(cfqq)) return true; /* if slice left is less than think time, wait busy */ - if (cic && sample_valid(cic->ttime_samples) - && (cfqq->slice_end - jiffies < cic->ttime_mean)) + if (cic && sample_valid(cic->ttime.ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) return true; /* @@ -3479,18 +4024,31 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_driver--; cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; - cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, - rq_start_time_ns(rq), rq_io_start_time_ns(rq), - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { - RQ_CIC(rq)->last_end_request = now; + struct cfq_rb_root *st; + + RQ_CIC(rq)->ttime.last_end_request = now; + + if (cfq_cfqq_on_rr(cfqq)) + st = cfqq->service_tree; + else + st = st_for(cfqq->cfqg, cfqq_class(cfqq), + cfqq_type(cfqq)); + + st->ttime.last_end_request = now; if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) cfqd->last_delayed_sync = now; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED + cfqq->cfqg->ttime.last_end_request = now; +#endif + /* * If this is the active queue, check if it needs to be expired, * or if we want to idle in case it has no pending requests. @@ -3536,30 +4094,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -/* - * we temporarily boost lower priority queues if they are holding fs exclusive - * resources. 
they are boosted to normal prio (CLASS_BE/4) - */ -static void cfq_prio_boost(struct cfq_queue *cfqq) -{ - if (has_fs_excl()) { - /* - * boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; - } else { - /* - * unboost the queue (if needed) - */ - cfqq->ioprio_class = cfqq->org_ioprio_class; - cfqq->ioprio = cfqq->org_ioprio; - } -} - static inline int __cfq_may_queue(struct cfq_queue *cfqq) { if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { @@ -3574,7 +4108,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; - struct cfq_io_context *cic; + struct cfq_io_cq *cic; struct cfq_queue *cfqq; /* @@ -3589,8 +4123,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { - cfq_init_prio_data(cfqq, cic->ioc); - cfq_prio_boost(cfqq); + cfq_init_prio_data(cfqq, cic); return __cfq_may_queue(cfqq); } @@ -3611,21 +4144,17 @@ static void cfq_put_request(struct request *rq) BUG_ON(!cfqq->allocated[rw]); cfqq->allocated[rw]--; - put_io_context(RQ_CIC(rq)->ioc); - - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - /* Put down rq reference on cfqg */ - cfq_put_cfqg(RQ_CFQG(rq)); - rq->elevator_private3 = NULL; + cfqg_put(RQ_CFQG(rq)); + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; cfq_put_queue(cfqq); } } static struct cfq_queue * -cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, +cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, struct cfq_queue *cfqq) { cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); @@ -3640,7 +4169,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, * was the last process referring to said cfqq. 
*/ static struct cfq_queue * -split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) +split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) { if (cfqq_process_refs(cfqq) == 1) { cfqq->pid = current->pid; @@ -3660,28 +4189,25 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) * Allocate cfq data structures associated with this request. */ static int -cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, + gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_io_context *cic; + struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - unsigned long flags; might_sleep_if(gfp_mask & __GFP_WAIT); - cic = cfq_get_io_context(cfqd, gfp_mask); - - spin_lock_irqsave(q->queue_lock, flags); - - if (!cic) - goto queue_fail; + spin_lock_irq(q->queue_lock); + check_ioprio_changed(cic, bio); + check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -3705,23 +4231,13 @@ new_queue: } cfqq->allocated[rw]++; - cfqq->ref++; - rq->elevator_private = cic; - rq->elevator_private2 = cfqq; - rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); - - spin_unlock_irqrestore(q->queue_lock, flags); + cfqq->ref++; + cfqg_get(cfqq->cfqg); + rq->elv.priv[0] = cfqq; + rq->elv.priv[1] = cfqq->cfqg; + spin_unlock_irq(q->queue_lock); return 0; - -queue_fail: - if (cic) - put_io_context(cic->ioc); - - cfq_schedule_dispatch(cfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - cfq_log(cfqd, "set_request fail"); - return 1; } static void cfq_kick_queue(struct work_struct *work) @@ -3731,7 +4247,7 @@ static void cfq_kick_queue(struct 
work_struct *work) struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); spin_unlock_irq(q->queue_lock); } @@ -3812,11 +4328,6 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } -static void cfq_cfqd_free(struct rcu_head *head) -{ - kfree(container_of(head, struct cfq_data, rcu)); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -3829,92 +4340,65 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - while (!list_empty(&cfqd->cic_list)) { - struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, - struct cfq_io_context, - queue_list); - - __cfq_exit_single_io_context(cfqd, cic); - } - cfq_put_async_queues(cfqd); - cfq_release_cfq_groups(cfqd); - cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); - spin_lock(&cic_index_lock); - ida_remove(&cic_index_ida, cfqd->cic_index); - spin_unlock(&cic_index_lock); - - /* Wait for cfqg->blkg->key accessors to exit their grace periods. 
*/ - call_rcu(&cfqd->rcu, cfq_cfqd_free); -} - -static int cfq_alloc_cic_index(void) -{ - int index, error; - - do { - if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&cic_index_lock); - error = ida_get_new(&cic_index_ida, &index); - spin_unlock(&cic_index_lock); - if (error && error != -EAGAIN) - return error; - } while (error); - - return index; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + blkcg_deactivate_policy(q, &blkcg_policy_cfq); +#else + kfree(cfqd->root_group); +#endif + kfree(cfqd); } -static void *cfq_init_queue(struct request_queue *q) +static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct cfq_data *cfqd; - int i, j; - struct cfq_group *cfqg; - struct cfq_rb_root *st; + struct blkcg_gq *blkg __maybe_unused; + int i, ret; + struct elevator_queue *eq; - i = cfq_alloc_cic_index(); - if (i < 0) - return NULL; + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; - cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!cfqd) - return NULL; + cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); + if (!cfqd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = cfqd; - /* - * Don't need take queue_lock in the routine, since we are - * initializing the ioscheduler, and nobody is using cfqd - */ - cfqd->cic_index = i; + cfqd->queue = q; + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; - /* Init root group */ - cfqg = &cfqd->root_group; - for_each_cfqg_st(cfqg, i, j, st) - *st = CFQ_RB_ROOT; - RB_CLEAR_NODE(&cfqg->rb_node); + /* Init root group and prefer root group over other groups by default */ +#ifdef CONFIG_CFQ_GROUP_IOSCHED + ret = blkcg_activate_policy(q, &blkcg_policy_cfq); + if (ret) + goto out_free; - /* Give preference to root group over other groups */ - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; + cfqd->root_group = blkg_to_cfqg(q->root_blkg); +#else + 
ret = -ENOMEM; + cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), + GFP_KERNEL, cfqd->queue->node); + if (!cfqd->root_group) + goto out_free; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - /* - * Take a reference to root group which we never drop. This is just - * to make sure that cfq_put_cfqg() does not try to kfree root group - */ - cfqg->ref = 1; - rcu_read_lock(); - cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, - (void *)cfqd, 0); - rcu_read_unlock(); + cfq_init_cfqg_base(cfqd->root_group); #endif + cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -3926,15 +4410,17 @@ static void *cfq_init_queue(struct request_queue *q) /* * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. + * will not attempt to free it. oom_cfqq is linked to root_group + * but shouldn't hold a reference as it'll never be unlinked. Lose + * the reference from linking right away. 
*/ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); cfqd->oom_cfqq.ref++; - cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); - - INIT_LIST_HEAD(&cfqd->cic_list); - cfqd->queue = q; + spin_lock_irq(q->queue_lock); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); + cfqg_put(cfqd->root_group); + spin_unlock_irq(q->queue_lock); init_timer(&cfqd->idle_slice_timer); cfqd->idle_slice_timer.function = cfq_idle_slice_timer; @@ -3949,46 +4435,23 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_back_penalty = cfq_back_penalty; cfqd->cfq_slice[0] = cfq_slice_async; cfqd->cfq_slice[1] = cfq_slice_sync; + cfqd->cfq_target_latency = cfq_target_latency; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; - cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; /* * we optimistically start assuming sync ops weren't delayed in last * second, in order to have larger depth for async operations. */ cfqd->last_delayed_sync = jiffies - HZ; - return cfqd; -} - -static void cfq_slab_kill(void) -{ - /* - * Caller already ensured that pending RCU callbacks are completed, - * so we should have no busy allocations at this point. 
- */ - if (cfq_pool) - kmem_cache_destroy(cfq_pool); - if (cfq_ioc_pool) - kmem_cache_destroy(cfq_ioc_pool); -} - -static int __init cfq_slab_setup(void) -{ - cfq_pool = KMEM_CACHE(cfq_queue, 0); - if (!cfq_pool) - goto fail; - - cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); - if (!cfq_ioc_pool) - goto fail; - return 0; -fail: - cfq_slab_kill(); - return -ENOMEM; + +out_free: + kfree(cfqd); + kobject_put(&eq->kobj); + return ret; } /* @@ -3997,7 +4460,7 @@ fail: static ssize_t cfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t @@ -4029,7 +4492,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); -SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); +SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -4063,7 +4526,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); -STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); +STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -4081,7 +4544,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_idle), CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), - CFQ_ATTR(group_isolation), + CFQ_ATTR(target_latency), __ATTR_NULL }; @@ -4096,36 +4559,39 @@ static struct elevator_type iosched_cfq = { .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, .elevator_deactivate_req_fn = cfq_deactivate_request, - 
.elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = cfq_init_icq, + .elevator_exit_icq_fn = cfq_exit_icq, .elevator_set_req_fn = cfq_set_request, .elevator_put_req_fn = cfq_put_request, .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, - .trim = cfq_free_io_context, }, + .icq_size = sizeof(struct cfq_io_cq), + .icq_align = __alignof__(struct cfq_io_cq), .elevator_attrs = cfq_attrs, - .elevator_name = "cfq", + .elevator_name = "cfq", .elevator_owner = THIS_MODULE, }; #ifdef CONFIG_CFQ_GROUP_IOSCHED -static struct blkio_policy_type blkio_policy_cfq = { - .ops = { - .blkio_unlink_group_fn = cfq_unlink_blkio_group, - .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, - }, - .plid = BLKIO_POLICY_PROP, +static struct blkcg_policy blkcg_policy_cfq = { + .pd_size = sizeof(struct cfq_group), + .cftypes = cfq_blkcg_files, + + .pd_init_fn = cfq_pd_init, + .pd_offline_fn = cfq_pd_offline, + .pd_reset_stats_fn = cfq_pd_reset_stats, }; -#else -static struct blkio_policy_type blkio_policy_cfq; #endif static int __init cfq_init(void) { + int ret; + /* * could be 0 on HZ < 1000 setups */ @@ -4137,35 +4603,41 @@ static int __init cfq_init(void) #ifdef CONFIG_CFQ_GROUP_IOSCHED if (!cfq_group_idle) cfq_group_idle = 1; + + ret = blkcg_policy_register(&blkcg_policy_cfq); + if (ret) + return ret; #else - cfq_group_idle = 0; + cfq_group_idle = 0; #endif - if (cfq_slab_setup()) - return -ENOMEM; - elv_register(&iosched_cfq); - blkio_policy_register(&blkio_policy_cfq); + ret = -ENOMEM; + cfq_pool = KMEM_CACHE(cfq_queue, 0); + if (!cfq_pool) + goto err_pol_unreg; + + ret = elv_register(&iosched_cfq); + if (ret) + goto err_free_pool; return 0; + +err_free_pool: + kmem_cache_destroy(cfq_pool); +err_pol_unreg: +#ifdef CONFIG_CFQ_GROUP_IOSCHED + 
blkcg_policy_unregister(&blkcg_policy_cfq); +#endif + return ret; } static void __exit cfq_exit(void) { - DECLARE_COMPLETION_ONSTACK(all_gone); - blkio_policy_unregister(&blkio_policy_cfq); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_cfq); +#endif elv_unregister(&iosched_cfq); - ioc_gone = &all_gone; - /* ioc_gone's update must be visible before reading ioc_count */ - smp_wmb(); - - /* - * this also protects us from entering cfq_slab_kill() with - * pending RCU callbacks - */ - if (elv_ioc_count_read(cfq_ioc_count)) - wait_for_completion(&all_gone); - ida_destroy(&cic_index_ida); - cfq_slab_kill(); + kmem_cache_destroy(cfq_pool); } module_init(cfq_init); diff --git a/block/cfq.h b/block/cfq.h deleted file mode 100644 index 54a6d90f8e8..00000000000 --- a/block/cfq.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef _CFQ_H -#define _CFQ_H -#include "blk-cgroup.h" - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) -{ - blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); -} - -static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) -{ - blkiocg_update_dequeue_stats(blkg, dequeue); -} - -static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) -{ - blkiocg_update_timeslice_used(blkg, time); -} - -static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) -{ - blkiocg_set_start_empty_time(blkg); -} - -static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) -{ - blkiocg_update_io_remove_stats(blkg, direction, sync); -} - -static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) -{ - blkiocg_update_io_merged_stats(blkg, direction, sync); -} - -static inline void cfq_blkiocg_update_idle_time_stats(struct 
blkio_group *blkg) -{ - blkiocg_update_idle_time_stats(blkg); -} - -static inline void -cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) -{ - blkiocg_update_avg_queue_size_stats(blkg); -} - -static inline void -cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) -{ - blkiocg_update_set_idle_time_stats(blkg); -} - -static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) -{ - blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); -} - -static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) -{ - blkiocg_update_completion_stats(blkg, start_time, io_start_time, - direction, sync); -} - -static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); -} - -static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - return blkiocg_del_blkio_group(blkg); -} - -#else /* CFQ_GROUP_IOSCHED */ -static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) {} - -static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) {} - -static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) {} -static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} -static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ -} -static inline void -cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} - -static 
inline void -cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} - -static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) {} -static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} - -static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} -static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - return 0; -} - -#endif /* CFQ_GROUP_IOSCHED */ -#endif diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 00000000000..9dbc67e42a9 --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,254 @@ +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong <caizhiyong@huawei.com> + * + */ +#include <linux/export.h> +#include <linux/cmdline-parser.h> + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); 
+ new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? 
(sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} +EXPORT_SYMBOL(cmdline_parts_free); + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} +EXPORT_SYMBOL(cmdline_parts_parse); + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} +EXPORT_SYMBOL(cmdline_parts_find); + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. 
+ */ +int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } + + return slot; +} +EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index cc3eb78e333..a0926a6094b 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -59,6 +59,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, if (!disk->fops->getgeo) return -ENOTTY; + memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. 
@@ -69,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; @@ -208,19 +209,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode, #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) -struct compat_floppy_struct { - compat_uint_t size; - compat_uint_t sect; - compat_uint_t head; - compat_uint_t track; - compat_uint_t stretch; - unsigned char gap; - unsigned char rate; - unsigned char spec1; - unsigned char fmt_gap; - const compat_caddr_t name; -}; - struct compat_floppy_drive_params { char cmos; compat_ulong_t max_dtr; @@ -288,7 +276,6 @@ struct compat_floppy_write_errors { #define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct) #define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct) -#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) #define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params) #define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params) #define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct) @@ -703,6 +690,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKROSET: case BLKDISCARD: case BLKSECDISCARD: + case BLKZEROOUT: /* * the ones below are implemented in blkdev_locked_ioctl, * but we call blkdev_ioctl, which gets the lock for us @@ -733,6 +721,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKSECTGET: return compat_put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); + case BLKROTATIONAL: + return compat_put_ushort(arg, + !blk_queue_nonrot(bdev_get_queue(bdev))); case BLKRASET: /* compatible, but no compat_ptr (!) 
*/ case BLKFRASET: if (!capable(CAP_SYS_ADMIN)) diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbca7b2..a753df2b3fc 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -77,10 +77,8 @@ static void deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { struct rb_root *root = deadline_rb_root(dd, rq); - struct request *__alias; - while (unlikely(__alias = elv_rb_add(root, rq))) - deadline_move_request(dd, __alias); + elv_rb_add(root, rq); } static inline void @@ -108,7 +106,7 @@ deadline_add_request(struct request_queue *q, struct request *rq) /* * set expire time and add to fifo list */ - rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); + rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); } @@ -134,7 +132,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) * check for front merge */ if (dd->front_merges) { - sector_t sector = bio->bi_sector + bio_sectors(bio); + sector_t sector = bio_end_sector(bio); __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); if (__rq) { @@ -176,9 +174,9 @@ deadline_merged_requests(struct request_queue *q, struct request *req, * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { + if (time_before(next->fifo_time, req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); - rq_set_fifo_time(req, rq_fifo_time(next)); + req->fifo_time = next->fifo_time; } } @@ -232,7 +230,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) /* * rq is expired! 
*/ - if (time_after(jiffies, rq_fifo_time(rq))) + if (time_after_eq(jiffies, rq->fifo_time)) return 1; return 0; @@ -326,14 +324,6 @@ dispatch_request: return 1; } -static int deadline_queue_empty(struct request_queue *q) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - return list_empty(&dd->fifo_list[WRITE]) - && list_empty(&dd->fifo_list[READ]); -} - static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -347,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). */ -static void *deadline_init_queue(struct request_queue *q) +static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; + struct elevator_queue *eq; - dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!dd) - return NULL; + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = dd; INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); @@ -364,7 +362,11 @@ static void *deadline_init_queue(struct request_queue *q) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; - return dd; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + return 0; } /* @@ -445,7 +447,6 @@ static struct elevator_type iosched_deadline = { .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, - .elevator_queue_empty_fn = deadline_queue_empty, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_init_fn = deadline_init_queue, @@ -459,9 +460,7 @@ static struct elevator_type iosched_deadline = { static int __init deadline_init(void) { - 
elv_register(&iosched_deadline); - - return 0; + return elv_register(&iosched_deadline); } static void __exit deadline_exit(void) diff --git a/block/elevator.c b/block/elevator.c index 236e93c1f46..24c28b659bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -31,14 +31,15 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/delay.h> #include <linux/blktrace_api.h> #include <linux/hash.h> #include <linux/uaccess.h> +#include <linux/pm_runtime.h> #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -46,11 +47,6 @@ static LIST_HEAD(elv_list); /* * Merge hash stuff. */ -static const int elv_hash_shift = 6; -#define ELV_HASH_BLOCK(sec) ((sec) >> 3) -#define ELV_HASH_FN(sec) \ - (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) -#define ELV_HASH_ENTRIES (1 << elv_hash_shift) #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) /* @@ -62,8 +58,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->ops->elevator_allow_merge_fn) - return e->ops->elevator_allow_merge_fn(q, rq, bio); + if (e->type->ops.elevator_allow_merge_fn) + return e->type->ops.elevator_allow_merge_fn(q, rq, bio); return 1; } @@ -71,39 +67,9 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) /* * can we safely merge with this request? 
*/ -int elv_rq_merge_ok(struct request *rq, struct bio *bio) +bool elv_rq_merge_ok(struct request *rq, struct bio *bio) { - if (!rq_mergeable(rq)) - return 0; - - /* - * Don't merge file system requests and discard requests - */ - if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) - return 0; - - /* - * Don't merge discard requests and secure discard requests - */ - if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) - return 0; - - /* - * different data direction or already started, don't merge - */ - if (bio_data_dir(bio) != rq_data_dir(rq)) - return 0; - - /* - * must be same device and not a special request - */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) - return 0; - - /* - * only merge integrity protected bio into ditto rq - */ - if (bio_integrity(bio) != blk_integrity_rq(rq)) + if (!blk_rq_merge_ok(rq, bio)) return 0; if (!elv_iosched_allow_merge(rq, bio)) @@ -113,23 +79,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_rq_merge_ok); -static inline int elv_try_merge(struct request *__rq, struct bio *bio) -{ - int ret = ELEVATOR_NO_MERGE; - - /* - * we can merge and sequence is ok, check if it's possible - */ - if (elv_rq_merge_ok(__rq, bio)) { - if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector) - ret = ELEVATOR_BACK_MERGE; - else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector) - ret = ELEVATOR_FRONT_MERGE; - } - - return ret; -} - static struct elevator_type *elevator_find(const char *name) { struct elevator_type *e; @@ -147,21 +96,16 @@ static void elevator_put(struct elevator_type *e) module_put(e->elevator_owner); } -static struct elevator_type *elevator_get(const char *name) +static struct elevator_type *elevator_get(const char *name, bool try_loading) { struct elevator_type *e; spin_lock(&elv_list_lock); e = elevator_find(name); - if (!e) { - char elv[ELV_NAME_MAX + strlen("-iosched")]; - + if (!e && try_loading) { spin_unlock(&elv_list_lock); - - 
snprintf(elv, sizeof(elv), "%s-iosched", name); - - request_module("%s", elv); + request_module("%s-iosched", name); spin_lock(&elv_list_lock); e = elevator_find(name); } @@ -174,20 +118,7 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static void *elevator_init_queue(struct request_queue *q, - struct elevator_queue *eq) -{ - return eq->ops->elevator_init_fn(q); -} - -static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, - void *data) -{ - q->elevator = eq; - eq->elevator_data = data; -} - -static char chosen_elevator[16]; +static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) { @@ -201,30 +132,37 @@ static int __init elevator_setup(char *str) __setup("elevator=", elevator_setup); +/* called during boot to load the elevator chosen by the elevator param */ +void __init load_default_elevator_module(void) +{ + struct elevator_type *e; + + if (!chosen_elevator[0]) + return; + + spin_lock(&elv_list_lock); + e = elevator_find(chosen_elevator); + spin_unlock(&elv_list_lock); + + if (!e) + request_module("%s-iosched", chosen_elevator); +} + static struct kobj_type elv_ktype; -static struct elevator_queue *elevator_alloc(struct request_queue *q, +struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) { struct elevator_queue *eq; - int i; - eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); + eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); if (unlikely(!eq)) goto err; - eq->ops = &e->ops; - eq->elevator_type = e; + eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); - - eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, - GFP_KERNEL, q->node); - if (!eq->hash) - goto err; - - for (i = 0; i < ELV_HASH_ENTRIES; i++) - INIT_HLIST_HEAD(&eq->hash[i]); + hash_init(eq->hash); return eq; err: @@ -232,22 +170,27 @@ err: elevator_put(e); return NULL; } +EXPORT_SYMBOL(elevator_alloc); static void 
elevator_release(struct kobject *kobj) { struct elevator_queue *e; e = container_of(kobj, struct elevator_queue, kobj); - elevator_put(e->elevator_type); - kfree(e->hash); + elevator_put(e->type); kfree(e); } int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; - struct elevator_queue *eq; - void *data; + int err; + + /* + * q->sysfs_lock must be held to provide mutual exclusion between + * elevator_switch() and here. + */ + lockdep_assert_held(&q->sysfs_lock); if (unlikely(q->elevator)) return 0; @@ -258,39 +201,34 @@ int elevator_init(struct request_queue *q, char *name) q->boundary_rq = NULL; if (name) { - e = elevator_get(name); + e = elevator_get(name, true); if (!e) return -EINVAL; } + /* + * Use the default elevator specified by config boot param or + * config option. Don't try to load modules as we could be running + * off async and request_module() isn't allowed from async. + */ if (!e && *chosen_elevator) { - e = elevator_get(chosen_elevator); + e = elevator_get(chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); } if (!e) { - e = elevator_get(CONFIG_DEFAULT_IOSCHED); + e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); if (!e) { printk(KERN_ERR "Default I/O scheduler not found. 
" \ "Using noop.\n"); - e = elevator_get("noop"); + e = elevator_get("noop", false); } } - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - data = elevator_init_queue(q, eq); - if (!data) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - - elevator_attach(q, eq, data); + err = e->ops.elevator_init_fn(q, e); return 0; } EXPORT_SYMBOL(elevator_init); @@ -298,9 +236,8 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->ops->elevator_exit_fn) - e->ops->elevator_exit_fn(e); - e->ops = NULL; + if (e->type->ops.elevator_exit_fn) + e->type->ops.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -309,7 +246,8 @@ EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { - hlist_del_init(&rq->hash); + hash_del(&rq->hash); + rq->cmd_flags &= ~REQ_HASHED; } static void elv_rqhash_del(struct request_queue *q, struct request *rq) @@ -323,7 +261,8 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq) struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); - hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); + hash_add(e->hash, &rq->hash, rq_hash_key(rq)); + rq->cmd_flags |= REQ_HASHED; } static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) @@ -335,11 +274,10 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; - struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; - struct hlist_node *entry, *next; + struct hlist_node *next; struct request *rq; - hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { + hash_for_each_possible_safe(e->hash, rq, next, hash, offset) { BUG_ON(!ELV_ON_HASH(rq)); if (unlikely(!rq_mergeable(rq))) { @@ -358,7 +296,7 @@ static struct request *elv_rqhash_find(struct request_queue *q, 
sector_t offset) * RB-tree support functions for inserting/lookup/removal of requests * in a sorted RB tree. */ -struct request *elv_rb_add(struct rb_root *root, struct request *rq) +void elv_rb_add(struct rb_root *root, struct request *rq) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -370,15 +308,12 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq) if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; - else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) + else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) p = &(*p)->rb_right; - else - return __rq; } rb_link_node(&rq->rb_node, parent, p); rb_insert_color(&rq->rb_node, root); - return NULL; } EXPORT_SYMBOL(elv_rb_add); @@ -493,8 +428,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) /* * First try one-hit cache. */ - if (q->last_merge) { - ret = elv_try_merge(q->last_merge, bio); + if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { + ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; @@ -507,24 +442,66 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) /* * See if our hash lookup can find a potential backmerge. */ - __rq = elv_rqhash_find(q, bio->bi_sector); + __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_BACK_MERGE; } - if (e->ops->elevator_merge_fn) - return e->ops->elevator_merge_fn(q, req, bio); + if (e->type->ops.elevator_merge_fn) + return e->type->ops.elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } +/* + * Attempt to do an insertion back merge. Only check for the case where + * we can append 'rq' to an existing request, so we can throw 'rq' away + * afterwards. 
+ * + * Returns true if we merged, false otherwise + */ +static bool elv_attempt_insert_merge(struct request_queue *q, + struct request *rq) +{ + struct request *__rq; + bool ret; + + if (blk_queue_nomerges(q)) + return false; + + /* + * First try one-hit cache. + */ + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + return true; + + if (blk_queue_noxmerges(q)) + return false; + + ret = false; + /* + * See if our hash lookup can find a potential backmerge. + */ + while (1) { + __rq = elv_rqhash_find(q, blk_rq_pos(rq)); + if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) + break; + + /* The merged request could be merged with others, try again */ + ret = true; + rq = __rq; + } + + return ret; +} + void elv_merged_request(struct request_queue *q, struct request *rq, int type) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_merged_fn) - e->ops->elevator_merged_fn(q, rq, type); + if (e->type->ops.elevator_merged_fn) + e->type->ops.elevator_merged_fn(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -536,14 +513,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; + const int next_sorted = next->cmd_flags & REQ_SORTED; - if (e->ops->elevator_merge_req_fn) - e->ops->elevator_merge_req_fn(q, rq, next); + if (next_sorted && e->type->ops.elevator_merge_req_fn) + e->type->ops.elevator_merge_req_fn(q, rq, next); elv_rqhash_reposition(q, rq); - elv_rqhash_del(q, next); - q->nr_sorted--; + if (next_sorted) { + elv_rqhash_del(q, next); + q->nr_sorted--; + } + q->last_merge = rq; } @@ -552,10 +533,31 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_bio_merged_fn) - e->ops->elevator_bio_merged_fn(q, rq, bio); + if (e->type->ops.elevator_bio_merged_fn) + e->type->ops.elevator_bio_merged_fn(q, rq, bio); } +#ifdef CONFIG_PM_RUNTIME +static void 
blk_pm_requeue_request(struct request *rq) +{ + if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) + rq->q->nr_pending--; +} + +static void blk_pm_add_request(struct request_queue *q, struct request *rq) +{ + if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 && + (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING)) + pm_request_resume(q->dev); +} +#else +static inline void blk_pm_requeue_request(struct request *rq) {} +static inline void blk_pm_add_request(struct request_queue *q, + struct request *rq) +{ +} +#endif + void elv_requeue_request(struct request_queue *q, struct request *rq) { /* @@ -570,68 +572,47 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) rq->cmd_flags &= ~REQ_STARTED; - elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); + blk_pm_requeue_request(rq); + + __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); } void elv_drain_elevator(struct request_queue *q) { static int printed; - while (q->elevator->ops->elevator_dispatch_fn(q, 1)) + + lockdep_assert_held(q->queue_lock); + + while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) ; - if (q->nr_sorted == 0) - return; - if (printed++ < 10) { + if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " "(nr_sorted=%u), please report this\n", - q->elevator->elevator_type->elevator_name, q->nr_sorted); - } -} - -/* - * Call with queue lock held, interrupts disabled - */ -void elv_quiesce_start(struct request_queue *q) -{ - if (!q->elevator) - return; - - queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); - - /* - * make sure we don't have any requests in flight - */ - elv_drain_elevator(q); - while (q->rq.elvpriv) { - __blk_run_queue(q, false); - spin_unlock_irq(q->queue_lock); - msleep(10); - spin_lock_irq(q->queue_lock); - elv_drain_elevator(q); + q->elevator->type->elevator_name, q->nr_sorted); } } -void elv_quiesce_end(struct request_queue *q) -{ - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); -} - -void elv_insert(struct request_queue 
*q, struct request *rq, int where) +void __elv_add_request(struct request_queue *q, struct request *rq, int where) { - int unplug_it = 1; - trace_block_rq_insert(q, rq); + blk_pm_add_request(q, rq); + rq->q = q; + if (rq->cmd_flags & REQ_SOFTBARRIER) { + /* barriers are scheduling boundary, update end_sector */ + if (rq->cmd_type == REQ_TYPE_FS) { + q->end_sector = rq_end_sector(rq); + q->boundary_rq = rq; + } + } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + (where == ELEVATOR_INSERT_SORT || + where == ELEVATOR_INSERT_SORT_MERGE)) + where = ELEVATOR_INSERT_BACK; + switch (where) { case ELEVATOR_INSERT_REQUEUE: - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - * Clear unplug_it and fall through. - */ - unplug_it = 0; - case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); @@ -651,12 +632,19 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * with anything. There's no point in delaying queue * processing. */ - __blk_run_queue(q, false); + __blk_run_queue(q); break; + case ELEVATOR_INSERT_SORT_MERGE: + /* + * If we succeed in merging this request with one in the + * queue already, we are done - rq has now been freed, + * so no need to do anything further. + */ + if (elv_attempt_insert_merge(q, rq)) + break; case ELEVATOR_INSERT_SORT: - BUG_ON(rq->cmd_type != REQ_TYPE_FS && - !(rq->cmd_flags & REQ_DISCARD)); + BUG_ON(rq->cmd_type != REQ_TYPE_FS); rq->cmd_flags |= REQ_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { @@ -670,76 +658,37 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * rq cannot be accessed after calling * elevator_add_req_fn. 
*/ - q->elevator->ops->elevator_add_req_fn(q, rq); + q->elevator->type->ops.elevator_add_req_fn(q, rq); break; + case ELEVATOR_INSERT_FLUSH: + rq->cmd_flags |= REQ_SOFTBARRIER; + blk_insert_flush(rq); + break; default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); BUG(); } - - if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - queue_in_flight(q); - - if (nrq >= q->unplug_thresh) - __generic_unplug_device(q); - } -} - -void __elv_add_request(struct request_queue *q, struct request *rq, int where, - int plug) -{ - if (rq->cmd_flags & REQ_SOFTBARRIER) { - /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS || - (rq->cmd_flags & REQ_DISCARD)) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && - where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - if (plug) - blk_plug_device(q); - - elv_insert(q, rq, where); } EXPORT_SYMBOL(__elv_add_request); -void elv_add_request(struct request_queue *q, struct request *rq, int where, - int plug) +void elv_add_request(struct request_queue *q, struct request *rq, int where) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __elv_add_request(q, rq, where, plug); + __elv_add_request(q, rq, where); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(elv_add_request); -int elv_queue_empty(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - - if (!list_empty(&q->queue_head)) - return 0; - - if (e->ops->elevator_queue_empty_fn) - return e->ops->elevator_queue_empty_fn(q); - - return 1; -} -EXPORT_SYMBOL(elv_queue_empty); - struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_latter_req_fn) - return e->ops->elevator_latter_req_fn(q, rq); + if (e->type->ops.elevator_latter_req_fn) + return 
e->type->ops.elevator_latter_req_fn(q, rq); return NULL; } @@ -747,19 +696,18 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_former_req_fn) - return e->ops->elevator_former_req_fn(q, rq); + if (e->type->ops.elevator_former_req_fn) + return e->type->ops.elevator_former_req_fn(q, rq); return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); - - rq->elevator_private = NULL; + if (e->type->ops.elevator_set_req_fn) + return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -767,38 +715,20 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_put_req_fn) - e->ops->elevator_put_req_fn(rq); + if (e->type->ops.elevator_put_req_fn) + e->type->ops.elevator_put_req_fn(rq); } int elv_may_queue(struct request_queue *q, int rw) { struct elevator_queue *e = q->elevator; - if (e->ops->elevator_may_queue_fn) - return e->ops->elevator_may_queue_fn(q, rw); + if (e->type->ops.elevator_may_queue_fn) + return e->type->ops.elevator_may_queue_fn(q, rw); return ELV_MQUEUE_MAY; } -void elv_abort_queue(struct request_queue *q) -{ - struct request *rq; - - while (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - rq->cmd_flags |= REQ_QUIET; - trace_block_rq_abort(q, rq); - /* - * Mark this request as started so we don't trigger - * any debug logic in the end I/O path. 
- */ - blk_start_request(rq); - __blk_end_request_all(rq, -EIO); - } -} -EXPORT_SYMBOL(elv_abort_queue); - void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; @@ -809,8 +739,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq) if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; if ((rq->cmd_flags & REQ_SORTED) && - e->ops->elevator_completed_req_fn) - e->ops->elevator_completed_req_fn(q, rq); + e->type->ops.elevator_completed_req_fn) + e->type->ops.elevator_completed_req_fn(q, rq); } } @@ -828,7 +758,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->ops ? entry->show(e, page) : -ENOENT; + error = e->type ? entry->show(e, page) : -ENOENT; mutex_unlock(&e->sysfs_lock); return error; } @@ -846,7 +776,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->ops ? entry->store(e, page, length) : -ENOENT; + error = e->type ? 
entry->store(e, page, length) : -ENOENT; mutex_unlock(&e->sysfs_lock); return error; } @@ -868,7 +798,7 @@ int elv_register_queue(struct request_queue *q) error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { - struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; + struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { while (attr->attr.name) { if (sysfs_create_file(&e->kobj, &attr->attr)) @@ -883,29 +813,48 @@ int elv_register_queue(struct request_queue *q) } EXPORT_SYMBOL(elv_register_queue); -static void __elv_unregister_queue(struct elevator_queue *e) -{ - kobject_uevent(&e->kobj, KOBJ_REMOVE); - kobject_del(&e->kobj); - e->registered = 0; -} - void elv_unregister_queue(struct request_queue *q) { - if (q) - __elv_unregister_queue(q->elevator); + if (q) { + struct elevator_queue *e = q->elevator; + + kobject_uevent(&e->kobj, KOBJ_REMOVE); + kobject_del(&e->kobj); + e->registered = 0; + } } EXPORT_SYMBOL(elv_unregister_queue); -void elv_register(struct elevator_type *e) +int elv_register(struct elevator_type *e) { char *def = ""; + /* create icq_cache if requested */ + if (e->icq_size) { + if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || + WARN_ON(e->icq_align < __alignof__(struct io_cq))) + return -EINVAL; + + snprintf(e->icq_cache_name, sizeof(e->icq_cache_name), + "%s_io_cq", e->elevator_name); + e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size, + e->icq_align, 0, NULL); + if (!e->icq_cache) + return -ENOMEM; + } + + /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - BUG_ON(elevator_find(e->elevator_name)); + if (elevator_find(e->elevator_name)) { + spin_unlock(&elv_list_lock); + if (e->icq_cache) + kmem_cache_destroy(e->icq_cache); + return -EBUSY; + } list_add_tail(&e->list, &elv_list); spin_unlock(&elv_list_lock); + /* print pretty message */ if (!strcmp(e->elevator_name, chosen_elevator) || (!*chosen_elevator && !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) @@ -913,30 +862,26 
@@ void elv_register(struct elevator_type *e) printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); + return 0; } EXPORT_SYMBOL_GPL(elv_register); void elv_unregister(struct elevator_type *e) { - struct task_struct *g, *p; + /* unregister */ + spin_lock(&elv_list_lock); + list_del_init(&e->list); + spin_unlock(&elv_list_lock); /* - * Iterate every thread in the process to remove the io contexts. + * Destroy icq_cache if it exists. icq's are RCU managed. Make + * sure all RCU operations are complete before proceeding. */ - if (e->ops.trim) { - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - if (p->io_context) - e->ops.trim(p->io_context); - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (e->icq_cache) { + rcu_barrier(); + kmem_cache_destroy(e->icq_cache); + e->icq_cache = NULL; } - - spin_lock(&elv_list_lock); - list_del_init(&e->list); - spin_unlock(&elv_list_lock); } EXPORT_SYMBOL_GPL(elv_unregister); @@ -948,73 +893,53 @@ EXPORT_SYMBOL_GPL(elv_unregister); */ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old_elevator, *e; - void *data; + struct elevator_queue *old = q->elevator; + bool registered = old->registered; int err; /* - * Allocate new elevator + * Turn on BYPASS and drain all requests w/ elevator private data. + * Block layer doesn't call into a quiesced elevator - all requests + * are directly put on the dispatch list without elevator data + * using INSERT_BACK. All requests have SOFTBARRIER set and no + * merge happens either. 
*/ - e = elevator_alloc(q, new_e); - if (!e) - return -ENOMEM; + blk_queue_bypass_start(q); - data = elevator_init_queue(q, e); - if (!data) { - kobject_put(&e->kobj); - return -ENOMEM; - } + /* unregister and clear all auxiliary data of the old elevator */ + if (registered) + elv_unregister_queue(q); - /* - * Turn on BYPASS and drain all requests w/ elevator private data - */ spin_lock_irq(q->queue_lock); - elv_quiesce_start(q); - - /* - * Remember old elevator. - */ - old_elevator = q->elevator; - - /* - * attach and start new elevator - */ - elevator_attach(q, e, data); - + ioc_clear_queue(q); spin_unlock_irq(q->queue_lock); - if (old_elevator->registered) { - __elv_unregister_queue(old_elevator); + /* allocate, init and register new elevator */ + err = new_e->ops.elevator_init_fn(q, new_e); + if (err) + goto fail_init; + if (registered) { err = elv_register_queue(q); if (err) goto fail_register; } - /* - * finally exit old elevator and turn off BYPASS. - */ - elevator_exit(old_elevator); - spin_lock_irq(q->queue_lock); - elv_quiesce_end(q); - spin_unlock_irq(q->queue_lock); + /* done, kill the old one and finish */ + elevator_exit(old); + blk_queue_bypass_end(q); - blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); return 0; fail_register: - /* - * switch failed, exit the new io scheduler and reattach the old - * one again (along with re-adding the sysfs dir) - */ - elevator_exit(e); - q->elevator = old_elevator; + elevator_exit(q->elevator); +fail_init: + /* switch failed, restore and re-register old elevator */ + q->elevator = old; elv_register_queue(q); - - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); + blk_queue_bypass_end(q); return err; } @@ -1022,7 +947,7 @@ fail_register: /* * Switch this queue to the given IO scheduler. 
*/ -int elevator_change(struct request_queue *q, const char *name) +static int __elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; @@ -1031,19 +956,31 @@ int elevator_change(struct request_queue *q, const char *name) return -ENXIO; strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(strstrip(elevator_name)); + e = elevator_get(strstrip(elevator_name), true); if (!e) { printk(KERN_ERR "elevator: type %s not found\n", elevator_name); return -EINVAL; } - if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { + if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { elevator_put(e); return 0; } return elevator_switch(q, e); } + +int elevator_change(struct request_queue *q, const char *name) +{ + int ret; + + /* Protect q->elevator from elevator_init() */ + mutex_lock(&q->sysfs_lock); + ret = __elevator_change(q, name); + mutex_unlock(&q->sysfs_lock); + + return ret; +} EXPORT_SYMBOL(elevator_change); ssize_t elv_iosched_store(struct request_queue *q, const char *name, @@ -1054,7 +991,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!q->elevator) return count; - ret = elevator_change(q, name); + ret = __elevator_change(q, name); if (!ret) return count; @@ -1072,7 +1009,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) if (!q->elevator || !blk_queue_stackable(q)) return sprintf(name, "none\n"); - elv = e->elevator_type; + elv = e->type; spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { diff --git a/block/genhd.c b/block/genhd.c index cbf1112a885..791f4194313 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -15,10 +15,10 @@ #include <linux/slab.h> #include <linux/kmod.h> #include <linux/kobj_map.h> -#include <linux/buffer_head.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> +#include <linux/pm_runtime.h> #include "blk.h" @@ -26,7 +26,7 @@ static 
DEFINE_MUTEX(block_class_lock); struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ -#define MAX_EXT_DEVT (1 << MINORBITS) +#define NR_EXT_DEVT (1 << MINORBITS) /* For extended devt allocation. ext_devt_mutex prevents look up * results from going away underneath its user. @@ -36,6 +36,9 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr); +static void disk_alloc_events(struct gendisk *disk); static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); @@ -154,7 +157,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part->nr_sects && + if (!part_nr_sects_read(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -191,7 +194,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part->nr_sects; + sector < part->start_sect + part_nr_sects_read(part); } /** @@ -408,7 +411,7 @@ static int blk_mangle_minor(int minor) int blk_alloc_devt(struct hd_struct *part, dev_t *devt) { struct gendisk *disk = part_to_disk(part); - int idx, rc; + int idx; /* in consecutive minor range? 
*/ if (part->partno < disk->minors) { @@ -417,19 +420,11 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) } /* allocate ext devt */ - do { - if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) - return -ENOMEM; - rc = idr_get_new(&ext_devt_idr, part, &idx); - } while (rc == -EAGAIN); - - if (rc) - return rc; - - if (idx > MAX_EXT_DEVT) { - idr_remove(&ext_devt_idr, idx); - return -EBUSY; - } + mutex_lock(&ext_devt_mutex); + idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL); + mutex_unlock(&ext_devt_mutex); + if (idx < 0) + return idx == -ENOSPC ? -EBUSY : idx; *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); return 0; @@ -507,7 +502,7 @@ static int exact_lock(dev_t devt, void *data) return 0; } -void register_disk(struct gendisk *disk) +static void register_disk(struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); struct block_device *bdev; @@ -517,7 +512,7 @@ void register_disk(struct gendisk *disk) ddev->parent = disk->driverfs_dev; - dev_set_name(ddev, disk->disk_name); + dev_set_name(ddev, "%s", disk->disk_name); /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -532,11 +527,19 @@ void register_disk(struct gendisk *disk) return; } } + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ - if (!disk_partitionable(disk)) + if (!disk_part_scan_enabled(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -602,7 +605,9 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); - /* Register BDI before referencing it from bdev */ + disk_alloc_events(disk); + + /* Register BDI before referencing it from bdev */ bdi = 
&disk->queue->backing_dev_info; bdi_register_dev(bdi, disk_devt(disk)); @@ -611,6 +616,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(!blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -636,7 +647,6 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - blk_free_devt(disk_to_dev(disk)->devt); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; @@ -653,7 +663,9 @@ void del_gendisk(struct gendisk *disk) disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); + blk_free_devt(disk_to_dev(disk)->devt); } EXPORT_SYMBOL(del_gendisk); @@ -735,11 +747,10 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; - u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; /* * Don't show empty devices or things that have been - * surpressed + * suppressed */ if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) @@ -754,14 +765,11 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - uuid[0] = 0; - if (part->info) - part_unpack_uuid(part->info->uuid, uuid); - printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf), uuid); + (unsigned long long)part_nr_sects_read(part) >> 1 + , disk_name(disk, part->partno, name_buf), + part->info ? 
part->info->uuid : ""); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) @@ -825,7 +833,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { - static void *p; + void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) @@ -841,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) @@ -852,7 +860,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part->nr_sects >> 1, + (unsigned long long)part_nr_sects_read(part) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1018,14 +1026,6 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; -static void disk_free_ptbl_rcu_cb(struct rcu_head *head) -{ - struct disk_part_tbl *ptbl = - container_of(head, struct disk_part_tbl, rcu_head); - - kfree(ptbl); -} - /** * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way * @disk: disk to replace part_tbl for @@ -1046,7 +1046,7 @@ static void disk_replace_part_tbl(struct gendisk *disk, if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); - call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); + kfree_rcu(old_ptbl, rcu_head); } } @@ -1103,13 +1103,16 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { .name = "block", }; -static 
char *block_devnode(struct device *dev, mode_t *mode) +static char *block_devnode(struct device *dev, umode_t *mode, + kuid_t *uid, kgid_t *gid) { struct gendisk *disk = dev_to_disk(dev); @@ -1148,31 +1151,31 @@ static int diskstats_show(struct seq_file *seqf, void *v) "wsect wuse running use aveq" "\n\n"); */ - + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); part_round_stats(cpu, hd); part_stat_unlock(); - seq_printf(seqf, "%4d %7d %s %lu %lu %llu " - "%u %lu %lu %llu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s %lu %lu %lu " + "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[0]), - part_stat_read(hd, merges[0]), - (unsigned long long)part_stat_read(hd, sectors[0]), - jiffies_to_msecs(part_stat_read(hd, ticks[0])), - part_stat_read(hd, ios[1]), - part_stat_read(hd, merges[1]), - (unsigned long long)part_stat_read(hd, sectors[1]), - jiffies_to_msecs(part_stat_read(hd, ticks[1])), + part_stat_read(hd, ios[READ]), + part_stat_read(hd, merges[READ]), + part_stat_read(hd, sectors[READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[READ])), + part_stat_read(hd, ios[WRITE]), + part_stat_read(hd, merges[WRITE]), + part_stat_read(hd, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); } disk_part_iter_exit(&piter); - + return 0; } @@ -1241,7 +1244,7 @@ EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *alloc_disk(int minors) { - return alloc_disk_node(minors, -1); + return alloc_disk_node(minors, NUMA_NO_NODE); } EXPORT_SYMBOL(alloc_disk); @@ -1249,8 +1252,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - disk = kmalloc_node(sizeof(struct gendisk), - GFP_KERNEL | __GFP_ZERO, node_id); + disk = kzalloc_node(sizeof(struct gendisk), 
GFP_KERNEL, node_id); if (disk) { if (!init_part_stats(&disk->part0)) { kfree(disk); @@ -1264,6 +1266,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } disk->part_tbl->part[0] = &disk->part0; + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + seqcount_init(&disk->part0.nr_sects_seq); hd_ref_init(&disk->part0); disk->minors = minors; @@ -1371,6 +1383,7 @@ struct disk_events { struct gendisk *disk; /* the associated disk */ spinlock_t lock; + struct mutex block_mutex; /* protects blocking */ int block; /* event blocking depth */ unsigned int pending; /* events already sent out */ unsigned int clearing; /* events being cleared */ @@ -1414,22 +1427,44 @@ static unsigned long disk_events_poll_jiffies(struct gendisk *disk) return msecs_to_jiffies(intv_msecs); } -static void __disk_block_events(struct gendisk *disk, bool sync) +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; + if (!ev) + return; + + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. 
+ */ + mutex_lock(&ev->block_mutex); + spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); - if (cancel) { - if (sync) - cancel_delayed_work_sync(&disk->ev->dwork); - else - cancel_delayed_work(&disk->ev->dwork); - } + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) @@ -1453,35 +1488,16 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) intv = disk_events_poll_jiffies(disk); set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); else if (intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } /** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for - * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. - * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. 
- */ -void disk_block_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_block_events(disk, true); -} - -/** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for * @@ -1494,26 +1510,35 @@ void disk_block_events(struct gendisk *disk) void disk_unblock_events(struct gendisk *disk) { if (disk->ev) - __disk_unblock_events(disk, true); + __disk_unblock_events(disk, false); } /** - * disk_check_events - schedule immediate event checking - * @disk: disk to check events for + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush * - * Schedule immediate event checking on @disk if not blocked. + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. * * CONTEXT: - * Don't care. Safe to call from irq context. + * If @mask is non-zero must be called with bdev->bd_mutex held. 
*/ -void disk_check_events(struct gendisk *disk) +void disk_flush_events(struct gendisk *disk, unsigned int mask) { - if (disk->ev) { - __disk_block_events(disk, false); - __disk_unblock_events(disk, true); - } + struct disk_events *ev = disk->ev; + + if (!ev) + return; + + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + if (!ev->block) + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + spin_unlock_irq(&ev->lock); } -EXPORT_SYMBOL_GPL(disk_check_events); /** * disk_clear_events - synchronously check, clear and return pending events @@ -1531,6 +1556,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) const struct block_device_operations *bdops = disk->fops; struct disk_events *ev = disk->ev; unsigned int pending; + unsigned int clearing = mask; if (!ev) { /* for drivers still using the old ->media_changed method */ @@ -1540,34 +1566,53 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) return 0; } - /* tell the workfn about the events being cleared */ + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ spin_lock_irq(&ev->lock); - ev->clearing |= mask; + clearing |= ev->clearing; + ev->clearing = 0; spin_unlock_irq(&ev->lock); - /* uncondtionally schedule event check and wait for it to finish */ - __disk_block_events(disk, true); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); - flush_delayed_work(&ev->dwork); - __disk_unblock_events(disk, false); + disk_check_events(ev, &clearing); + /* + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. + */ + __disk_unblock_events(disk, ev->clearing ? 
true : false); /* then, fetch and clear pending events */ spin_lock_irq(&ev->lock); - WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ pending = ev->pending & mask; ev->pending &= ~mask; spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); return pending; } +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ static void disk_events_workfn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ struct gendisk *disk = ev->disk; char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; - unsigned int clearing = ev->clearing; + unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; int nr_events = 0, i; @@ -1580,17 +1625,22 @@ static void disk_events_workfn(struct work_struct *work) events &= ~ev->pending; ev->pending |= events; - ev->clearing &= ~clearing; + *clearing_ptr &= ~clearing; intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); spin_unlock_irq(&ev->lock); - /* tell userland about new events */ + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported. Unlisted events are processed the + * same internally but never get reported to userland. 
+ */ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if (events & (1 << i)) + if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) @@ -1660,7 +1710,7 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev, if (intv < 0 && intv != -1) return -EINVAL; - __disk_block_events(disk, true); + disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); @@ -1699,7 +1749,7 @@ static int disk_events_set_dfl_poll_msecs(const char *val, mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) - disk_check_events(ev->disk); + disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); @@ -1718,13 +1768,13 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* - * disk_{add|del|release}_events - initialize and destroy disk_events. + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ -static void disk_add_events(struct gendisk *disk) +static void disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; - if (!disk->fops->check_events || !(disk->events | disk->async_events)) + if (!disk->fops->check_events) return; ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -1733,25 +1783,29 @@ static void disk_add_events(struct gendisk *disk) return; } - if (sysfs_create_files(&disk_to_dev(disk)->kobj, - disk_events_attrs) < 0) { - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - kfree(ev); - return; - } - - disk->ev = ev; - INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + disk->ev = ev; +} + +static void disk_add_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for 
events\n", + disk->disk_name); + mutex_lock(&disk_events_mutex); - list_add_tail(&ev->node, &disk_events); + list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* @@ -1766,7 +1820,7 @@ static void disk_del_events(struct gendisk *disk) if (!disk->ev) return; - __disk_block_events(disk, true); + disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); diff --git a/block/ioctl.c b/block/ioctl.c index 1124cd29726..7d5c3b20af4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,10 +1,11 @@ #include <linux/capability.h> #include <linux/blkdev.h> +#include <linux/export.h> #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> #include <linux/backing-dev.h> -#include <linux/buffer_head.h> +#include <linux/fs.h> #include <linux/blktrace_api.h> #include <asm/uaccess.h> @@ -12,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user { struct block_device *bdevp; struct gendisk *disk; - struct hd_struct *part; + struct hd_struct *part, *lpart; struct blkpg_ioctl_arg a; struct blkpg_partition p; struct disk_part_iter piter; @@ -35,12 +36,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user case BLKPG_ADD_PARTITION: start = p.start >> 9; length = p.length >> 9; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) == sizeof(long) && + /* check for fit in a hd_struct */ + if (sizeof(sector_t) == sizeof(long) && sizeof(long long) > sizeof(long)) { long pstart = start, plength = length; if (pstart != start || plength != length - || pstart < 0 || plength < 0) + || pstart < 0 || plength < 0 || partno > 65535) return -EINVAL; } @@ -63,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); - return IS_ERR(part) ? 
PTR_ERR(part) : 0; + return PTR_ERR_OR_ZERO(part); case BLKPG_DEL_PARTITION: part = disk_get_part(disk, partno); if (!part) @@ -91,6 +92,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user bdput(bdevp); return 0; + case BLKPG_RESIZE_PARTITION: + start = p.start >> 9; + /* new length of partition in bytes */ + length = p.length >> 9; + /* check for fit in a hd_struct */ + if (sizeof(sector_t) == sizeof(long) && + sizeof(long long) > sizeof(long)) { + long pstart = start, plength = length; + if (pstart != start || plength != length + || pstart < 0 || plength < 0) + return -EINVAL; + } + part = disk_get_part(disk, partno); + if (!part) + return -ENXIO; + bdevp = bdget(part_devt(part)); + if (!bdevp) { + disk_put_part(part); + return -ENOMEM; + } + mutex_lock(&bdevp->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, 1); + if (start != part->start_sect) { + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); + return -EINVAL; + } + /* overlap? 
*/ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY); + while ((lpart = disk_part_iter_next(&piter))) { + if (lpart->partno != partno && + !(start + length <= lpart->start_sect || + start >= lpart->start_sect + lpart->nr_sects) + ) { + disk_part_iter_exit(&piter); + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); + return -EBUSY; + } + } + disk_part_iter_exit(&piter); + part_nr_sects_write(part, (sector_t)length); + i_size_write(bdevp->bd_inode, p.length); + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); + return 0; default: return -EINVAL; } @@ -101,7 +155,7 @@ static int blkdev_reread_part(struct block_device *bdev) struct gendisk *disk = bdev->bd_disk; int res; - if (!disk_partitionable(disk) || bdev != bdev->bd_contains) + if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -131,6 +185,22 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } +static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, + uint64_t len) +{ + if (start & 511) + return -EINVAL; + if (len & 511) + return -EINVAL; + start >>= 9; + len >>= 9; + + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + return -EINVAL; + + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); +} + static int put_ushort(unsigned long arg, unsigned short val) { return put_user(val, (unsigned short __user *)arg); @@ -179,6 +249,26 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); /* + * Is it an unrecognized ioctl? The correct returns are either + * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a + * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl + * code before returning. 
+ * + * Confused drivers sometimes return EINVAL, which is wrong. It + * means "I understood the ioctl command, but the parameters to + * it were wrong". + * + * We should aim to just fix the broken drivers, the EINVAL case + * should go away. + */ +static inline int is_unrecognized_ioctl(int ret) +{ + return ret == -EINVAL || + ret == -ENOTTY || + ret == -ENOIOCTLCMD; +} + +/* * always keep this in sync with compat_blkdev_ioctl() */ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, @@ -195,8 +285,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EACCES; ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) + if (!is_unrecognized_ioctl(ret)) return ret; fsync_bdev(bdev); @@ -205,8 +294,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKROSET: ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) + if (!is_unrecognized_ioctl(ret)) return ret; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -228,6 +316,17 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return blk_ioctl_discard(bdev, range[0], range[1], cmd == BLKSECDISCARD); } + case BLKZEROOUT: { + uint64_t range[2]; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + return blk_ioctl_zeroout(bdev, range[0], range[1]); + } case HDIO_GETGEO: { struct hd_geometry geo; @@ -277,6 +376,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKSECTGET: return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); + case BLKROTATIONAL: + return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git 
a/block/ioprio.c b/block/ioprio.c new file mode 100644 index 00000000000..e50170ca7c3 --- /dev/null +++ b/block/ioprio.c @@ -0,0 +1,241 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. + * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/ioprio.h> +#include <linux/blkdev.h> +#include <linux/capability.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/pid_namespace.h> + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; + struct pid *pgrp; + kuid_t uid; + int ret; 
+ + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* fall through, rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + break; + case IOPRIO_CLASS_NONE: + if (data) + return -EINVAL; + break; + default: + return -EINVAL; + } + + ret = -ESRCH; + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!uid_valid(uid)) + break; + if (!who) + user = current_user(); + else + user = find_user(uid); + + if (!user) + break; + + do_each_thread(g, p) { + if (!uid_eq(task_uid(p), uid)) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + goto free_uid; + } while_each_thread(g, p); +free_uid: + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} + +/* + * Return @p's I/O priority, or the non-zero result of + * security_task_getioprio() if the hook rejects the query. A task with no + * io_context reports IOPRIO_CLASS_NONE / IOPRIO_NORM. + */ +static int get_task_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + /* + * Hold task_lock() so p->io_context cannot be detached and freed + * between the NULL check and the read of ->ioprio. + */ + task_lock(p); + if (p->io_context) + ret = p->io_context->ioprio; + task_unlock(p); +out: + return ret; +} + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); + unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + + if (aclass == IOPRIO_CLASS_NONE) + aclass = IOPRIO_CLASS_BE; + if (bclass == IOPRIO_CLASS_NONE) + bclass = IOPRIO_CLASS_BE; + + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + 
+SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + struct pid *pgrp; + kuid_t uid; + int ret = -ESRCH; + int tmpio; + + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = get_task_ioprio(p); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!who) + user = current_user(); + else + user = find_user(uid); + + if (!user) + break; + + do_each_thread(g, p) { + if (!uid_eq(task_uid(p), user->uid)) + continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 232c4b38cd3..3de89d4690f 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq) list_add_tail(&rq->queuelist, &nd->queue); } -static int noop_queue_empty(struct request_queue *q) -{ - struct noop_data *nd = q->elevator->elevator_data; - - return list_empty(&nd->queue); -} - static struct request * noop_former_request(struct request_queue *q, struct request *rq) { @@ -66,15 +59,28 @@ noop_latter_request(struct request_queue *q, struct request *rq) return list_entry(rq->queuelist.next, struct request, queuelist); } -static void *noop_init_queue(struct request_queue *q) +static int noop_init_queue(struct request_queue *q, 
struct elevator_type *e) { struct noop_data *nd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); - if (!nd) - return NULL; + if (!nd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = nd; + INIT_LIST_HEAD(&nd->queue); - return nd; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + return 0; } static void noop_exit_queue(struct elevator_queue *e) @@ -90,7 +96,6 @@ static struct elevator_type elevator_noop = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, - .elevator_queue_empty_fn = noop_queue_empty, .elevator_former_req_fn = noop_former_request, .elevator_latter_req_fn = noop_latter_request, .elevator_init_fn = noop_init_queue, @@ -102,9 +107,7 @@ static struct elevator_type elevator_noop = { static int __init noop_init(void) { - elv_register(&elevator_noop); - - return 0; + return elv_register(&elevator_noop); } static void __exit noop_exit(void) diff --git a/block/partition-generic.c b/block/partition-generic.c new file mode 100644 index 00000000000..789cdea0589 --- /dev/null +++ b/block/partition-generic.c @@ -0,0 +1,571 @@ +/* + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + * + * We now have independent partition support from the + * block drivers, which allows all the partition code to + * be grouped in one location, and it to be mostly self + * contained. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/ctype.h> +#include <linux/genhd.h> +#include <linux/blktrace_api.h> + +#include "partitions/check.h" + +#ifdef CONFIG_BLK_DEV_MD +extern void md_autodetect_dev(dev_t dev); +#endif + +/* + * disk_name() is used by partition check code and the genhd driver. + * It formats the devicename of the indicated disk into + * the supplied buffer (of size at least 32), and returns + * a pointer to that same buffer (for convenience). + */ + +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); +} + +EXPORT_SYMBOL(bdevname); + +/* + * There's very little reason to use this, you should really + * have a struct block_device just about everywhere and use + * bdevname() instead. 
+ */ +const char *__bdevname(dev_t dev, char *buffer) +{ + scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(dev), MINOR(dev)); + return buffer; +} + +EXPORT_SYMBOL(__bdevname); + +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + +static ssize_t part_start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); +} + +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); +} + +static ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 1 : 0); +} + +static ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + +static ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%u\n", p->discard_alignment); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + int cpu; + + cpu = part_stat_lock(); + part_round_stats(cpu, p); + part_stat_unlock(); + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u" + "\n", + part_stat_read(p, ios[READ]), + part_stat_read(p, merges[READ]), + (unsigned long long)part_stat_read(p, sectors[READ]), + jiffies_to_msecs(part_stat_read(p, ticks[READ])), + part_stat_read(p, ios[WRITE]), + part_stat_read(p, 
merges[WRITE]), + (unsigned long long)part_stat_read(p, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), + part_in_flight(p), + jiffies_to_msecs(part_stat_read(p, io_ticks)), + jiffies_to_msecs(part_stat_read(p, time_in_queue))); +} + +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), + atomic_read(&p->in_flight[1])); +} + +#ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} +#endif + +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); +static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, + NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); +#endif + +static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_ro.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, + &dev_attr_stat.attr, + &dev_attr_inflight.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + 
&dev_attr_fail.attr, +#endif + NULL +}; + +static struct attribute_group part_attr_group = { + .attrs = part_attrs, +}; + +static const struct attribute_group *part_attr_groups[] = { + &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif + NULL +}; + +static void part_release(struct device *dev) +{ + struct hd_struct *p = dev_to_part(dev); + free_part_stats(p); + free_part_info(p); + kfree(p); +} + +struct device_type part_type = { + .name = "partition", + .groups = part_attr_groups, + .release = part_release, +}; + +static void delete_partition_rcu_cb(struct rcu_head *head) +{ + struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); +} + +void __delete_partition(struct hd_struct *part) +{ + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + +void delete_partition(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = disk->part_tbl; + struct hd_struct *part; + + if (partno >= ptbl->len) + return; + + part = ptbl->part[partno]; + if (!part) + return; + + rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + blk_free_devt(part_devt(part)); + + hd_struct_put(part); +} + +static ssize_t whole_disk_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} +static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, + whole_disk_show, NULL); + +struct hd_struct *add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) +{ + struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; + int err; + + err = disk_expand_part_tbl(disk, partno); + if (err) + return ERR_PTR(err); + ptbl = 
disk->part_tbl; + + if (ptbl->part[partno]) + return ERR_PTR(-EBUSY); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-EBUSY); + + if (!init_part_stats(p)) { + err = -ENOMEM; + goto out_free; + } + + seqcount_init(&p->nr_sects_seq); + pdev = part_to_dev(p); + + p->start_sect = start; + p->alignment_offset = + queue_limit_alignment_offset(&disk->queue->limits, start); + p->discard_alignment = + queue_limit_discard_alignment(&disk->queue->limits, start); + p->nr_sects = len; + p->partno = partno; + p->policy = get_disk_ro(disk); + + if (info) { + struct partition_meta_info *pinfo = alloc_part_info(disk); + if (!pinfo) { + /* + * err is still 0 at this point; without setting it the + * function would return ERR_PTR(0), i.e. NULL, which + * callers testing IS_ERR() treat as a valid partition. + */ + err = -ENOMEM; + goto out_free_stats; + } + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + dev_set_name(pdev, "%sp%d", dname, partno); + else + dev_set_name(pdev, "%s%d", dname, partno); + + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; + + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free_info; + pdev->devt = devt; + + /* delay uevent until 'holders' subdir is created */ + dev_set_uevent_suppress(pdev, 1); + err = device_add(pdev); + if (err) + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + dev_set_uevent_suppress(pdev, 0); + if (flags & ADDPART_FLAG_WHOLEDISK) { + err = device_create_file(pdev, &dev_attr_whole_disk); + if (err) + goto out_del; + } + + /* everything is up and running, commence */ + rcu_assign_pointer(ptbl->part[partno], p); + + /* suppress uevent if the disk suppresses it */ + if (!dev_get_uevent_suppress(ddev)) + kobject_uevent(&pdev->kobj, KOBJ_ADD); + + hd_ref_init(p); + return p; + +out_free_info: + free_part_info(p); +out_free_stats: + free_part_stats(p); +out_free: + kfree(p); + return ERR_PTR(err); +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); 
+ blk_free_devt(devt); + return ERR_PTR(err); +} + +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct disk_part_iter piter; + struct hd_struct *part; + int res; + + if (bdev->bd_part_count) + return -EBUSY; + res = invalidate_partition(disk, 0); + if (res) + return res; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + free_partitions(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + + if (disk->fops->revalidate_disk) + disk->fops->revalidate_disk(disk); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) + return 0; + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. 
+ */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } + return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * successfully read as we could be missing some partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* Detect the highest partition number and preallocate + * disk->part_tbl. This is an optimization and not strictly + * necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + + disk_expand_part_tbl(disk, highest); + + /* add partitions */ + for (p = 1; p < state->limit; p++) { + sector_t size, from; + struct partition_meta_info *info = NULL; + + size = state->parts[p].size; + if (!size) + continue; + + from = state->parts[p].from; + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; + continue; + } + + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); + + if (disk_unlock_native_capacity(disk)) { + /* free state and restart */ + goto rescan; + } else { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but + * we limit them to the end of the disk to avoid + * creating invalid block devices + */ + size = get_capacity(disk) - from; + } + } + + if (state->parts[p].has_info) + info = &state->parts[p].info; + part = 
add_partition(disk, p, from, size, + state->parts[p].flags, + info); /* NULL unless the parser set has_info */ + if (IS_ERR(part)) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + continue; + } +#ifdef CONFIG_BLK_DEV_MD + if (state->parts[p].flags & ADDPART_FLAG_RAID) + md_autodetect_dev(part_to_dev(part)->devt); +#endif + } + free_partitions(state); + return 0; +} + +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); + if (!IS_ERR(page)) { + if (PageError(page)) + goto fail; + p->v = page; + return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); +fail: + page_cache_release(page); + } + p->v = NULL; + return NULL; +} + +EXPORT_SYMBOL(read_dev_sector); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig new file mode 100644 index 00000000000..9b29a996c31 --- /dev/null +++ b/block/partitions/Kconfig @@ -0,0 +1,269 @@ +# +# Partition configuration +# +config PARTITION_ADVANCED + bool "Advanced partition selection" + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under an operating system running on a different + architecture than your Linux system. 
+ + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about foreign partitioning schemes. + + If unsure, say N. + +config ACORN_PARTITION + bool "Acorn partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + help + Support hard disks partitioned under Acorn operating systems. + +config ACORN_PARTITION_CUMANA + bool "Cumana partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the Cumana interface on Acorn machines. + +config ACORN_PARTITION_EESOX + bool "EESOX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + +config ACORN_PARTITION_ICS + bool "ICS partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the ICS interface on Acorn machines. + +config ACORN_PARTITION_ADFS + bool "Native filecore partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say + `Y' here, Linux will support disk partitions created under ADFS. + +config ACORN_PARTITION_POWERTEC + bool "PowerTec partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Support reading partition tables created on Acorn machines using + the PowerTec SCSI drive. 
+ +config ACORN_PARTITION_RISCIX + bool "RISCiX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Once upon a time, there was a native Unix port for the Acorn series + of machines called RISCiX. If you say 'Y' here, Linux will be able + to read disks partitioned under RISCiX. + +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. + +config OSF_PARTITION + bool "Alpha OSF partition support" if PARTITION_ADVANCED + default y if ALPHA + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on an Alpha machine. + +config AMIGA_PARTITION + bool "Amiga partition table support" if PARTITION_ADVANCED + default y if (AMIGA || AFFS_FS=y) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under AmigaOS. + +config ATARI_PARTITION + bool "Atari partition table support" if PARTITION_ADVANCED + default y if ATARI + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under the Atari OS. + +config IBM_PARTITION + bool "IBM disk label and partition support" + depends on PARTITION_ADVANCED && S390 + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM DASD disks operating under CMS. + Otherwise, say N. + +config MAC_PARTITION + bool "Macintosh partition map support" if PARTITION_ADVANCED + default y if (MAC || PPC_PMAC) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on a Macintosh. 
+ +config MSDOS_PARTITION + bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED + default y + help + Say Y here. + +config BSD_DISKLABEL + bool "BSD disklabel (FreeBSD partition tables) support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + FreeBSD uses its own hard disk partition scheme on your PC. It + requires only one entry in the primary partition table of your disk + and manages it similarly to DOS extended partitions, putting in its + first sector a new partition table in BSD disklabel format. Saying Y + here allows you to read these disklabels and further mount FreeBSD + partitions from within Linux if you have also said Y to "UFS + file system support", above. If you don't know what all this is + about, say N. + +config MINIX_SUBPARTITION + bool "Minix subpartition support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Minix 2.0.0/2.0.2 subpartition table support for Linux. + Say Y here if you want to mount and use Minix 2.0.0/2.0.2 + subpartitions. + +config SOLARIS_X86_PARTITION + bool "Solaris (x86) partition table support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Like most systems, Solaris x86 uses its own hard disk partition + table format, incompatible with all others. Saying Y here allows you + to read these partition tables and further mount Solaris x86 + partitions from within Linux if you have also said Y to "UFS + file system support", above. + +config UNIXWARE_DISKLABEL + bool "Unixware slices support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + ---help--- + Like some systems, UnixWare uses its own slice table inside a + partition (VTOC - Virtual Table of Contents). Its format is + incompatible with all other OSes. Saying Y here allows you to read + VTOC and further mount UnixWare partitions read-only from within + Linux if you have also said Y to "UFS file system support" or + "System V and Coherent file system support", above. 
+ + This is mainly used to carry data from a UnixWare box to your + Linux box via a removable medium like magneto-optical, ZIP or + removable IDE drives. Note, however, that a good portable way to + transport files and directories between unixes (and even other + operating systems) is given by the tar program ("man tar" or + preferably "info tar"). + + If you don't know what all this is about, say N. + +config LDM_PARTITION + bool "Windows Logical Disk Manager (Dynamic Disk) support" + depends on PARTITION_ADVANCED + ---help--- + Say Y here if you would like to use hard disks under Linux which + were partitioned using Windows 2000's/XP's or Vista's Logical Disk + Manager. They are also known as "Dynamic Disks". + + Note this driver only supports Dynamic Disks with a protective MBR + label, i.e. DOS partition table. It does not support GPT labelled + Dynamic Disks yet as can be created with Vista. + + Windows 2000 introduced the concept of Dynamic Disks to get around + the limitations of the PC's partitioning scheme. The Logical Disk + Manager allows the user to repartition a disk and create spanned, + mirrored, striped or RAID volumes, all without the need for + rebooting. + + Normal partitions are now called Basic Disks under Windows 2000, XP, + and Vista. + + For a fuller description read <file:Documentation/ldm.txt>. + + If unsure, say N. + +config LDM_DEBUG + bool "Windows LDM extra logging" + depends on LDM_PARTITION + help + Say Y here if you would like LDM to log verbosely. This could be + helpful if the driver doesn't work as expected and you'd like to + report a bug. + + If unsure, say N. + +config SGI_PARTITION + bool "SGI partition support" if PARTITION_ADVANCED + default y if DEFAULT_SGI_PARTITION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by SGI machines. 
+ +config ULTRIX_PARTITION + bool "Ultrix partition table support" if PARTITION_ADVANCED + default y if MACH_DECSTATION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by DEC (now Compaq) Ultrix machines. + Otherwise, say N. + +config SUN_PARTITION + bool "Sun partition tables support" if PARTITION_ADVANCED + default y if (SPARC || SUN3 || SUN3X) + ---help--- + Like most systems, SunOS uses its own hard disk partition table + format, incompatible with all others. Saying Y here allows you to + read these partition tables and further mount SunOS partitions from + within Linux if you have also said Y to "UFS file system support", + above. This is mainly used to carry data from a SPARC under SunOS to + your Linux box via a removable medium like magneto-optical or ZIP + drives; note however that a good portable way to transport files and + directories between unixes (and even other operating systems) is + given by the tar program ("man tar" or preferably "info tar"). If + you don't know what all this is about, say N. + +config KARMA_PARTITION + bool "Karma Partition support" + depends on PARTITION_ADVANCED + help + Say Y here if you would like to mount the Rio Karma MP3 player, as it + uses a proprietary partition table. + +config EFI_PARTITION + bool "EFI GUID Partition support" if PARTITION_ADVANCED + default y + select CRC32 + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using EFI GPT. + +config SYSV68_PARTITION + bool "SYSV68 partition table support" if PARTITION_ADVANCED + default y if VME + help + Say Y here if you would like to be able to read the hard disk + partition table format used by Motorola Delta machines (using + sysv68). + Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select BLK_CMDLINE_PARSER + help + Say Y here if you want to read the partition table from bootargs. 
+ The format for the command line is just like mtdparts. diff --git a/block/partitions/Makefile b/block/partitions/Makefile new file mode 100644 index 00000000000..37a95270503 --- /dev/null +++ b/block/partitions/Makefile @@ -0,0 +1,22 @@ +# +# Makefile for the linux kernel. +# + +obj-$(CONFIG_BLOCK) := check.o + +obj-$(CONFIG_ACORN_PARTITION) += acorn.o +obj-$(CONFIG_AMIGA_PARTITION) += amiga.o +obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o +obj-$(CONFIG_MAC_PARTITION) += mac.o +obj-$(CONFIG_LDM_PARTITION) += ldm.o +obj-$(CONFIG_MSDOS_PARTITION) += msdos.o +obj-$(CONFIG_OSF_PARTITION) += osf.o +obj-$(CONFIG_SGI_PARTITION) += sgi.o +obj-$(CONFIG_SUN_PARTITION) += sun.o +obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o +obj-$(CONFIG_IBM_PARTITION) += ibm.o +obj-$(CONFIG_EFI_PARTITION) += efi.o +obj-$(CONFIG_KARMA_PARTITION) += karma.o +obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c new file mode 100644 index 00000000000..fbeb697374d --- /dev/null +++ b/block/partitions/acorn.c @@ -0,0 +1,556 @@ +/* + * linux/fs/partitions/acorn.c + * + * Copyright (c) 1996-2000 Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Scan ADFS partitions on hard disk drives. Unfortunately, there + * isn't a standard for partitioning drives on Acorn machines, so + * every single manufacturer of SCSI and IDE cards created their own + * method. + */ +#include <linux/buffer_head.h> +#include <linux/adfs_fs.h> + +#include "check.h" +#include "acorn.h" + +/* + * Partition types. 
(Oh for reusability) + */ +#define PARTITION_RISCIX_MFM 1 +#define PARTITION_RISCIX_SCSI 2 +#define PARTITION_LINUX 9 + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +/* + * Check the ADFS boot block in @data and, if its disc record reports a + * non-zero size, register the disc as partition @slot at @first_sector + * (optionally logging " [name]"). Returns the disc record inside @data, or + * NULL if the boot block check fails or the recorded size is zero. + */ +static struct adfs_discrecord * +adfs_partition(struct parsed_partitions *state, char *name, char *data, + unsigned long first_sector, int slot) +{ + struct adfs_discrecord *dr; + unsigned int nr_sects; + + if (adfs_checkbblk(data)) + return NULL; + + dr = (struct adfs_discrecord *)(data + 0x1c0); + + if (dr->disc_size == 0 && dr->disc_size_high == 0) + return NULL; + + nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | + (le32_to_cpu(dr->disc_size) >> 9); + + if (name) { + strlcat(state->pp_buf, " [", PAGE_SIZE); + strlcat(state->pp_buf, name, PAGE_SIZE); + strlcat(state->pp_buf, "]", PAGE_SIZE); + } + put_partition(state, slot, first_sector, nr_sects); + return dr; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + +struct riscix_part { + __le32 start; + __le32 length; + __le32 one; + char name[16]; +}; + +struct riscix_record { + __le32 magic; +#define RISCIX_MAGIC cpu_to_le32(0x4a657320) + __le32 date; + struct riscix_part part[8]; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +/* + * Read the RISCiX record at @first_sect and register the partitions it + * lists, starting at @slot. Returns the next free slot, or -1 if the + * sector cannot be read. + */ +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct riscix_record *rr; + + rr = read_part_sector(state, first_sect, &sect); + if (!rr) + return -1; + + strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); + + + if (rr->magic == RISCIX_MAGIC) { + unsigned long size = nr_sects > 2 ? 
2 : nr_sects; + int part; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + for (part = 0; part < 8; part++) { + if (rr->part[part].one && + memcmp(rr->part[part].name, "All\0", 4)) { + put_partition(state, slot++, + le32_to_cpu(rr->part[part].start), + le32_to_cpu(rr->part[part].length)); + strlcat(state->pp_buf, "(", PAGE_SIZE); + strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); + strlcat(state->pp_buf, ")", PAGE_SIZE); + } + } + + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } else { + put_partition(state, slot++, first_sect, nr_sects); + } + + put_dev_sector(sect); + return slot; +} +#endif +#endif + +#define LINUX_NATIVE_MAGIC 0xdeafa1de +#define LINUX_SWAP_MAGIC 0xdeafab1e + +struct linux_part { + __le32 magic; + __le32 start_sect; + __le32 nr_sects; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct linux_part *linuxp; + unsigned long size = nr_sects > 2 ? 
2 : nr_sects; + + strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + + linuxp = read_part_sector(state, first_sect, §); + if (!linuxp) + return -1; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || + linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { + if (slot == state->limit) + break; + put_partition(state, slot++, first_sect + + le32_to_cpu(linuxp->start_sect), + le32_to_cpu(linuxp->nr_sects)); + linuxp ++; + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + + put_dev_sector(sect); + return slot; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_CUMANA +int adfspart_check_CUMANA(struct parsed_partitions *state) +{ + unsigned long first_sector = 0; + unsigned int start_blk = 0; + Sector sect; + unsigned char *data; + char *name = "CUMANA/ADFS"; + int first = 1; + int slot = 1; + + /* + * Try Cumana style partitions - sector 6 contains ADFS boot block + * with pointer to next 'drive'. + * + * There are unknowns in this code - is the 'cylinder number' of the + * next partition relative to the start of this one - I'm assuming + * it is. + * + * Also, which ID did Cumana use? + * + * This is totally unfinished, and will require more work to get it + * going. Hence it is totally untested. + */ + do { + struct adfs_discrecord *dr; + unsigned int nr_sects; + + data = read_part_sector(state, start_blk * 2 + 6, §); + if (!data) + return -1; + + if (slot == state->limit) + break; + + dr = adfs_partition(state, name, data, first_sector, slot++); + if (!dr) + break; + + name = NULL; + + nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * + (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * + dr->secspertrack; + + if (!nr_sects) + break; + + first = 0; + first_sector += nr_sects; + start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); + nr_sects = 0; /* hmm - should be partition size */ + + switch (data[0x1fc] & 15) { + case 0: /* No partition / ADFS? 
*/ + break; + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + /* RISCiX - we don't know how to find the next one. */ + slot = riscix_partition(state, first_sector, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, first_sector, slot, + nr_sects); + break; + } + put_dev_sector(sect); + if (slot == -1) + return -1; + } while (1); + put_dev_sector(sect); + return first ? 0 : 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ADFS +/* + * Purpose: allocate ADFS partitions. + * + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * + * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. + * + * Alloc : hda = whole drive + * hda1 = ADFS partition on first drive. + * hda2 = non-ADFS partition. + */ +int adfspart_check_ADFS(struct parsed_partitions *state) +{ + unsigned long start_sect, nr_sects, sectscyl, heads; + Sector sect; + unsigned char *data; + struct adfs_discrecord *dr; + unsigned char id; + int slot = 1; + + data = read_part_sector(state, 6, §); + if (!data) + return -1; + + dr = adfs_partition(state, "ADFS", data, 0, slot++); + if (!dr) { + put_dev_sector(sect); + return 0; + } + + heads = dr->heads + ((dr->lowsector >> 6) & 1); + sectscyl = dr->secspertrack * heads; + start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; + id = data[0x1fc] & 15; + put_dev_sector(sect); + + /* + * Work out start of non-adfs partition. 
+ */ + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + + if (start_sect) { + switch (id) { +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + case PARTITION_RISCIX_MFM: + slot = riscix_partition(state, start_sect, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, start_sect, slot, + nr_sects); + break; + } + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ICS + +struct ics_part { + __le32 start; + __le32 size; +}; + +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) +{ + Sector sect; + unsigned char *data = read_part_sector(state, block, §); + int result = 0; + + if (data) { + if (memcmp(data, "LinuxPart", 9) == 0) + result = 1; + put_dev_sector(sect); + } + + return result; +} + +/* + * Check for a valid ICS partition using the checksum. + */ +static inline int valid_ics_sector(const unsigned char *data) +{ + unsigned long sum; + int i; + + for (i = 0, sum = 0x50617274; i < 508; i++) + sum += data[i]; + + sum -= le32_to_cpu(*(__le32 *)(&data[508])); + + return sum == 0; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. + * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_ICS(struct parsed_partitions *state) +{ + const unsigned char *data; + const struct ics_part *p; + int slot; + Sector sect; + + /* + * Try ICS style partitions - sector 0 contains partition info. 
+ */ + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ics_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); + + for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { + u32 start = le32_to_cpu(p->start); + s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ + + if (slot == state->limit) + break; + + /* + * Negative sizes tell the RISC OS ICS driver to ignore + * this partition - in effect it says that this does not + * contain an ADFS filesystem. + */ + if (size < 0) { + size = -size; + + /* + * Our own extension - We use the first sector + * of the partition to identify what type this + * partition is. We must not make this visible + * to the filesystem. + */ + if (size > 1 && adfspart_check_ICSLinux(state, start)) { + start += 1; + size -= 1; + } + } + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_POWERTEC +struct ptec_part { + __le32 unused1; + __le32 unused2; + __le32 start; + __le32 size; + __le32 unused5; + char type[8]; +}; + +static inline int valid_ptec_sector(const unsigned char *data) +{ + unsigned char checksum = 0x2a; + int i; + + /* + * If it looks like a PC/BIOS partition, then it + * probably isn't PowerTec. + */ + if (data[510] == 0x55 && data[511] == 0xaa) + return 0; + + for (i = 0; i < 511; i++) + checksum += data[i]; + + return checksum == data[511]; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. + * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. 
+ */ +int adfspart_check_POWERTEC(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + const struct ptec_part *p; + int slot = 1; + int i; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ptec_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); + + for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { + u32 start = le32_to_cpu(p->start); + u32 size = le32_to_cpu(p->size); + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_EESOX +struct eesox_part { + char magic[6]; + char name[10]; + __le32 start; + __le32 unused6; + __le32 unused7; + __le32 unused8; +}; + +/* + * Guess who created this format? + */ +static const char eesox_name[] = { + 'N', 'e', 'i', 'l', ' ', + 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' +}; + +/* + * EESOX SCSI partition format. + * + * This is a goddamned awful partition format. We don't seem to store + * the size of the partition in this table, only the start addresses. + * + * There are two possibilities where the size comes from: + * 1. The individual ADFS boot block entries that are placed on the disk. + * 2. The start address of the next entry. + */ +int adfspart_check_EESOX(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + unsigned char buffer[256]; + struct eesox_part *p; + sector_t start = 0; + int i, slot = 1; + + data = read_part_sector(state, 7, §); + if (!data) + return -1; + + /* + * "Decrypt" the partition table. God knows why... 
+ */ + for (i = 0; i < 256; i++) + buffer[i] = data[i] ^ eesox_name[i & 15]; + + put_dev_sector(sect); + + for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { + sector_t next; + + if (memcmp(p->magic, "Eesox", 6)) + break; + + next = le32_to_cpu(p->start); + if (i) + put_partition(state, slot++, start, next - start); + start = next; + } + + if (i != 0) { + sector_t size; + + size = get_capacity(state->bdev->bd_disk); + put_partition(state, slot++, start, size - start); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + } + + return i ? 1 : 0; +} +#endif diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h new file mode 100644 index 00000000000..ede82852969 --- /dev/null +++ b/block/partitions/acorn.h @@ -0,0 +1,14 @@ +/* + * linux/fs/partitions/acorn.h + * + * Copyright (C) 1996-2001 Russell King. + * + * I _hate_ this partitioning mess - why can't we have one defined + * format, and everyone stick to it? + */ + +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 00000000000..43be471d9b1 --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,293 @@ +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be> + */ + +#include "check.h" +#include "aix.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct 
lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount += copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. 
+ * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! + */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done! + */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, §); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + 
put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + } + put_dev_sector(sect); + } + pvd = alloc_pvd(state, vgda_sector + 17); + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix > state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix == 1) { + cur_lv_ix = lv_ix; + next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret = 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", + n[i].name, lvip[i].pps_found); + kfree(pvd); + } + kfree(n); + kfree(lvip); + return ret; +} diff 
--git a/block/partitions/aix.h b/block/partitions/aix.h new file mode 100644 index 00000000000..e0c66a98752 --- /dev/null +++ b/block/partitions/aix.h @@ -0,0 +1 @@ +extern int aix_partition(struct parsed_partitions *state); diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c new file mode 100644 index 00000000000..70cbf44a156 --- /dev/null +++ b/block/partitions/amiga.c @@ -0,0 +1,139 @@ +/* + * fs/partitions/amiga.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include <linux/types.h> +#include <linux/affs_hardblocks.h> + +#include "check.h" +#include "amiga.h" + +static __inline__ u32 +checksum_block(__be32 *m, int size) +{ + u32 sum = 0; + + while (size--) + sum += be32_to_cpu(*m++); + return sum; +} + +int amiga_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + struct RigidDiskBlock *rdb; + struct PartitionBlock *pb; + int start_sect, nr_sects, blk, part, res = 0; + int blksize = 1; /* Multiplier for disk block size */ + int slot = 1; + char b[BDEVNAME_SIZE]; + + for (blk = 0; ; blk++, put_dev_sector(sect)) { + if (blk == RDB_ALLOCATION_LIMIT) + goto rdb_done; + data = read_part_sector(state, blk, §); + if (!data) { + if (warn_no_part) + printk("Dev %s: unable to read RDB block %d\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) + continue; + + rdb = (struct RigidDiskBlock *)data; + if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) + break; + /* Try again with 0xdc..0xdf zeroed, Windows might have + * trashed it. 
+ */ + *(__be32 *)(data+0xdc) = 0; + if (checksum_block((__be32 *)data, + be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { + printk("Warning: Trashed word at 0xd0 in block %d " + "ignored in checksum calculation\n",blk); + break; + } + + printk("Dev %s: RDB in block %d has bad checksum\n", + bdevname(state->bdev, b), blk); + } + + /* blksize is blocks per 512 byte standard block */ + blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; + + { + char tmp[7 + 10 + 1 + 1]; + + /* Be more informative */ + snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + blk = be32_to_cpu(rdb->rdb_PartitionList); + put_dev_sector(sect); + for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { + blk *= blksize; /* Read in terms partition table understands */ + data = read_part_sector(state, blk, §); + if (!data) { + if (warn_no_part) + printk("Dev %s: unable to read partition block %d\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + pb = (struct PartitionBlock *)data; + blk = be32_to_cpu(pb->pb_Next); + if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) + continue; + if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) + continue; + + /* Tell Kernel about it */ + + nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - + be32_to_cpu(pb->pb_Environment[9])) * + be32_to_cpu(pb->pb_Environment[3]) * + be32_to_cpu(pb->pb_Environment[5]) * + blksize; + if (!nr_sects) + continue; + start_sect = be32_to_cpu(pb->pb_Environment[9]) * + be32_to_cpu(pb->pb_Environment[3]) * + be32_to_cpu(pb->pb_Environment[5]) * + blksize; + put_partition(state,slot++,start_sect,nr_sects); + { + /* Be even more informative to aid mounting */ + char dostype[4]; + char tmp[42]; + + __be32 *dt = (__be32 *)dostype; + *dt = pb->pb_Environment[16]; + if (dostype[3] < ' ') + snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", + dostype[0], dostype[1], + dostype[2], dostype[3] + '@' ); + else + snprintf(tmp, sizeof(tmp), " 
(%c%c%c%c)", + dostype[0], dostype[1], + dostype[2], dostype[3]); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + snprintf(tmp, sizeof(tmp), "(res %d spb %d)", + be32_to_cpu(pb->pb_Environment[6]), + be32_to_cpu(pb->pb_Environment[4])); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + res = 1; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + +rdb_done: + return res; +} diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h new file mode 100644 index 00000000000..d094585cada --- /dev/null +++ b/block/partitions/amiga.h @@ -0,0 +1,6 @@ +/* + * fs/partitions/amiga.h + */ + +int amiga_partition(struct parsed_partitions *state); + diff --git a/block/partitions/atari.c b/block/partitions/atari.c new file mode 100644 index 00000000000..9875b05e80a --- /dev/null +++ b/block/partitions/atari.c @@ -0,0 +1,149 @@ +/* + * fs/partitions/atari.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include <linux/ctype.h> +#include "check.h" +#include "atari.h" + +/* ++guenther: this should be settable by the user ("make config")?. 
+ */ +#define ICD_PARTS + +/* check if a partition entry looks valid -- Atari format is assumed if at + least one of the primary entries is ok this way */ +#define VALID_PARTITION(pi,hdsiz) \ + (((pi)->flg & 1) && \ + isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ + be32_to_cpu((pi)->st) <= (hdsiz) && \ + be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) + +static inline int OK_id(char *s) +{ + return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || + memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || + memcmp (s, "RAW", 3) == 0 ; +} + +int atari_partition(struct parsed_partitions *state) +{ + Sector sect; + struct rootsector *rs; + struct partition_info *pi; + u32 extensect; + u32 hd_size; + int slot; +#ifdef ICD_PARTS + int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ +#endif + + rs = read_part_sector(state, 0, §); + if (!rs) + return -1; + + /* Verify this is an Atari rootsector: */ + hd_size = state->bdev->bd_inode->i_size >> 9; + if (!VALID_PARTITION(&rs->part[0], hd_size) && + !VALID_PARTITION(&rs->part[1], hd_size) && + !VALID_PARTITION(&rs->part[2], hd_size) && + !VALID_PARTITION(&rs->part[3], hd_size)) { + /* + * if there's no valid primary partition, assume that no Atari + * format partition table (there's no reliable magic or the like + * :-() + */ + put_dev_sector(sect); + return 0; + } + + pi = &rs->part[0]; + strlcat(state->pp_buf, " AHDI", PAGE_SIZE); + for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { + struct rootsector *xrs; + Sector sect2; + ulong partsect; + + if ( !(pi->flg & 1) ) + continue; + /* active partition */ + if (memcmp (pi->id, "XGM", 3) != 0) { + /* we don't care about other id's */ + put_partition (state, slot, be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + continue; + } + /* extension partition */ +#ifdef ICD_PARTS + part_fmt = 1; +#endif + strlcat(state->pp_buf, " XGM<", PAGE_SIZE); + partsect = extensect = be32_to_cpu(pi->st); + while (1) { + xrs = 
read_part_sector(state, partsect, §2); + if (!xrs) { + printk (" block %ld read failed\n", partsect); + put_dev_sector(sect); + return -1; + } + + /* ++roman: sanity check: bit 0 of flg field must be set */ + if (!(xrs->part[0].flg & 1)) { + printk( "\nFirst sub-partition in extended partition is not valid!\n" ); + put_dev_sector(sect2); + break; + } + + put_partition(state, slot, + partsect + be32_to_cpu(xrs->part[0].st), + be32_to_cpu(xrs->part[0].siz)); + + if (!(xrs->part[1].flg & 1)) { + /* end of linked partition list */ + put_dev_sector(sect2); + break; + } + if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { + printk("\nID of extended partition is not XGM!\n"); + put_dev_sector(sect2); + break; + } + + partsect = be32_to_cpu(xrs->part[1].st) + extensect; + put_dev_sector(sect2); + if (++slot == state->limit) { + printk( "\nMaximum number of partitions reached!\n" ); + break; + } + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } +#ifdef ICD_PARTS + if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ + pi = &rs->icdpart[0]; + /* sanity check: no ICD format if first partition invalid */ + if (OK_id(pi->id)) { + strlcat(state->pp_buf, " ICD<", PAGE_SIZE); + for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { + /* accept only GEM,BGM,RAW,LNX,SWP partitions */ + if (!((pi->flg & 1) && OK_id(pi->id))) + continue; + part_fmt = 2; + put_partition (state, slot, + be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } + } +#endif + put_dev_sector(sect); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/atari.h b/block/partitions/atari.h new file mode 100644 index 00000000000..f2ec43bfeec --- /dev/null +++ b/block/partitions/atari.h @@ -0,0 +1,36 @@ +/* + * fs/partitions/atari.h + * Moved by Russell King from: + * + * linux/include/linux/atari_rootsec.h + * definitions for Atari Rootsector layout + * by Andreas Schwab 
(schwab@ls5.informatik.uni-dortmund.de) + * + * modified for ICD/Supra partitioning scheme restricted to at most 12 + * partitions + * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) + */ + +#include <linux/compiler.h> + +struct partition_info +{ + u8 flg; /* bit 0: active; bit 7: bootable */ + char id[3]; /* "GEM", "BGM", "XGM", or other */ + __be32 st; /* start of partition */ + __be32 siz; /* length of partition */ +}; + +struct rootsector +{ + char unused[0x156]; /* room for boot code */ + struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ + char unused2[0xc]; + u32 hd_siz; /* size of disk in blocks */ + struct partition_info part[4]; + u32 bsl_st; /* start of bad sector list */ + u32 bsl_cnt; /* length of bad sector list */ + u16 checksum; /* checksum for bootable disks */ +} __packed; + +int atari_partition(struct parsed_partitions *state); diff --git a/block/partitions/check.c b/block/partitions/check.c new file mode 100644 index 00000000000..9ac1df74f69 --- /dev/null +++ b/block/partitions/check.c @@ -0,0 +1,197 @@ +/* + * fs/partitions/check.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + * + * We now have independent partition support from the + * block drivers, which allows all the partition code to + * be grouped in one location, and it to be mostly self + * contained. 
+ * + * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} + */ + +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/ctype.h> +#include <linux/genhd.h> + +#include "check.h" + +#include "acorn.h" +#include "amiga.h" +#include "atari.h" +#include "ldm.h" +#include "mac.h" +#include "msdos.h" +#include "osf.h" +#include "sgi.h" +#include "sun.h" +#include "ibm.h" +#include "ultrix.h" +#include "efi.h" +#include "karma.h" +#include "sysv68.h" +#include "cmdline.h" + +int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ + +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. + */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif + + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. 
+ */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, +#endif + +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +static struct parsed_partitions *allocate_partitions(struct gendisk *hd) +{ + struct parsed_partitions *state; + int nr; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return NULL; + + nr = disk_max_parts(hd); + state->parts = vzalloc(nr * sizeof(state->parts[0])); + if (!state->parts) { + kfree(state); + return NULL; + } + + state->limit = nr; + + return state; +} + +void free_partitions(struct parsed_partitions *state) +{ + vfree(state->parts); + kfree(state); +} + +struct parsed_partitions * +check_partition(struct gendisk *hd, struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = allocate_partitions(hd); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + free_partitions(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if 
(isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + i = res = err = 0; + while (!res && check_part[i]) { + memset(state->parts, 0, state->limit * sizeof(state->parts[0])); + res = check_part[i++](state); + if (res < 0) { + /* We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + if (err) + /* The partition is unrecognized. So report I/O errors if there were any */ + res = err; + if (!res) + strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); + else if (warn_no_part) + strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); + + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + free_partitions(state); + return ERR_PTR(res); +} diff --git a/block/partitions/check.h b/block/partitions/check.h new file mode 100644 index 00000000000..eade17ea910 --- /dev/null +++ b/block/partitions/check.h @@ -0,0 +1,54 @@ +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> + +/* + * add_gd_partition adds a partitions details to the devices partition + * description. 
+ */ +struct parsed_partitions { + struct block_device *bdev; + char name[BDEVNAME_SIZE]; + struct { + sector_t from; + sector_t size; + int flags; + bool has_info; + struct partition_meta_info info; + } *parts; + int next; + int limit; + bool access_beyond_eod; + char *pp_buf; +}; + +void free_partitions(struct parsed_partitions *state); + +struct parsed_partitions * +check_partition(struct gendisk *, struct block_device *); + +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + return read_dev_sector(state->bdev, n, p); +} + +static inline void +put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) +{ + if (n < p->limit) { + char tmp[1 + BDEVNAME_SIZE + 10 + 1]; + + p->parts[n].from = from; + p->parts[n].size = size; + snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); + strlcat(p->pp_buf, tmp, PAGE_SIZE); + } +} + +extern int warn_no_part; + diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 00000000000..5141b563adf --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong <caizhiyong@huawei.com> + * + * Read block device partition table from the command line. + * Typically used for fixed block (eMMC) embedded devices. + * It has no MBR, so saves storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. 
+ * + * For further information, see "Documentation/block/cmdline-partition.txt" + * + */ + +#include <linux/cmdline-parser.h> + +#include "check.h" +#include "cmdline.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +/* + * Purpose: allocate cmdline partitions. 
+ * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h new file mode 100644 index 00000000000..26e0f8da141 --- /dev/null +++ b/block/partitions/cmdline.h @@ -0,0 +1,2 @@ + +int cmdline_partition(struct parsed_partitions *state); diff --git a/block/partitions/efi.c b/block/partitions/efi.c new file mode 100644 index 00000000000..dc51f467a56 --- /dev/null +++ b/block/partitions/efi.c @@ -0,0 +1,735 @@ +/************************************************************ + * EFI GUID Partition Table handling + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * + * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> + * Copyright 2000,2001,2002,2004 Dell Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * TODO: + * + * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com> + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * + * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> + * - test for valid PMBR and valid PGPT before ever reading + * AGPT, allow override with 'gpt' kernel command line option. + * - check for first/last_usable_lba outside of size of disk + * + * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> + * - Ported to 2.5.7-pre1 and 2.5.7-dj2 + * - Applied patch to avoid fault in alternate header handling + * - cleaned up find_valid_gpt + * - On-disk structure and copy in memory is *always* LE now - + * swab fields as needed + * - remove print_gpt_header() + * - only use first max_p partition entries, to keep the kernel minor number + * and partition numbers tied. + * + * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> + * - Removed __PRIPTR_PREFIX - not being used + * + * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> + * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied + * + * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> + * - Added compare_gpts(). + * - moved le_efi_guid_to_cpus() back into this file. GPT is the only + * thing that keeps EFI GUIDs on disk. + * - Changed gpt structure names and members to be simpler and more Linux-like. + * + * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> + * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck + * + * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> + * - Changed function comments to DocBook style per Andreas Dilger suggestion. + * + * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> + * - Change read_lba() to use the page cache per Al Viro's work. 
+ * - print u64s properly on all architectures + * - fixed debug_printk(), now Dprintk() + * + * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> + * - Style cleanups + * - made most functions static + * - Endianness addition + * - remove test for second alternate header, as it's not per spec, + * and is unnecessary. There's now a method to read/write the last + * sector of an odd-sized disk from user space. No tools have ever + * been released which used this code, so it's effectively dead. + * - Per Asit Mallick of Intel, added a test for a valid PMBR. + * - Added kernel command line option 'gpt' to override valid PMBR test. + * + * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> + * - added devfs volume UUID support (/dev/volumes/uuids) for + * mounting file systems by the partition GUID. + * + * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> + * - Moved crc32() to linux/lib, added efi_crc32(). + * + * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> + * - Replaced Intel's CRC32 function with an equivalent + * non-license-restricted version. + * + * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> + * - Fixed the last_lba() call to return the proper last block + * + * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> + * - Thanks to Andries Brouwer for his debugging assistance. + * - Code works, detects all the partitions. + * + ************************************************************/ +#include <linux/kernel.h> +#include <linux/crc32.h> +#include <linux/ctype.h> +#include <linux/math64.h> +#include <linux/slab.h> +#include "check.h" +#include "efi.h" + +/* This allows a kernel command line option 'gpt' to override + * the test for invalid PMBR. Not __initdata because reloading + * the partition tables happens after init too. 
+ */ +static int force_gpt; +static int __init +force_gpt_fn(char *str) +{ + force_gpt = 1; + return 1; +} +__setup("gpt", force_gpt_fn); + + +/** + * efi_crc32() - EFI version of crc32 function + * @buf: buffer to calculate crc32 of + * @len - length of buf + * + * Description: Returns EFI-style CRC32 value for @buf + * + * This function uses the little endian Ethernet polynomial + * but seeds the function with ~0, and xor's with ~0 at the end. + * Note, the EFI Specification, v1.02, has a reference to + * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). + */ +static inline u32 +efi_crc32(const void *buf, unsigned long len) +{ + return (crc32(~0L, buf, len) ^ ~0L); +} + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; +} + +static inline int pmbr_part_valid(gpt_mbr_record *part) +{ + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + return GPT_MBR_PROTECTIVE; +invalid: + return 0; +} + +/** + * is_pmbr_valid(): test Protective MBR for validity + * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device + * + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). 
The validity of a pMBR depends + * on all of the following properties: + * 1) MSDOS signature is in the last two bytes of the MBR + * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. + */ +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) +{ + uint32_t sz = 0; + int i, part = 0, ret = 0; /* invalid by default */ + + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + part = i; + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. + */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: + for (i = 0; i < 4; i++) + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * Some partitioning programs, nonetheless, choose to set + * the size to the maximum 32-bit limitation, disregarding + * the disk size. + * + * Hybrid MBRs do not necessarily comply with this. + * + * Consider a bad value here to be a warning to support dd'ing + * an image from a smaller disk to a larger disk. 
+ */ + if (ret == GPT_MBR_PROTECTIVE) { + sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); + if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", + sz, min_t(uint32_t, + total_sectors - 1, 0xFFFFFFFF)); + } +done: + return ret; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @size_t + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) +{ + size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + + if (!buffer || lba > last_lba(bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, n++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount +=copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_read_gpt_entries(): reads partition entries from disk + * @state + * @gpt - GPT header + * + * Description: Returns ptes on success, NULL on error. + * Allocates space for PTEs based on information found in @gpt. + * Notes: remember to free pte when you're done! 
+ */ +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) +{ + size_t count; + gpt_entry *pte; + + if (!gpt) + return NULL; + + count = le32_to_cpu(gpt->num_partition_entries) * + le32_to_cpu(gpt->sizeof_partition_entry); + if (!count) + return NULL; + pte = kmalloc(count, GFP_KERNEL); + if (!pte) + return NULL; + + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), + (u8 *) pte, count) < count) { + kfree(pte); + pte=NULL; + return NULL; + } + return pte; +} + +/** + * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk + * @state + * @lba is the Logical Block Address of the partition table + * + * Description: returns GPT header on success, NULL on error. Allocates + * and fills a GPT header starting at @ from @state->bdev. + * Note: remember to free gpt when finished with it. + */ +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) +{ + gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(state->bdev); + + gpt = kmalloc(ssz, GFP_KERNEL); + if (!gpt) + return NULL; + + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { + kfree(gpt); + gpt=NULL; + return NULL; + } + + return gpt; +} + +/** + * is_gpt_valid() - tests one GPT header and PTEs for validity + * @state + * @lba is the logical block address of the GPT header to test + * @gpt is a GPT header ptr, filled on return. + * @ptes is a PTEs ptr, filled on return. + * + * Description: returns 1 if valid, 0 on error. + * If valid, returns pointers to newly allocated GPT header and PTEs. 
+ */ +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) +{ + u32 crc, origcrc; + u64 lastlba; + + if (!ptes) + return 0; + if (!(*gpt = alloc_read_gpt_header(state, lba))) + return 0; + + /* Check the GUID Partition Table signature */ + if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); + goto fail; + } + + /* Check the GUID Partition Table header size is too big */ + if (le32_to_cpu((*gpt)->header_size) > + bdev_logical_block_size(state->bdev)) { + pr_debug("GUID Partition Table Header size is too large: %u > %u\n", + le32_to_cpu((*gpt)->header_size), + bdev_logical_block_size(state->bdev)); + goto fail; + } + + /* Check the GUID Partition Table header size is too small */ + if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) { + pr_debug("GUID Partition Table Header size is too small: %u < %zu\n", + le32_to_cpu((*gpt)->header_size), + sizeof(gpt_header)); + goto fail; + } + + /* Check the GUID Partition Table CRC */ + origcrc = le32_to_cpu((*gpt)->header_crc32); + (*gpt)->header_crc32 = 0; + crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); + + if (crc != origcrc) { + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); + goto fail; + } + (*gpt)->header_crc32 = cpu_to_le32(origcrc); + + /* Check that the my_lba entry points to the LBA that contains + * the GUID Partition Table */ + if (le64_to_cpu((*gpt)->my_lba) != lba) { + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); + goto fail; + } + + /* Check the first_usable_lba and last_usable_lba are + * within the disk. 
+ */ + lastlba = last_lba(state->bdev); + if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } + /* Check that sizeof_partition_entry has the correct value */ + if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { + pr_debug("GUID Partitition Entry Size check failed.\n"); + goto fail; + } + + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) + goto fail; + + /* Check the GUID Partition Entry Array CRC */ + crc = efi_crc32((const unsigned char *) (*ptes), + le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry)); + + if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { + pr_debug("GUID Partitition Entry Array CRC check failed.\n"); + goto fail_ptes; + } + + /* We're done, all's well */ + return 1; + + fail_ptes: + kfree(*ptes); + *ptes = NULL; + fail: + kfree(*gpt); + *gpt = NULL; + return 0; +} + +/** + * is_pte_valid() - tests one PTE for validity + * @pte is the pte to check + * @lastlba is last lba of the disk + * + * Description: returns 1 if valid, 0 on error. 
+ */ +static inline int +is_pte_valid(const gpt_entry *pte, const u64 lastlba) +{ + if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || + le64_to_cpu(pte->starting_lba) > lastlba || + le64_to_cpu(pte->ending_lba) > lastlba) + return 0; + return 1; +} + +/** + * compare_gpts() - Search disk for valid GPT headers and PTEs + * @pgpt is the primary GPT header + * @agpt is the alternate GPT header + * @lastlba is the last LBA number + * Description: Returns nothing. Sanity checks pgpt and agpt fields + * and prints warnings on discrepancies. + * + */ +static void +compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) +{ + int error_found = 0; + if (!pgpt || !agpt) + return; + if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { + pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->my_lba), + (unsigned long long)le64_to_cpu(agpt->alternate_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { + pr_warn("GPT:Primary header alternate_lba != Alt. 
header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)le64_to_cpu(agpt->my_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->first_usable_lba) != + le64_to_cpu(agpt->first_usable_lba)) { + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), + (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->last_usable_lba) != + le64_to_cpu(agpt->last_usable_lba)) { + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), + (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); + error_found++; + } + if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { + pr_warn("GPT:disk_guids don't match.\n"); + error_found++; + } + if (le32_to_cpu(pgpt->num_partition_entries) != + le32_to_cpu(agpt->num_partition_entries)) { + pr_warn("GPT:num_partition_entries don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->num_partition_entries), + le32_to_cpu(agpt->num_partition_entries)); + error_found++; + } + if (le32_to_cpu(pgpt->sizeof_partition_entry) != + le32_to_cpu(agpt->sizeof_partition_entry)) { + pr_warn("GPT:sizeof_partition_entry values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->sizeof_partition_entry), + le32_to_cpu(agpt->sizeof_partition_entry)); + error_found++; + } + if (le32_to_cpu(pgpt->partition_entry_array_crc32) != + le32_to_cpu(agpt->partition_entry_array_crc32)) { + pr_warn("GPT:partition_entry_array_crc32 values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->partition_entry_array_crc32), + le32_to_cpu(agpt->partition_entry_array_crc32)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { + pr_warn("GPT:Primary header thinks Alt. 
header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (le64_to_cpu(agpt->my_lba) != lastlba) { + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(agpt->my_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (error_found) + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); + return; +} + +/** + * find_valid_gpt() - Search disk for valid GPT headers and PTEs + * @state + * @gpt is a GPT header ptr, filled on return. + * @ptes is a PTEs ptr, filled on return. + * Description: Returns 1 if valid, 0 on error. + * If valid, returns pointers to newly allocated GPT header and PTEs. + * Validity depends on PMBR being valid (or being overridden by the + * 'gpt' kernel command line option) and finding either the Primary + * GPT header and PTEs valid, or the Alternate GPT header and PTEs + * valid. If the Primary GPT header is not valid, the Alternate GPT header + * is not checked unless the 'gpt' kernel command line option is passed. + * This protects against devices which misreport their size, and forces + * the user to decide to use the Alternate GPT. + */ +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) +{ + int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; + gpt_header *pgpt = NULL, *agpt = NULL; + gpt_entry *pptes = NULL, *aptes = NULL; + legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + u64 lastlba; + + if (!ptes) + return 0; + + lastlba = last_lba(state->bdev); + if (!force_gpt) { + /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? + "protective" : "hybrid"); + } + + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, + &pgpt, &pptes); + if (good_pgpt) + good_agpt = is_gpt_valid(state, + le64_to_cpu(pgpt->alternate_lba), + &agpt, &aptes); + if (!good_agpt && force_gpt) + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + + /* The obviously unsuccessful case */ + if (!good_pgpt && !good_agpt) + goto fail; + + compare_gpts(pgpt, agpt, lastlba); + + /* The good cases */ + if (good_pgpt) { + *gpt = pgpt; + *ptes = pptes; + kfree(agpt); + kfree(aptes); + if (!good_agpt) + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); + return 1; + } + else if (good_agpt) { + *gpt = agpt; + *ptes = aptes; + kfree(pgpt); + kfree(pptes); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); + return 1; + } + + fail: + kfree(pgpt); + kfree(agpt); + kfree(pptes); + kfree(aptes); + *gpt = NULL; + *ptes = NULL; + return 0; +} + +/** + * efi_partition(struct parsed_partitions *state) + * @state + * + * Description: called from check.c, if the disk contains GPT + * partitions, sets up partition entries in the kernel. + * + * If the first block on the disk is a legacy MBR, + * it will get handled by msdos_partition(). + * If it's a Protective MBR, we'll handle it here. + * + * We do not create a Linux partition for GPT, but + * only for the actual data partitions. 
+ * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + * + */ +int efi_partition(struct parsed_partitions *state) +{ + gpt_header *gpt = NULL; + gpt_entry *ptes = NULL; + u32 i; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { + kfree(gpt); + kfree(ptes); + return 0; + } + + pr_debug("GUID Partition Table is valid! Yea!\n"); + + for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_count = 0; + unsigned label_max; + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + continue; + + put_partition(state, i+1, start * ssz, size * ssz); + + /* If this is a RAID volume, tell md */ + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) + state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. 
*/ + label_max = min(ARRAY_SIZE(info->volname) - 1, + ARRAY_SIZE(ptes[i].partition_name)); + info->volname[label_max] = 0; + while (label_count < label_max) { + u8 c = ptes[i].partition_name[label_count] & 0xff; + if (c && !isprint(c)) + c = '!'; + info->volname[label_count] = c; + label_count++; + } + state->parts[i + 1].has_info = true; + } + kfree(ptes); + kfree(gpt); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/block/partitions/efi.h b/block/partitions/efi.h new file mode 100644 index 00000000000..abd0b19288a --- /dev/null +++ b/block/partitions/efi.h @@ -0,0 +1,133 @@ +/************************************************************ + * EFI GUID Partition Table + * Per Intel EFI Specification v1.02 + * http://developer.intel.com/technology/efi/efi.htm + * + * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000 + * Copyright 2000,2001 Dell Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + ************************************************************/ + +#ifndef FS_PART_EFI_H_INCLUDED +#define FS_PART_EFI_H_INCLUDED + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/string.h> +#include <linux/efi.h> +#include <linux/compiler.h> + +#define MSDOS_MBR_SIGNATURE 0xaa55 +#define EFI_PMBR_OSTYPE_EFI 0xEF +#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE + +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + +#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL +#define GPT_HEADER_REVISION_V1 0x00010000 +#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 + +#define PARTITION_SYSTEM_GUID \ + EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ + 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) +#define LEGACY_MBR_PARTITION_GUID \ + EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ + 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) +#define PARTITION_MSFT_RESERVED_GUID \ + EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ + 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) +#define PARTITION_BASIC_DATA_GUID \ + EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ + 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) +#define PARTITION_LINUX_RAID_GUID \ + EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ + 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) +#define PARTITION_LINUX_SWAP_GUID \ + EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ + 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) +#define PARTITION_LINUX_LVM_GUID \ + EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ + 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) + +typedef struct _gpt_header { + __le64 signature; + __le32 revision; + __le32 header_size; + __le32 header_crc32; + __le32 reserved1; + __le64 my_lba; + __le64 alternate_lba; + __le64 first_usable_lba; + __le64 last_usable_lba; 
+ efi_guid_t disk_guid; + __le64 partition_entry_lba; + __le32 num_partition_entries; + __le32 sizeof_partition_entry; + __le32 partition_entry_array_crc32; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ +} __packed gpt_header; + +typedef struct _gpt_entry_attributes { + u64 required_to_function:1; + u64 reserved:47; + u64 type_guid_specific:16; +} __packed gpt_entry_attributes; + +typedef struct _gpt_entry { + efi_guid_t partition_type_guid; + efi_guid_t unique_partition_guid; + __le64 starting_lba; + __le64 ending_lba; + gpt_entry_attributes attributes; + efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; +} __packed gpt_entry; + +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + +typedef struct _legacy_mbr { + u8 boot_code[440]; + __le32 unique_mbr_signature; + __le16 unknown; + gpt_mbr_record partition_record[4]; + __le16 signature; +} __packed legacy_mbr; + +/* Functions */ +extern int efi_partition(struct parsed_partitions *state); + +#endif diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c new file mode 100644 index 00000000000..47a61474e79 --- /dev/null +++ b/block/partitions/ibm.c @@ -0,0 +1,364 @@ +/* + * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com> + * Volker Sameske <sameske@de.ibm.com> + * Bugreports.to..: <Linux390@de.ibm.com> + * Copyright IBM Corp. 
1999, 2012 + */ + +#include <linux/buffer_head.h> +#include <linux/hdreg.h> +#include <linux/slab.h> +#include <asm/dasd.h> +#include <asm/ebcdic.h> +#include <asm/uaccess.h> +#include <asm/vtoc.h> + +#include "check.h" +#include "ibm.h" + + +union label_t { + struct vtoc_volume_label_cdl vol; + struct vtoc_volume_label_ldl lnx; + struct vtoc_cms_label cms; +}; + +/* + * compute the block number from a + * cyl-cyl-head-head structure + */ +static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo) +{ + sector_t cyl; + __u16 head; + + /* decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors; +} + +/* + * compute the block number from a + * cyl-cyl-head-head-block structure + */ +static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo) +{ + sector_t cyl; + __u16 head; + + /* decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors + + ptr->b; +} + +static int find_label(struct parsed_partitions *state, + dasd_information2_t *info, + struct hd_geometry *geo, + int blocksize, + sector_t *labelsect, + char name[], + char type[], + union label_t *label) +{ + Sector sect; + unsigned char *data; + sector_t testsect[3]; + unsigned char temp[5]; + int found = 0; + int i, testcount; + + /* There a three places where we may find a valid label: + * - on an ECKD disk it's block 2 + * - on an FBA disk it's block 1 + * - on an CMS formatted FBA disk it is sector 1, even if the block size + * is larger than 512 bytes (possible if the DIAG discipline is used) + * If we have a valid info structure, then we know exactly which case we + * have, otherwise we just search through all possebilities. 
+ */ + if (info) { + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + testsect[0] = info->label_block; + else + testsect[0] = info->label_block * (blocksize >> 9); + testcount = 1; + } else { + testsect[0] = 1; + testsect[1] = (blocksize >> 9); + testsect[2] = 2 * (blocksize >> 9); + testcount = 3; + } + for (i = 0; i < testcount; ++i) { + data = read_part_sector(state, testsect[i], §); + if (data == NULL) + continue; + memcpy(label, data, sizeof(*label)); + memcpy(temp, data, 4); + temp[4] = 0; + EBCASC(temp, 4); + put_dev_sector(sect); + if (!strcmp(temp, "VOL1") || + !strcmp(temp, "LNX1") || + !strcmp(temp, "CMS1")) { + if (!strcmp(temp, "VOL1")) { + strncpy(type, label->vol.vollbl, 4); + strncpy(name, label->vol.volid, 6); + } else { + strncpy(type, label->lnx.vollbl, 4); + strncpy(name, label->lnx.volid, 6); + } + EBCASC(type, 4); + EBCASC(name, 6); + *labelsect = testsect[i]; + found = 1; + break; + } + } + if (!found) + memset(label, 0, sizeof(*label)); + + return found; +} + +static int find_vol1_partitions(struct parsed_partitions *state, + struct hd_geometry *geo, + int blocksize, + char name[], + union label_t *label) +{ + sector_t blk; + int counter; + char tmp[64]; + Sector sect; + unsigned char *data; + loff_t offset, size; + struct vtoc_format1_label f1; + int secperblk; + + snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* + * get start of VTOC from the disk label and then search for format1 + * and format8 labels + */ + secperblk = blocksize >> 9; + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; + counter = 0; + data = read_part_sector(state, blk * secperblk, §); + while (data != NULL) { + memcpy(&f1, data, sizeof(struct vtoc_format1_label)); + put_dev_sector(sect); + /* skip FMT4 / FMT5 / FMT7 labels */ + if (f1.DS1FMTID == _ascebc['4'] + || f1.DS1FMTID == _ascebc['5'] + || f1.DS1FMTID == _ascebc['7'] + || f1.DS1FMTID == _ascebc['9']) { 
+ blk++; + data = read_part_sector(state, blk * secperblk, §); + continue; + } + /* only FMT1 and 8 labels valid at this point */ + if (f1.DS1FMTID != _ascebc['1'] && + f1.DS1FMTID != _ascebc['8']) + break; + /* OK, we got valid partition data */ + offset = cchh2blk(&f1.DS1EXT1.llimit, geo); + size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - + offset + geo->sectors; + offset *= secperblk; + size *= secperblk; + if (counter >= state->limit) + break; + put_partition(state, counter + 1, offset, size); + counter++; + blk++; + data = read_part_sector(state, blk * secperblk, §); + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + if (!data) + return -1; + + return 1; +} + +static int find_lnx1_partitions(struct parsed_partitions *state, + struct hd_geometry *geo, + int blocksize, + char name[], + union label_t *label, + sector_t labelsect, + loff_t i_size, + dasd_information2_t *info) +{ + loff_t offset, geo_size, size; + char tmp[64]; + int secperblk; + + snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + secperblk = blocksize >> 9; + if (label->lnx.ldl_version == 0xf2) { + size = label->lnx.formatted_blocks * secperblk; + } else { + /* + * Formated w/o large volume support. If the sanity check + * 'size based on geo == size based on i_size' is true, then + * we can safely assume that we know the formatted size of + * the disk, otherwise we need additional information + * that we can only get from a real DASD device. 
+ */ + geo_size = geo->cylinders * geo->heads + * geo->sectors * secperblk; + size = i_size >> 9; + if (size != geo_size) { + if (!info) { + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; + } + if (!strcmp(info->type, "ECKD")) + if (geo_size < size) + size = geo_size; + /* else keep size based on i_size */ + } + } + /* first and only partition starts in the first block after the label */ + offset = labelsect + secperblk; + put_partition(state, 1, offset, size - offset); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} + +static int find_cms1_partitions(struct parsed_partitions *state, + struct hd_geometry *geo, + int blocksize, + char name[], + union label_t *label, + sector_t labelsect) +{ + loff_t offset, size; + char tmp[64]; + int secperblk; + + /* + * VM style CMS1 labeled disk + */ + blocksize = label->cms.block_size; + secperblk = blocksize >> 9; + if (label->cms.disk_offset != 0) { + snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* disk is reserved minidisk */ + offset = label->cms.disk_offset * secperblk; + size = (label->cms.block_count - 1) * secperblk; + } else { + snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* + * Special case for FBA devices: + * If an FBA device is CMS formatted with blocksize > 512 byte + * and the DIAG discipline is used, then the CMS label is found + * in sector 1 instead of block 1. However, the partition is + * still supposed to start in block 2. 
+ */ + if (labelsect == 1) + offset = 2 * secperblk; + else + offset = labelsect + secperblk; + size = label->cms.block_count * secperblk; + } + + put_partition(state, 1, offset, size-offset); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} + + +/* + * This is the main function, called by check.c + */ +int ibm_partition(struct parsed_partitions *state) +{ + struct block_device *bdev = state->bdev; + int blocksize, res; + loff_t i_size, offset, size; + dasd_information2_t *info; + struct hd_geometry *geo; + char type[5] = {0,}; + char name[7] = {0,}; + sector_t labelsect; + union label_t *label; + + res = 0; + blocksize = bdev_logical_block_size(bdev); + if (blocksize <= 0) + goto out_exit; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + goto out_exit; + info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); + if (info == NULL) + goto out_exit; + geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); + if (geo == NULL) + goto out_nogeo; + label = kmalloc(sizeof(union label_t), GFP_KERNEL); + if (label == NULL) + goto out_nolab; + if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) + goto out_freeall; + if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) { + kfree(info); + info = NULL; + } + + if (find_label(state, info, geo, blocksize, &labelsect, name, type, + label)) { + if (!strncmp(type, "VOL1", 4)) { + res = find_vol1_partitions(state, geo, blocksize, name, + label); + } else if (!strncmp(type, "LNX1", 4)) { + res = find_lnx1_partitions(state, geo, blocksize, name, + label, labelsect, i_size, + info); + } else if (!strncmp(type, "CMS1", 4)) { + res = find_cms1_partitions(state, geo, blocksize, name, + label, labelsect); + } + } else if (info) { + /* + * ugly but needed for backward compatibility: + * If the block device is a DASD (i.e. BIODASDINFO2 works), + * then we claim it in any case, even though it has no valid + * label. 
If it has the LDL format, then we simply define a + * partition as if it had an LNX1 label. + */ + res = 1; + if (info->format == DASD_FORMAT_LDL) { + strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); + size = i_size >> 9; + offset = (info->label_block + 1) * (blocksize >> 9); + put_partition(state, 1, offset, size-offset); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + } + } else + res = 0; + +out_freeall: + kfree(label); +out_nolab: + kfree(geo); +out_nogeo: + kfree(info); +out_exit: + return res; +} diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h new file mode 100644 index 00000000000..08fb0804a81 --- /dev/null +++ b/block/partitions/ibm.h @@ -0,0 +1 @@ +int ibm_partition(struct parsed_partitions *); diff --git a/block/partitions/karma.c b/block/partitions/karma.c new file mode 100644 index 00000000000..9721fa589bb --- /dev/null +++ b/block/partitions/karma.c @@ -0,0 +1,58 @@ +/* + * fs/partitions/karma.c + * Rio Karma partition info. + * + * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) + * based on osf.c + */ + +#include "check.h" +#include "karma.h" +#include <linux/compiler.h> + +int karma_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + Sector sect; + unsigned char *data; + struct disklabel { + u8 d_reserved[270]; + struct d_partition { + __le32 p_res; + u8 p_fstype; + u8 p_res2[3]; + __le32 p_offset; + __le32 p_size; + } d_partitions[2]; + u8 d_blank[208]; + __le16 d_magic; + } __packed *label; + struct d_partition *p; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *)data; + if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { + put_dev_sector(sect); + return 0; + } + + p = label->d_partitions; + for (i = 0 ; i < 2; i++, p++) { + if (slot == state->limit) + break; + + if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { + put_partition(state, slot, le32_to_cpu(p->p_offset), + le32_to_cpu(p->p_size)); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + 
put_dev_sector(sect); + return 1; +} + diff --git a/block/partitions/karma.h b/block/partitions/karma.h new file mode 100644 index 00000000000..c764b2e9df2 --- /dev/null +++ b/block/partitions/karma.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/karma.h + */ + +#define KARMA_LABEL_MAGIC 0xAB56 + +int karma_partition(struct parsed_partitions *state); + diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c new file mode 100644 index 00000000000..e507cfbd044 --- /dev/null +++ b/block/partitions/ldm.c @@ -0,0 +1,1567 @@ +/** + * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) + * + * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> + * Copyright (c) 2001-2012 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program (in the main directory of the source in the file COPYING); if + * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA + */ + +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/stringify.h> +#include <linux/kernel.h> +#include "ldm.h" +#include "check.h" +#include "msdos.h" + +/** + * ldm_debug/info/error/crit - Output an error message + * @f: A printf format string containing the message + * @...: Variables to substitute into @f + * + * ldm_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. + */ +#ifndef CONFIG_LDM_DEBUG +#define ldm_debug(...) do {} while (0) +#else +#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) +#endif + +#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) +#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) +#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) + +static __printf(3, 4) +void _ldm_printk(const char *level, const char *function, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start (args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%s(): %pV\n", level, function, &vaf); + + va_end(args); +} + +/** + * ldm_parse_hexbyte - Convert a ASCII hex number to a byte + * @src: Pointer to at least 2 characters to convert. + * + * Convert a two character ASCII hex string to a number. 
+ * + * Return: 0-255 Success, the byte was parsed correctly + * -1 Error, an invalid character was supplied + */ +static int ldm_parse_hexbyte (const u8 *src) +{ + unsigned int x; /* For correct wrapping */ + int h; + + /* high part */ + x = h = hex_to_bin(src[0]); + if (h < 0) + return -1; + + /* low part */ + h = hex_to_bin(src[1]); + if (h < 0) + return -1; + + return (x << 4) + h; +} + +/** + * ldm_parse_guid - Convert GUID from ASCII to binary + * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * @dest: Memory block to hold binary GUID (16 bytes) + * + * N.B. The GUID need not be NULL terminated. + * + * Return: 'true' @dest contains binary GUID + * 'false' @dest contents are undefined + */ +static bool ldm_parse_guid (const u8 *src, u8 *dest) +{ + static const int size[] = { 4, 2, 2, 2, 6 }; + int i, j, v; + + if (src[8] != '-' || src[13] != '-' || + src[18] != '-' || src[23] != '-') + return false; + + for (j = 0; j < 5; j++, src++) + for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) + if ((v = ldm_parse_hexbyte (src)) < 0) + return false; + + return true; +} + +/** + * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure + * @data: Raw database PRIVHEAD structure loaded from the device + * @ph: In-memory privhead structure in which to return parsed information + * + * This parses the LDM database PRIVHEAD structure supplied in @data and + * sets up the in-memory privhead structure @ph with the obtained information. + * + * Return: 'true' @ph contains the PRIVHEAD data + * 'false' @ph contents are undefined + */ +static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) +{ + bool is_vista = false; + + BUG_ON(!data || !ph); + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { + ldm_error("Cannot find PRIVHEAD structure. LDM database is" + " corrupt. 
Aborting."); + return false; + } + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); + /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ + if (ph->ver_major == 2 && ph->ver_minor == 12) + is_vista = true; + if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { + ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." + " Aborting.", ph->ver_major, ph->ver_minor); + return false; + } + ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, + ph->ver_minor, is_vista ? "Vista" : "2000/XP"); + if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ + /* Warn the user and continue, carefully. */ + ldm_info("Database is normally %u bytes, it claims to " + "be %llu bytes.", LDM_DB_SIZE, + (unsigned long long)ph->config_size); + } + if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + + ph->logical_disk_size > ph->config_start)) { + ldm_error("PRIVHEAD disk size doesn't match real disk size"); + return false; + } + if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { + ldm_error("PRIVHEAD contains an invalid GUID."); + return false; + } + ldm_debug("Parsed PRIVHEAD successfully."); + return true; +} + +/** + * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure + * @data: Raw database TOCBLOCK structure loaded from the device + * @toc: In-memory toc structure in which to return parsed information + * + * This parses the LDM Database TOCBLOCK (table of contents) structure supplied + * in @data and sets up the in-memory tocblock structure @toc with the obtained + * information. + * + * N.B. The *_start and *_size values returned in @toc are not range-checked. 
+ * + * Return: 'true' @toc contains the TOCBLOCK data + * 'false' @toc contents are undefined + */ +static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) +{ + BUG_ON (!data || !toc); + + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { + ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); + return false; + } + strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); + toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); + + if (strncmp (toc->bitmap1_name, TOC_BITMAP1, + sizeof (toc->bitmap1_name)) != 0) { + ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", + TOC_BITMAP1, toc->bitmap1_name); + return false; + } + strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); + toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); + if (strncmp (toc->bitmap2_name, TOC_BITMAP2, + sizeof (toc->bitmap2_name)) != 0) { + ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", + TOC_BITMAP2, toc->bitmap2_name); + return false; + } + ldm_debug ("Parsed TOCBLOCK successfully."); + return true; +} + +/** + * ldm_parse_vmdb - Read the LDM Database VMDB structure + * @data: Raw database VMDB structure loaded from the device + * @vm: In-memory vmdb structure in which to return parsed information + * + * This parses the LDM Database VMDB structure supplied in @data and sets up + * the in-memory vmdb structure @vm with the obtained information. + * + * N.B. The *_start, *_size and *_seq values will be range-checked later. 
+ * + * Return: 'true' @vm contains VMDB info + * 'false' @vm contents are undefined + */ +static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) +{ + BUG_ON (!data || !vm); + + if (MAGIC_VMDB != get_unaligned_be32(data)) { + ldm_crit ("Cannot find the VMDB, database may be corrupt."); + return false; + } + + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); + if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { + ldm_error ("Expected VMDB version %d.%d, got %d.%d. " + "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); + return false; + } + + vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); + + ldm_debug ("Parsed VMDB successfully."); + return true; +} + +/** + * ldm_compare_privheads - Compare two privhead objects + * @ph1: First privhead + * @ph2: Second privhead + * + * This compares the two privhead structures @ph1 and @ph2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_privheads (const struct privhead *ph1, + const struct privhead *ph2) +{ + BUG_ON (!ph1 || !ph2); + + return ((ph1->ver_major == ph2->ver_major) && + (ph1->ver_minor == ph2->ver_minor) && + (ph1->logical_disk_start == ph2->logical_disk_start) && + (ph1->logical_disk_size == ph2->logical_disk_size) && + (ph1->config_start == ph2->config_start) && + (ph1->config_size == ph2->config_size) && + !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE)); +} + +/** + * ldm_compare_tocblocks - Compare two tocblock objects + * @toc1: First toc + * @toc2: Second toc + * + * This compares the two tocblock structures @toc1 and @toc2. 
+ * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_tocblocks (const struct tocblock *toc1, + const struct tocblock *toc2) +{ + BUG_ON (!toc1 || !toc2); + + return ((toc1->bitmap1_start == toc2->bitmap1_start) && + (toc1->bitmap1_size == toc2->bitmap1_size) && + (toc1->bitmap2_start == toc2->bitmap2_start) && + (toc1->bitmap2_size == toc2->bitmap2_size) && + !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, + sizeof (toc1->bitmap1_name)) && + !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, + sizeof (toc1->bitmap2_name))); +} + +/** + * ldm_validate_privheads - Compare the primary privhead with its backups + * @state: Partition check state including device holding the LDM Database + * @ph1: Memory struct to fill with ph contents + * + * Read and compare all three privheads from disk. + * + * The privheads on disk show the size and location of the main disk area and + * the configuration area (the database). The values are range-checked against + * @hd, which contains the real size of the disk. 
+ * + * Return: 'true' Success + * 'false' Error + */ +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) +{ + static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; + struct privhead *ph[3] = { ph1 }; + Sector sect; + u8 *data; + bool result = false; + long num_sects; + int i; + + BUG_ON (!state || !ph1); + + ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); + ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); + if (!ph[1] || !ph[2]) { + ldm_crit ("Out of memory."); + goto out; + } + + /* off[1 & 2] are relative to ph[0]->config_start */ + ph[0]->config_start = 0; + + /* Read and parse privheads */ + for (i = 0; i < 3; i++) { + data = read_part_sector(state, ph[0]->config_start + off[i], + §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + result = ldm_parse_privhead (data, ph[i]); + put_dev_sector (sect); + if (!result) { + ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ + if (i < 2) + goto out; /* Already logged */ + else + break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ + } + } + + num_sects = state->bdev->bd_inode->i_size >> 9; + + if ((ph[0]->config_start > num_sects) || + ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { + ldm_crit ("Database extends beyond the end of the disk."); + goto out; + } + + if ((ph[0]->logical_disk_start > ph[0]->config_start) || + ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) + > ph[0]->config_start)) { + ldm_crit ("Disk and database overlap."); + goto out; + } + + if (!ldm_compare_privheads (ph[0], ph[1])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + } + /* FIXME ignore this for now + if (!ldm_compare_privheads (ph[0], ph[2])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + }*/ + ldm_debug ("Validated PRIVHEADs successfully."); + result = true; +out: + kfree (ph[1]); + kfree (ph[2]); + return result; +} + +/** + * ldm_validate_tocblocks - Validate the table of 
contents and its backups + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * Find and compare the four tables of contents of the LDM Database stored on + * @state->bdev and return the parsed information into @toc1. + * + * The offsets and sizes of the configs are range-checked against a privhead. + * + * Return: 'true' @toc1 contains validated TOCBLOCK info + * 'false' @toc1 contents are undefined + */ +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; + struct tocblock *tb[4]; + struct privhead *ph; + Sector sect; + u8 *data; + int i, nr_tbs; + bool result = false; + + BUG_ON(!state || !ldb); + ph = &ldb->ph; + tb[0] = &ldb->toc; + tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); + if (!tb[1]) { + ldm_crit("Out of memory."); + goto err; + } + tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); + tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); + /* + * Try to read and parse all four TOCBLOCKs. + * + * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so + * skip any that fail as long as we get at least one valid TOCBLOCK. + */ + for (nr_tbs = i = 0; i < 4; i++) { + data = read_part_sector(state, base + off[i], §); + if (!data) { + ldm_error("Disk read failed for TOCBLOCK %d.", i); + continue; + } + if (ldm_parse_tocblock(data, tb[nr_tbs])) + nr_tbs++; + put_dev_sector(sect); + } + if (!nr_tbs) { + ldm_crit("Failed to find a valid TOCBLOCK."); + goto err; + } + /* Range check the TOCBLOCK against a privhead. */ + if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || + ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > + ph->config_size)) { + ldm_crit("The bitmaps are out of range. Giving up."); + goto err; + } + /* Compare all loaded TOCBLOCKs. 
*/ + for (i = 1; i < nr_tbs; i++) { + if (!ldm_compare_tocblocks(tb[0], tb[i])) { + ldm_crit("TOCBLOCKs 0 and %d do not match.", i); + goto err; + } + } + ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); + result = true; +err: + kfree(tb[1]); + return result; +} + +/** + * ldm_validate_vmdb - Read the VMDB and validate it + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @bdev, of the database + * @ldb: Cache of the database structures + * + * Find the vmdb of the LDM Database stored on @bdev and return the parsed + * information in @ldb. + * + * Return: 'true' @ldb contains validated VBDB info + * 'false' @ldb contents are undefined + */ +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + Sector sect; + u8 *data; + bool result = false; + struct vmdb *vm; + struct tocblock *toc; + + BUG_ON (!state || !ldb); + + vm = &ldb->vm; + toc = &ldb->toc; + + data = read_part_sector(state, base + OFF_VMDB, §); + if (!data) { + ldm_crit ("Disk read failed."); + return false; + } + + if (!ldm_parse_vmdb (data, vm)) + goto out; /* Already logged */ + + /* Are there uncommitted transactions? */ + if (get_unaligned_be16(data + 0x10) != 0x01) { + ldm_crit ("Database is not in a consistent state. Aborting."); + goto out; + } + + if (vm->vblk_offset != 512) + ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); + + /* + * The last_vblkd_seq can be before the end of the vmdb, just make sure + * it is not out of bounds. + */ + if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { + ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " + "Database is corrupt. 
Aborting."); + goto out; + } + + result = true; +out: + put_dev_sector (sect); + return result; +} + + +/** + * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk + * @state: Partition check state including device holding the LDM Database + * + * This function provides a weak test to decide whether the device is a dynamic + * disk or not. It looks for an MS-DOS-style partition table containing at + * least one partition of type 0x42 (formerly SFS, now used by Windows for + * dynamic disks). + * + * N.B. The only possible error can come from the read_part_sector and that is + * only likely to happen if the underlying device is strange. If that IS + * the case we should return zero to let someone else try. + * + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred + */ +static bool ldm_validate_partition_table(struct parsed_partitions *state) +{ + Sector sect; + u8 *data; + struct partition *p; + int i; + bool result = false; + + BUG_ON(!state); + + data = read_part_sector(state, 0, §); + if (!data) { + ldm_info ("Disk read failed."); + return false; + } + + if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) + goto out; + + p = (struct partition*)(data + 0x01BE); + for (i = 0; i < 4; i++, p++) + if (SYS_IND (p) == LDM_PARTITION) { + result = true; + break; + } + + if (result) + ldm_debug ("Found W2K dynamic disk partition type."); + +out: + put_dev_sector (sect); + return result; +} + +/** + * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id + * @ldb: Cache of the database structures + * + * The LDM Database contains a list of all partitions on all dynamic disks. + * The primary PRIVHEAD, at the beginning of the physical disk, tells us + * the GUID of this disk. This function searches for the GUID in a linked + * list of vblk's. 
+ * + * Return: Pointer, A matching vblk was found + * NULL, No match, or an error + */ +static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) +{ + struct list_head *item; + + BUG_ON (!ldb); + + list_for_each (item, &ldb->v_disk) { + struct vblk *v = list_entry (item, struct vblk, list); + if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE)) + return v; + } + + return NULL; +} + +/** + * ldm_create_data_partitions - Create data partitions for this device + * @pp: List of the partitions parsed so far + * @ldb: Cache of the database structures + * + * The database contains ALL the partitions for ALL disk groups, so we need to + * filter out this specific disk. Using the disk's object id, we can find all + * the partitions in the database that belong to this disk. + * + * Add each partition in our database, to the parsed_partitions structure. + * + * N.B. This function creates the partitions in the order it finds partition + * objects in the linked list. + * + * Return: 'true' Partition created + * 'false' Error, probably a range checking problem + */ +static bool ldm_create_data_partitions (struct parsed_partitions *pp, + const struct ldmdb *ldb) +{ + struct list_head *item; + struct vblk *vb; + struct vblk *disk; + struct vblk_part *part; + int part_num = 1; + + BUG_ON (!pp || !ldb); + + disk = ldm_get_disk_objid (ldb); + if (!disk) { + ldm_crit ("Can't find the ID of this disk in the database."); + return false; + } + + strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); + + /* Create the data partitions */ + list_for_each (item, &ldb->v_part) { + vb = list_entry (item, struct vblk, list); + part = &vb->vblk.part; + + if (part->disk_id != disk->obj_id) + continue; + + put_partition (pp, part_num, ldb->ph.logical_disk_start + + part->start, part->size); + part_num++; + } + + strlcat(pp->pp_buf, "\n", PAGE_SIZE); + return true; +} + + +/** + * ldm_relative - Calculate the next relative offset + * @buffer: Block of data being worked on + * @buflen: Size 
of the block of data + * @base: Size of the previous fixed width fields + * @offset: Cumulative size of the previous variable-width fields + * + * Because many of the VBLK fields are variable-width, it's necessary + * to calculate each offset based on the previous one and the length + * of the field it pointed to. + * + * Return: -1 Error, the calculated offset exceeded the size of the buffer + * n OK, a range-checked offset into buffer + */ +static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) +{ + + base += offset; + if (!buffer || offset < 0 || base > buflen) { + if (!buffer) + ldm_error("!buffer"); + if (offset < 0) + ldm_error("offset (%d) < 0", offset); + if (base > buflen) + ldm_error("base (%d) > buflen (%d)", base, buflen); + return -1; + } + if (base + buffer[base] >= buflen) { + ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, + buffer[base], buflen); + return -1; + } + return buffer[base] + offset + 1; +} + +/** + * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order + * @block: Pointer to the variable-width number to convert + * + * Large numbers in the LDM Database are often stored in a packed format. Each + * number is prefixed by a one byte width marker. All numbers in the database + * are stored in big-endian byte order. This function reads one of these + * numbers and returns the result + * + * N.B. This function DOES NOT perform any range checking, though the most + * it will read is eight bytes. 
+ * + * Return: n A number + * 0 Zero, or an error occurred + */ +static u64 ldm_get_vnum (const u8 *block) +{ + u64 tmp = 0; + u8 length; + + BUG_ON (!block); + + length = *block++; + + if (length && length <= 8) + while (length--) + tmp = (tmp << 8) | *block++; + else + ldm_error ("Illegal length %d.", length); + + return tmp; +} + +/** + * ldm_get_vstr - Read a length-prefixed string into a buffer + * @block: Pointer to the length marker + * @buffer: Location to copy string to + * @buflen: Size of the output buffer + * + * Many of the strings in the LDM Database are not NULL terminated. Instead + * they are prefixed by a one byte length marker. This function copies one of + * these strings into a buffer. + * + * N.B. This function DOES NOT perform any range checking on the input. + * If the buffer is too small, the output will be truncated. + * + * Return: 0, Error and @buffer contents are undefined + * n, String length in characters (excluding NULL) + * buflen-1, String was truncated. + */ +static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) +{ + int length; + + BUG_ON (!block || !buffer); + + length = block[0]; + if (length >= buflen) { + ldm_error ("Truncating string %d -> %d.", length, buflen); + length = buflen - 1; + } + memcpy (buffer, block + 1, length); + buffer[length] = 0; + return length; +} + + +/** + * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Component object (version 3) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Component VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; + struct vblk_comp *comp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); + r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); + r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); + + if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { + r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); + r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); + len = r_cols; + } else { + r_stripe = 0; + r_cols = 0; + len = r_parent; + } + if (len < 0) + return false; + + len += VBLK_SIZE_CMP3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + comp = &vb->vblk.comp; + ldm_get_vstr (buffer + 0x18 + r_name, comp->state, + sizeof (comp->state)); + comp->type = buffer[0x18 + r_vstate]; + comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); + comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); + comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; + + return true; +} + +/** + * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 3) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + + if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); + r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_diskid; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, + sizeof (dgrp->disk_id)); + return true; +} + +/** + * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 4) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + char buf[64]; + int r_objid, r_name, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + + if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); + r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_name; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + + ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); + return true; +} + +/** + * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 3) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_altname, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); + len = r_altname; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, + sizeof (disk->alt_name)); + if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) + return false; + + return true; +} + +/** + * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 4) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + len = r_name; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE); + return true; +} + +/** + * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Partition object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Partition VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; + struct vblk_part *part; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x34, r_name); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + r_parent = ldm_relative(buffer, buflen, 0x34, r_size); + if (r_parent < 0) { + ldm_error("r_parent %d < 0", r_parent); + return false; + } + r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); + if (r_diskid < 0) { + ldm_error("r_diskid %d < 0", r_diskid); + return false; + } + if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { + r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); + if 
(r_index < 0) { + ldm_error("r_index %d < 0", r_index); + return false; + } + len = r_index; + } else { + r_index = 0; + len = r_diskid; + } + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_PRT3; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + part = &vb->vblk.part; + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); + part->size = ldm_get_vnum(buffer + 0x34 + r_name); + part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); + part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); + if (vb->flags & VBLK_FLAG_PART_INDEX) + part->partnum = buffer[0x35 + r_diskid]; + else + part->partnum = 0; + return true; +} + +/** + * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Volume object (version 5) into a vblk structure. 
+ * + * Return: 'true' @vb contains a Volume VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; + int r_id1, r_id2, r_size2, r_drive, len; + struct vblk_volu *volu; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); + if (r_vtype < 0) { + ldm_error("r_vtype %d < 0", r_vtype); + return false; + } + r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); + if (r_disable_drive_letter < 0) { + ldm_error("r_disable_drive_letter %d < 0", + r_disable_drive_letter); + return false; + } + r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); + if (r_child < 0) { + ldm_error("r_child %d < 0", r_child); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x3D, r_child); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { + r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); + if (r_id1 < 0) { + ldm_error("r_id1 %d < 0", r_id1); + return false; + } + } else + r_id1 = r_size; + if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { + r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); + if (r_id2 < 0) { + ldm_error("r_id2 %d < 0", r_id2); + return false; + } + } else + r_id2 = r_id1; + if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { + r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); + if (r_size2 < 0) { + ldm_error("r_size2 %d < 0", r_size2); + return false; + } + } else + r_size2 = r_id2; + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); + if (r_drive < 0) { + ldm_error("r_drive %d < 0", 
r_drive); + return false; + } + } else + r_drive = r_size2; + len = r_drive; + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_VOL5; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + volu = &vb->vblk.volu; + ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, + sizeof(volu->volume_type)); + memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, + sizeof(volu->volume_state)); + volu->size = ldm_get_vnum(buffer + 0x3D + r_child); + volu->partition_type = buffer[0x41 + r_size]; + memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, + sizeof(volu->drive_hint)); + } + return true; +} + +/** + * ldm_parse_vblk - Read a raw VBLK object into a vblk structure + * @buf: Block of data being worked on + * @len: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK object into a vblk structure. This function just reads the + * information common to all VBLK types, then delegates the rest of the work to + * helper functions: ldm_parse_*. 
+ * + * Return: 'true' @vb contains a VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) +{ + bool result = false; + int r_objid; + + BUG_ON (!buf || !vb); + + r_objid = ldm_relative (buf, len, 0x18, 0); + if (r_objid < 0) { + ldm_error ("VBLK header is corrupt."); + return false; + } + + vb->flags = buf[0x12]; + vb->type = buf[0x13]; + vb->obj_id = ldm_get_vnum (buf + 0x18); + ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); + + switch (vb->type) { + case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; + case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; + case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; + case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; + case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; + case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; + case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; + } + + if (result) + ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", + (unsigned long long) vb->obj_id, vb->type); + else + ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", + (unsigned long long) vb->obj_id, vb->type); + + return result; +} + + +/** + * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database + * @data: Raw VBLK to add to the database + * @len: Size of the raw VBLK + * @ldb: Cache of the database structures + * + * The VBLKs are sorted into categories. Partitions are also sorted by offset. + * + * N.B. This function does not check the validity of the VBLKs. 
+ * + * Return: 'true' The VBLK was added + * 'false' An error occurred + */ +static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) +{ + struct vblk *vb; + struct list_head *item; + + BUG_ON (!data || !ldb); + + vb = kmalloc (sizeof (*vb), GFP_KERNEL); + if (!vb) { + ldm_crit ("Out of memory."); + return false; + } + + if (!ldm_parse_vblk (data, len, vb)) { + kfree(vb); + return false; /* Already logged */ + } + + /* Put vblk into the correct list. */ + switch (vb->type) { + case VBLK_DGR3: + case VBLK_DGR4: + list_add (&vb->list, &ldb->v_dgrp); + break; + case VBLK_DSK3: + case VBLK_DSK4: + list_add (&vb->list, &ldb->v_disk); + break; + case VBLK_VOL5: + list_add (&vb->list, &ldb->v_volu); + break; + case VBLK_CMP3: + list_add (&vb->list, &ldb->v_comp); + break; + case VBLK_PRT3: + /* Sort by the partition's start sector. */ + list_for_each (item, &ldb->v_part) { + struct vblk *v = list_entry (item, struct vblk, list); + if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && + (v->vblk.part.start > vb->vblk.part.start)) { + list_add_tail (&vb->list, &v->list); + return true; + } + } + list_add_tail (&vb->list, &ldb->v_part); + break; + } + return true; +} + +/** + * ldm_frag_add - Add a VBLK fragment to a list + * @data: Raw fragment to be added to the list + * @size: Size of the raw fragment + * @frags: Linked list of VBLK fragments + * + * Fragmented VBLKs may not be consecutive in the database, so they are placed + * in a list so they can be pieced together later. 
+ * + * Return: 'true' Success, the VBLK was added to the list + * 'false' Error, a problem occurred + */ +static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) +{ + struct frag *f; + struct list_head *item; + int rec, num, group; + + BUG_ON (!data || !frags); + + if (size < 2 * VBLK_SIZE_HEAD) { + ldm_error("Value of size is to small."); + return false; + } + + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); + if ((num < 1) || (num > 4)) { + ldm_error ("A VBLK claims to have %d parts.", num); + return false; + } + if (rec >= num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); + return false; + } + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + if (f->group == group) + goto found; + } + + f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); + if (!f) { + ldm_crit ("Out of memory."); + return false; + } + + f->group = group; + f->num = num; + f->rec = rec; + f->map = 0xFF << num; + + list_add_tail (&f->list, frags); +found: + if (rec >= f->num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); + return false; + } + if (f->map & (1 << rec)) { + ldm_error ("Duplicate VBLK, part %d.", rec); + f->map &= 0x7F; /* Mark the group as broken */ + return false; + } + f->map |= (1 << rec); + if (!rec) + memcpy(f->data, data, VBLK_SIZE_HEAD); + data += VBLK_SIZE_HEAD; + size -= VBLK_SIZE_HEAD; + memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size); + return true; +} + +/** + * ldm_frag_free - Free a linked list of VBLK fragments + * @list: Linked list of fragments + * + * Free a linked list of VBLK fragments + * + * Return: none + */ +static void ldm_frag_free (struct list_head *list) +{ + struct list_head *item, *tmp; + + BUG_ON (!list); + + list_for_each_safe (item, tmp, list) + kfree (list_entry (item, struct frag, list)); +} + +/** + * ldm_frag_commit - Validate fragmented VBLKs and add them to the database 
+ * @frags: Linked list of VBLK fragments + * @ldb: Cache of the database structures + * + * Now that all the fragmented VBLKs have been collected, they must be added to + * the database for later use. + * + * Return: 'true' All the fragments were added successfully + * 'false' One or more of the fragments were invalid + */ +static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) +{ + struct frag *f; + struct list_head *item; + + BUG_ON (!frags || !ldb); + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + + if (f->map != 0xFF) { + ldm_error ("VBLK group %d is incomplete (0x%02x).", + f->group, f->map); + return false; + } + + if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) + return false; /* Already logged */ + } + return true; +} + +/** + * ldm_get_vblks - Read the on-disk database of VBLKs into memory + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * To use the information from the VBLKs, they need to be read from the disk, + * unpacked and validated. We cache them in @ldb according to their type. 
+ * + * Return: 'true' All the VBLKs were read successfully + * 'false' An error occurred + */ +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) +{ + int size, perbuf, skip, finish, s, v, recs; + u8 *data = NULL; + Sector sect; + bool result = false; + LIST_HEAD (frags); + + BUG_ON(!state || !ldb); + + size = ldb->vm.vblk_size; + perbuf = 512 / size; + skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ + finish = (size * ldb->vm.last_vblk_seq) >> 9; + + for (s = skip; s < finish; s++) { /* For each sector */ + data = read_part_sector(state, base + OFF_VMDB + s, &sect); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + + for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ + if (MAGIC_VBLK != get_unaligned_be32(data)) { + ldm_error ("Expected to find a VBLK."); + goto out; + } + + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ + if (recs == 1) { + if (!ldm_ldmdb_add (data, size, ldb)) + goto out; /* Already logged */ + } else if (recs > 1) { + if (!ldm_frag_add (data, size, &frags)) + goto out; /* Already logged */ + } + /* else Record is not in use, ignore it. */ + } + put_dev_sector (sect); + data = NULL; + } + + result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ +out: + if (data) + put_dev_sector (sect); + ldm_frag_free (&frags); + + return result; +} + +/** + * ldm_free_vblks - Free a linked list of vblk's + * @lh: Head of a linked list of struct vblk + * + * Free a list of vblk's and free the memory used to maintain the list. 
+ * + * Return: none + */ +static void ldm_free_vblks (struct list_head *lh) +{ + struct list_head *item, *tmp; + + BUG_ON (!lh); + + list_for_each_safe (item, tmp, lh) + kfree (list_entry (item, struct vblk, list)); +} + + +/** + * ldm_partition - Find out whether a device is a dynamic disk and handle it + * @state: Partition check state including device holding the LDM Database + * + * This determines whether the device @bdev is a dynamic disk and if so creates + * the partitions necessary in the gendisk structure pointed to by @hd. + * + * We create a dummy device 1, which contains the LDM database, and then create + * each partition described by the LDM database in sequence as devices 2+. For + * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, + * and so on: the actual data containing partitions. + * + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk + * -1 An error occurred before enough information had been read + * Or @state->bdev is a dynamic disk, but it may be corrupted + */ +int ldm_partition(struct parsed_partitions *state) +{ + struct ldmdb *ldb; + unsigned long base; + int result = -1; + + BUG_ON(!state); + + /* Look for signs of a Dynamic Disk */ + if (!ldm_validate_partition_table(state)) + return 0; + + ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); + if (!ldb) { + ldm_crit ("Out of memory."); + goto out; + } + + /* Parse and check privheads. */ + if (!ldm_validate_privheads(state, &ldb->ph)) + goto out; /* Already logged */ + + /* All further references are relative to base (database start). */ + base = ldb->ph.config_start; + + /* Parse and check tocs and vmdb. 
*/ + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) + goto out; /* Already logged */ + + /* Initialize vblk lists in ldmdb struct */ + INIT_LIST_HEAD (&ldb->v_dgrp); + INIT_LIST_HEAD (&ldb->v_disk); + INIT_LIST_HEAD (&ldb->v_volu); + INIT_LIST_HEAD (&ldb->v_comp); + INIT_LIST_HEAD (&ldb->v_part); + + if (!ldm_get_vblks(state, base, ldb)) { + ldm_crit ("Failed to read the VBLKs from the database."); + goto cleanup; + } + + /* Finally, create the data partition devices. */ + if (ldm_create_data_partitions(state, ldb)) { + ldm_debug ("Parsed LDM database successfully."); + result = 1; + } + /* else Already logged */ + +cleanup: + ldm_free_vblks (&ldb->v_dgrp); + ldm_free_vblks (&ldb->v_disk); + ldm_free_vblks (&ldb->v_volu); + ldm_free_vblks (&ldb->v_comp); + ldm_free_vblks (&ldb->v_part); +out: + kfree (ldb); + return result; +} diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h new file mode 100644 index 00000000000..374242c0971 --- /dev/null +++ b/block/partitions/ldm.h @@ -0,0 +1,215 @@ +/** + * ldm - Part of the Linux-NTFS project. + * + * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS source + * in the file COPYING); if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _FS_PT_LDM_H_ +#define _FS_PT_LDM_H_ + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/genhd.h> +#include <linux/fs.h> +#include <asm/unaligned.h> +#include <asm/byteorder.h> + +struct parsed_partitions; + +/* Magic numbers in CPU format. */ +#define MAGIC_VMDB 0x564D4442 /* VMDB */ +#define MAGIC_VBLK 0x56424C4B /* VBLK */ +#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ +#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ + +/* The defined vblk types. */ +#define VBLK_VOL5 0x51 /* Volume, version 5 */ +#define VBLK_CMP3 0x32 /* Component, version 3 */ +#define VBLK_PRT3 0x33 /* Partition, version 3 */ +#define VBLK_DSK3 0x34 /* Disk, version 3 */ +#define VBLK_DSK4 0x44 /* Disk, version 4 */ +#define VBLK_DGR3 0x35 /* Disk Group, version 3 */ +#define VBLK_DGR4 0x45 /* Disk Group, version 4 */ + +/* vblk flags indicating extra information will be present */ +#define VBLK_FLAG_COMP_STRIPE 0x10 +#define VBLK_FLAG_PART_INDEX 0x08 +#define VBLK_FLAG_DGR3_IDS 0x08 +#define VBLK_FLAG_DGR4_IDS 0x08 +#define VBLK_FLAG_VOLU_ID1 0x08 +#define VBLK_FLAG_VOLU_ID2 0x20 +#define VBLK_FLAG_VOLU_SIZE 0x80 +#define VBLK_FLAG_VOLU_DRIVE 0x02 + +/* size of a vblk's static parts */ +#define VBLK_SIZE_HEAD 16 +#define VBLK_SIZE_CMP3 22 /* Name and version */ +#define VBLK_SIZE_DGR3 12 +#define VBLK_SIZE_DGR4 44 +#define VBLK_SIZE_DSK3 12 +#define VBLK_SIZE_DSK4 45 +#define VBLK_SIZE_PRT3 28 +#define VBLK_SIZE_VOL5 58 + +/* component types */ +#define COMP_STRIPE 0x01 /* Stripe-set */ +#define COMP_BASIC 0x02 /* Basic disk */ +#define COMP_RAID 0x03 /* Raid-set */ + +/* Other constants. */ +#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). 
*/ + +#define OFF_PRIV1 6 /* Offset of the first privhead + relative to the start of the + device in sectors */ + +/* Offsets to structures within the LDM Database in sectors. */ +#define OFF_PRIV2 1856 /* Backup private headers. */ +#define OFF_PRIV3 2047 + +#define OFF_TOCB1 1 /* Tables of contents. */ +#define OFF_TOCB2 2 +#define OFF_TOCB3 2045 +#define OFF_TOCB4 2046 + +#define OFF_VMDB 17 /* List of partitions. */ + +#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ + +#define TOC_BITMAP1 "config" /* Names of the two defined */ +#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ + +/* Borrowed from msdos.c */ +#define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) + +struct frag { /* VBLK Fragment handling */ + struct list_head list; + u32 group; + u8 num; /* Total number of records */ + u8 rec; /* This is record number n */ + u8 map; /* Which portions are in use */ + u8 data[0]; +}; + +/* In memory LDM database structures. */ + +#define GUID_SIZE 16 + +struct privhead { /* Offsets and sizes are in sectors. */ + u16 ver_major; + u16 ver_minor; + u64 logical_disk_start; + u64 logical_disk_size; + u64 config_start; + u64 config_size; + u8 disk_id[GUID_SIZE]; +}; + +struct tocblock { /* We have exactly two bitmaps. 
+ */ + u8 bitmap1_name[16]; + u64 bitmap1_start; + u64 bitmap1_size; + u8 bitmap2_name[16]; + u64 bitmap2_start; + u64 bitmap2_size; +}; + +struct vmdb { /* VMDB: The database header */ + u16 ver_major; + u16 ver_minor; + u32 vblk_size; + u32 vblk_offset; + u32 last_vblk_seq; +}; + +struct vblk_comp { /* VBLK Component */ + u8 state[16]; + u64 parent_id; + u8 type; + u8 children; + u16 chunksize; +}; + +struct vblk_dgrp { /* VBLK Disk Group */ + u8 disk_id[64]; +}; + +struct vblk_disk { /* VBLK Disk */ + u8 disk_id[GUID_SIZE]; + u8 alt_name[128]; +}; + +struct vblk_part { /* VBLK Partition */ + u64 start; + u64 size; /* start, size and vol_off in sectors */ + u64 volume_offset; + u64 parent_id; + u64 disk_id; + u8 partnum; +}; + +struct vblk_volu { /* VBLK Volume */ + u8 volume_type[16]; + u8 volume_state[16]; + u8 guid[16]; + u8 drive_hint[4]; + u64 size; + u8 partition_type; +}; + +struct vblk_head { /* VBLK standard header */ + u32 group; + u16 rec; + u16 nrec; +}; + +struct vblk { /* Generalised VBLK */ + u8 name[64]; + u64 obj_id; + u32 sequence; + u8 flags; + u8 type; + union { + struct vblk_comp comp; + struct vblk_dgrp dgrp; + struct vblk_disk disk; + struct vblk_part part; + struct vblk_volu volu; + } vblk; + struct list_head list; +}; + +struct ldmdb { /* Cache of the database */ + struct privhead ph; + struct tocblock toc; + struct vmdb vm; + struct list_head v_dgrp; + struct list_head v_disk; + struct list_head v_volu; + struct list_head v_comp; + struct list_head v_part; +}; + +int ldm_partition(struct parsed_partitions *state); + +#endif /* _FS_PT_LDM_H_ */ + diff --git a/block/partitions/mac.c b/block/partitions/mac.c new file mode 100644 index 00000000000..76d8ba6379a --- /dev/null +++ b/block/partitions/mac.c @@ -0,0 +1,138 @@ +/* + * fs/partitions/mac.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include <linux/ctype.h> +#include "check.h" +#include 
"mac.h" + +#ifdef CONFIG_PPC_PMAC +#include <asm/machdep.h> +extern void note_bootable_part(dev_t dev, int part, int goodness); +#endif + +/* + * Code to understand MacOS partition tables. + */ + +static inline void mac_fix_string(char *stg, int len) +{ + int i; + + for (i = len - 1; i >= 0 && stg[i] == ' '; i--) + stg[i] = 0; +} + +int mac_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + int slot, blocks_in_map; + unsigned secsize; +#ifdef CONFIG_PPC_PMAC + int found_root = 0; + int found_root_goodness = 0; +#endif + struct mac_partition *part; + struct mac_driver_desc *md; + + /* Get 0th block and look at the first partition map entry. */ + md = read_part_sector(state, 0, &sect); + if (!md) + return -1; + if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { + put_dev_sector(sect); + return 0; + } + secsize = be16_to_cpu(md->block_size); + put_dev_sector(sect); + data = read_part_sector(state, secsize/512, &sect); + if (!data) + return -1; + part = (struct mac_partition *) (data + secsize%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { + put_dev_sector(sect); + return 0; /* not a MacOS disk */ + } + blocks_in_map = be32_to_cpu(part->map_count); + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + + if (blocks_in_map >= state->limit) + blocks_in_map = state->limit - 1; + + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; + put_dev_sector(sect); + data = read_part_sector(state, pos/512, &sect); + if (!data) + return -1; + part = (struct mac_partition *) (data + pos%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) + break; + put_partition(state, slot, + be32_to_cpu(part->start_block) * (secsize/512), + be32_to_cpu(part->block_count) * (secsize/512)); + + if (!strnicmp(part->type, "Linux_RAID", 10)) + state->parts[slot].flags = ADDPART_FLAG_RAID; +#ifdef CONFIG_PPC_PMAC + /* + * If 
this is the first bootable partition, tell the + * setup code, in case it wants to make this the root. + */ + if (machine_is(powermac)) { + int goodness = 0; + + mac_fix_string(part->processor, 16); + mac_fix_string(part->name, 32); + mac_fix_string(part->type, 32); + + if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) + && strcasecmp(part->processor, "powerpc") == 0) + goodness++; + + if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 + || (strnicmp(part->type, "Linux", 5) == 0 + && strcasecmp(part->type, "Linux_swap") != 0)) { + int i, l; + + goodness++; + l = strlen(part->name); + if (strcmp(part->name, "/") == 0) + goodness++; + for (i = 0; i <= l - 4; ++i) { + if (strnicmp(part->name + i, "root", + 4) == 0) { + goodness += 2; + break; + } + } + if (strnicmp(part->name, "swap", 4) == 0) + goodness--; + } + + if (goodness > found_root_goodness) { + found_root = slot; + found_root_goodness = goodness; + } + } +#endif /* CONFIG_PPC_PMAC */ + } +#ifdef CONFIG_PPC_PMAC + if (found_root_goodness) + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); +#endif + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/block/partitions/mac.h b/block/partitions/mac.h new file mode 100644 index 00000000000..3c7d9843638 --- /dev/null +++ b/block/partitions/mac.h @@ -0,0 +1,44 @@ +/* + * fs/partitions/mac.h + */ + +#define MAC_PARTITION_MAGIC 0x504d + +/* type field value for A/UX or other Unix partitions */ +#define APPLE_AUX_TYPE "Apple_UNIX_SVR2" + +struct mac_partition { + __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ + __be16 res1; + __be32 map_count; /* # blocks in partition map */ + __be32 start_block; /* absolute starting block # of partition */ + __be32 block_count; /* number of blocks in partition */ + char name[32]; /* partition name */ + char type[32]; /* string type description */ + __be32 data_start; /* rel block # of first data block */ + __be32 data_count; /* number of data blocks 
*/ + __be32 status; /* partition status bits */ + __be32 boot_start; + __be32 boot_size; + __be32 boot_load; + __be32 boot_load2; + __be32 boot_entry; + __be32 boot_entry2; + __be32 boot_cksum; + char processor[16]; /* identifies ISA of boot */ + /* there is more stuff after this that we don't need */ +}; + +#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */ + +#define MAC_DRIVER_MAGIC 0x4552 + +/* Driver descriptor structure, in block 0 */ +struct mac_driver_desc { + __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ + __be16 block_size; + __be32 block_count; + /* ... more stuff */ +}; + +int mac_partition(struct parsed_partitions *state); diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c new file mode 100644 index 00000000000..9123f250b42 --- /dev/null +++ b/block/partitions/msdos.c @@ -0,0 +1,579 @@ +/* + * fs/partitions/msdos.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * + * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug + * in the early extended-partition checks and added DM partitions + * + * Support for DiskManager v6.0x added by Mark Lord, + * with information provided by OnTrack. This now works for linux fdisk + * and LILO, as well as loadlin and bootln. Note that disks other than + * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). + * + * More flexible handling of extended partitions - aeb, 950831 + * + * Check partition table on IDE disks for common CHS translations + * + * Re-organised Feb 1998 Russell King + */ +#include <linux/msdos_fs.h> + +#include "check.h" +#include "msdos.h" +#include "efi.h" +#include "aix.h" + +/* + * Many architectures don't like unaligned accesses, while + * the nr_sects and start_sect partition table entries are + * at a 2 (mod 4) address. 
+ */ +#include <asm/unaligned.h> + +#define SYS_IND(p) get_unaligned(&p->sys_ind) + +static inline sector_t nr_sects(struct partition *p) +{ + return (sector_t)get_unaligned_le32(&p->nr_sects); +} + +static inline sector_t start_sect(struct partition *p) +{ + return (sector_t)get_unaligned_le32(&p->start_sect); +} + +static inline int is_extended_partition(struct partition *p) +{ + return (SYS_IND(p) == DOS_EXTENDED_PARTITION || + SYS_IND(p) == WIN98_EXTENDED_PARTITION || + SYS_IND(p) == LINUX_EXTENDED_PARTITION); +} + +#define MSDOS_LABEL_MAGIC1 0x55 +#define MSDOS_LABEL_MAGIC2 0xAA + +static inline int +msdos_magic_present(unsigned char *p) +{ + return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); +} + +/* Value is EBCDIC 'IBMA' */ +#define AIX_LABEL_MAGIC1 0xC9 +#define AIX_LABEL_MAGIC2 0xC2 +#define AIX_LABEL_MAGIC3 0xD4 +#define AIX_LABEL_MAGIC4 0xC1 +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) +{ + struct partition *pt = (struct partition *) (p + 0x1be); + Sector sect; + unsigned char *d; + int slot, ret = 0; + + if (!(p[0] == AIX_LABEL_MAGIC1 && + p[1] == AIX_LABEL_MAGIC2 && + p[2] == AIX_LABEL_MAGIC3 && + p[3] == AIX_LABEL_MAGIC4)) + return 0; + /* Assume the partition table is valid if Linux partitions exists */ + for (slot = 1; slot <= 4; slot++, pt++) { + if (pt->sys_ind == LINUX_SWAP_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) + return 0; + } + d = read_part_sector(state, 7, &sect); + if (d) { + if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') + ret = 1; + put_dev_sector(sect); + } + return ret; +} + +static void set_info(struct parsed_partitions *state, int slot, + u32 disksig) +{ + struct partition_meta_info *info = &state->parts[slot].info; + + snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, + slot); + info->volname[0] = 0; + state->parts[slot].has_info =
true; +} + +/* + * Create devices for each logical partition in an extended partition. + * The logical partitions form a linked list, with each entry being + * a partition table with two entries. The first entry + * is the real data partition (with a start relative to the partition + * table start). The second is a pointer to the next logical partition + * (with a start relative to the entire extended partition). + * We do not create a Linux partition for the partition tables, but + * only for the actual data partitions. + */ + +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size, + u32 disksig) +{ + struct partition *p; + Sector sect; + unsigned char *data; + sector_t this_sector, this_size; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + int loopct = 0; /* number of links followed + without finding a data partition */ + int i; + + this_sector = first_sector; + this_size = first_size; + + while (1) { + if (++loopct > 100) + return; + if (state->next == state->limit) + return; + data = read_part_sector(state, this_sector, &sect); + if (!data) + return; + + if (!msdos_magic_present(data + 510)) + goto done; + + p = (struct partition *) (data + 0x1be); + + /* + * Usually, the first entry is the real data partition, + * the 2nd entry is the next extended partition, or empty, + * and the 3rd and 4th entries are unused. + * However, DRDOS sometimes has the extended partition as + * the first entry (when the data partition is empty), + * and OS/2 seems to use all four entries.
+ */ + + /* + * First process the data partition(s) + */ + for (i=0; i<4; i++, p++) { + sector_t offs, size, next; + if (!nr_sects(p) || is_extended_partition(p)) + continue; + + /* Check the 3rd and 4th entries - + these sometimes contain random garbage */ + offs = start_sect(p)*sector_size; + size = nr_sects(p)*sector_size; + next = this_sector + offs; + if (i >= 2) { + if (offs + size > this_size) + continue; + if (next < first_sector) + continue; + if (next + size > first_sector + first_size) + continue; + } + + put_partition(state, state->next, next, size); + set_info(state, state->next, disksig); + if (SYS_IND(p) == LINUX_RAID_PARTITION) + state->parts[state->next].flags = ADDPART_FLAG_RAID; + loopct = 0; + if (++state->next == state->limit) + goto done; + } + /* + * Next, process the (first) extended partition, if present. + * (So far, there seems to be no reason to make + * parse_extended() recursive and allow a tree + * of extended partitions.) + * It should be a link to the next logical partition. + */ + p -= 4; + for (i=0; i<4; i++, p++) + if (nr_sects(p) && is_extended_partition(p)) + break; + if (i == 4) + goto done; /* nothing left to do */ + + this_sector = first_sector + start_sect(p) * sector_size; + this_size = nr_sects(p) * sector_size; + put_dev_sector(sect); + } +done: + put_dev_sector(sect); +} + +/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also + indicates linux swap. Be careful before believing this is Solaris. 
*/ + +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_SOLARIS_X86_PARTITION + Sector sect; + struct solaris_x86_vtoc *v; + int i; + short max_nparts; + + v = read_part_sector(state, offset + 1, &sect); + if (!v) + return; + if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + if (le32_to_cpu(v->v_version) != 1) { + char tmp[64]; + + snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + return; + } + /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ + max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; + for (i=0; i<max_nparts && state->next<state->limit; i++) { + struct solaris_x86_slice *s = &v->v_slice[i]; + char tmp[3 + 10 + 1 + 1]; + + if (s->s_size == 0) + continue; + snprintf(tmp, sizeof(tmp), " [s%d]", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* solaris partitions are relative to current MS-DOS + * one; must add the offset of the current partition */ + put_partition(state, state->next++, + le32_to_cpu(s->s_start)+offset, + le32_to_cpu(s->s_size)); + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +#if defined(CONFIG_BSD_DISKLABEL) +/* + * Create devices for BSD partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information.
+ */ +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) +{ + Sector sect; + struct bsd_disklabel *l; + struct bsd_partition *p; + char tmp[64]; + + l = read_part_sector(state, offset + 1, &sect); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { + put_dev_sector(sect); + return; + } + + snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + if (le16_to_cpu(l->d_npartitions) < max_partitions) + max_partitions = le16_to_cpu(l->d_npartitions); + for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { + sector_t bsd_start, bsd_size; + + if (state->next == state->limit) + break; + if (p->p_fstype == BSD_FS_UNUSED) + continue; + bsd_start = le32_to_cpu(p->p_offset); + bsd_size = le32_to_cpu(p->p_size); + if (offset == bsd_start && size == bsd_size) + /* full parent partition, we have it already */ + continue; + if (offset > bsd_start || offset+size < bsd_start+bsd_size) { + strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); + continue; + } + put_partition(state, state->next++, bsd_start, bsd_size); + } + put_dev_sector(sect); + if (le16_to_cpu(l->d_npartitions) > max_partitions) { + snprintf(tmp, sizeof(tmp), " (ignored %d more)", + le16_to_cpu(l->d_npartitions) - max_partitions); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +} +#endif + +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); +#endif +} + +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); +#endif +} + +static void parse_openbsd(struct parsed_partitions *state, +
sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); +#endif +} + +/* + * Create devices for Unixware partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_UNIXWARE_DISKLABEL + Sector sect; + struct unixware_disklabel *l; + struct unixware_slice *p; + + l = read_part_sector(state, offset + 29, &sect); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || + le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + p = &l->vtoc.v_slice[1]; + /* I omit the 0th slice as it is the same as whole disk. */ + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { + if (state->next == state->limit) + break; + + if (p->s_label != UNIXWARE_FS_UNUSED) + put_partition(state, state->next++, + le32_to_cpu(p->start_sect), + le32_to_cpu(p->nr_sects)); + p++; + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +/* + * Minix 2.0.0/2.0.2 subpartition support. + * Anand Krishnamurthy <anandk@wiproge.med.ge.com> + * Rajeev V. Pillai <rajeevvp@yahoo.com> + */ +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_MINIX_SUBPARTITION + Sector sect; + unsigned char *data; + struct partition *p; + int i; + + data = read_part_sector(state, offset, &sect); + if (!data) + return; + + p = (struct partition *)(data + 0x1be); + + /* The first sector of a Minix partition can have either + * a secondary MBR describing its subpartitions, or + * the normal boot sector.
*/ + if (msdos_magic_present (data + 510) && + SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ + char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { + if (state->next == state->limit) + break; + /* add each partition in use */ + if (SYS_IND(p) == MINIX_PARTITION) + put_partition(state, state->next++, + start_sect(p), nr_sects(p)); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } + put_dev_sector(sect); +#endif /* CONFIG_MINIX_SUBPARTITION */ +} + +static struct { + unsigned char id; + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); +} subtypes[] = { + {FREEBSD_PARTITION, parse_freebsd}, + {NETBSD_PARTITION, parse_netbsd}, + {OPENBSD_PARTITION, parse_openbsd}, + {MINIX_PARTITION, parse_minix}, + {UNIXWARE_PARTITION, parse_unixware}, + {SOLARIS_X86_PARTITION, parse_solaris_x86}, + {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, + {0, NULL}, +}; + +int msdos_partition(struct parsed_partitions *state) +{ + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + Sector sect; + unsigned char *data; + struct partition *p; + struct fat_boot_sector *fb; + int slot; + u32 disksig; + + data = read_part_sector(state, 0, &sect); + if (!data) + return -1; + + /* + * Note order! (some AIX disks, e.g. unbootable kind, + * have no MSDOS 55aa) + */ + if (aix_magic_present(state, data)) { + put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else + strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); + return 0; +#endif + } + + if (!msdos_magic_present(data + 510)) { + put_dev_sector(sect); + return 0; + } + + /* + * Now that the 55aa signature is present, this is probably + * either the boot sector of a FAT filesystem or a DOS-type + * partition table. Reject this in case the boot indicator + * is not 0 or 0x80.
+ */ + p = (struct partition *) (data + 0x1be); + for (slot = 1; slot <= 4; slot++, p++) { + if (p->boot_ind != 0 && p->boot_ind != 0x80) { + /* + * Even without a valid boot inidicator value + * its still possible this is valid FAT filesystem + * without a partition table. + */ + fb = (struct fat_boot_sector *) data; + if (slot == 1 && fb->reserved && fb->fats + && fat_valid_media(fb->media)) { + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; + } else { + put_dev_sector(sect); + return 0; + } + } + } + +#ifdef CONFIG_EFI_PARTITION + p = (struct partition *) (data + 0x1be); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + /* If this is an EFI GPT disk, msdos should ignore it. */ + if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { + put_dev_sector(sect); + return 0; + } + } +#endif + p = (struct partition *) (data + 0x1be); + + disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); + + /* + * Look for partitions in two passes: + * First find the primary and DOS-type extended partitions. + * On the second pass look inside *BSD, Unixware and Solaris partitions. + */ + + state->next = 5; + for (slot = 1 ; slot <= 4 ; slot++, p++) { + sector_t start = start_sect(p)*sector_size; + sector_t size = nr_sects(p)*sector_size; + if (!size) + continue; + if (is_extended_partition(p)) { + /* + * prevent someone doing mkfs or mkswap on an + * extended partition, but leave room for LILO + * FIXME: this uses one logical sector for > 512b + * sector, although it may not be enough/proper. 
+ */ + sector_t n = 2; + n = min(size, max(sector_size, n)); + put_partition(state, slot, start, n); + + strlcat(state->pp_buf, " <", PAGE_SIZE); + parse_extended(state, start, size, disksig); + strlcat(state->pp_buf, " >", PAGE_SIZE); + continue; + } + put_partition(state, slot, start, size); + set_info(state, slot, disksig); + if (SYS_IND(p) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + if (SYS_IND(p) == DM6_PARTITION) + strlcat(state->pp_buf, "[DM]", PAGE_SIZE); + if (SYS_IND(p) == EZD_PARTITION) + strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + /* second pass - output for each on a separate line */ + p = (struct partition *) (0x1be + data); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + unsigned char id = SYS_IND(p); + int n; + + if (!nr_sects(p)) + continue; + + for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) + ; + + if (!subtypes[n].parse) + continue; + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); + } + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h new file mode 100644 index 00000000000..38c781c490b --- /dev/null +++ b/block/partitions/msdos.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/msdos.h + */ + +#define MSDOS_LABEL_MAGIC 0xAA55 + +int msdos_partition(struct parsed_partitions *state); + diff --git a/block/partitions/osf.c b/block/partitions/osf.c new file mode 100644 index 00000000000..764b86a0196 --- /dev/null +++ b/block/partitions/osf.c @@ -0,0 +1,86 @@ +/* + * fs/partitions/osf.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" +#include "osf.h" + +#define MAX_OSF_PARTITIONS 18 + +int osf_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + unsigned int npartitions; + Sector sect; + unsigned char *data; + struct disklabel { + __le32 
d_magic; + __le16 d_type,d_subtype; + u8 d_typename[16]; + u8 d_packname[16]; + __le32 d_secsize; + __le32 d_nsectors; + __le32 d_ntracks; + __le32 d_ncylinders; + __le32 d_secpercyl; + __le32 d_secprtunit; + __le16 d_sparespertrack; + __le16 d_sparespercyl; + __le32 d_acylinders; + __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; + __le32 d_headswitch, d_trkseek, d_flags; + __le32 d_drivedata[5]; + __le32 d_spare[5]; + __le32 d_magic2; + __le16 d_checksum; + __le16 d_npartitions; + __le32 d_bbsize, d_sbsize; + struct d_partition { + __le32 p_size; + __le32 p_offset; + __le32 p_fsize; + u8 p_fstype; + u8 p_frag; + __le16 p_cpg; + } d_partitions[MAX_OSF_PARTITIONS]; + } * label; + struct d_partition * partition; + + data = read_part_sector(state, 0, &sect); + if (!data) + return -1; + + label = (struct disklabel *) (data+64); + partition = label->d_partitions; + if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { + if (slot == state->limit) + break; + if (le32_to_cpu(partition->p_size)) + put_partition(state, slot, + le32_to_cpu(partition->p_offset), + le32_to_cpu(partition->p_size)); + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/osf.h b/block/partitions/osf.h new file mode 100644 index 00000000000..20ed2315ec1 --- /dev/null +++ b/block/partitions/osf.h @@ -0,0 +1,7 @@ +/* + * fs/partitions/osf.h + */ + +#define DISKLABELMAGIC (0x82564557UL) + +int osf_partition(struct parsed_partitions *state); diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c new file mode 100644 index 00000000000..ea8a86dceaf --- /dev/null +++ b/block/partitions/sgi.c @@ -0,0 +1,82 @@ +/* + *
fs/partitions/sgi.c + * + * Code extracted from drivers/block/genhd.c + */ + +#include "check.h" +#include "sgi.h" + +struct sgi_disklabel { + __be32 magic_mushroom; /* Big fat spliff... */ + __be16 root_part_num; /* Root partition number */ + __be16 swap_part_num; /* Swap partition number */ + s8 boot_file[16]; /* Name of boot file for ARCS */ + u8 _unused0[48]; /* Device parameter useless crapola.. */ + struct sgi_volume { + s8 name[8]; /* Name of volume */ + __be32 block_num; /* Logical block number */ + __be32 num_bytes; /* How big, in bytes */ + } volume[15]; + struct sgi_partition { + __be32 num_blocks; /* Size in logical blocks */ + __be32 first_block; /* First logical block */ + __be32 type; /* Type of this partition */ + } partitions[16]; + __be32 csum; /* Disk label checksum */ + __be32 _unused1; /* Padding */ +}; + +int sgi_partition(struct parsed_partitions *state) +{ + int i, csum; + __be32 magic; + int slot = 1; + unsigned int start, blocks; + __be32 *ui, cs; + Sector sect; + struct sgi_disklabel *label; + struct sgi_partition *p; + char b[BDEVNAME_SIZE]; + + label = read_part_sector(state, 0, &sect); + if (!label) + return -1; + p = &label->partitions[0]; + magic = label->magic_mushroom; + if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { + /*printk("Dev %s SGI disklabel: bad magic %08x\n", + bdevname(bdev, b), be32_to_cpu(magic));*/ + put_dev_sector(sect); + return 0; + } + ui = ((__be32 *) (label + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) label);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if(csum) { + printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + /* All SGI disk labels have 16 partitions, disks under Linux only + * have 15 minor's. Luckily there are always a few zero length + * partitions which we don't care about so we never overflow the + * current_minor.
+ */ + for(i = 0; i < 16; i++, p++) { + blocks = be32_to_cpu(p->num_blocks); + start = be32_to_cpu(p->first_block); + if (blocks) { + put_partition(state, slot, start, blocks); + if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h new file mode 100644 index 00000000000..b9553ebdd5a --- /dev/null +++ b/block/partitions/sgi.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/sgi.h + */ + +extern int sgi_partition(struct parsed_partitions *state); + +#define SGI_LABEL_MAGIC 0x0be5a941 + diff --git a/block/partitions/sun.c b/block/partitions/sun.c new file mode 100644 index 00000000000..b5b6fcfb3d3 --- /dev/null +++ b/block/partitions/sun.c @@ -0,0 +1,122 @@ +/* + * fs/partitions/sun.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" +#include "sun.h" + +int sun_partition(struct parsed_partitions *state) +{ + int i; + __be16 csum; + int slot = 1; + __be16 *ush; + Sector sect; + struct sun_disklabel { + unsigned char info[128]; /* Informative text string */ + struct sun_vtoc { + __be32 version; /* Layout version */ + char volume[8]; /* Volume name */ + __be16 nparts; /* Number of partitions */ + struct sun_info { /* Partition hdrs, sec 2 */ + __be16 id; + __be16 flags; + } infos[8]; + __be16 padding; /* Alignment padding */ + __be32 bootinfo[3]; /* Info needed by mboot */ + __be32 sanity; /* To verify vtoc sanity */ + __be32 reserved[10]; /* Free space */ + __be32 timestamp[8]; /* Partition timestamp */ + } vtoc; + __be32 write_reinstruct; /* sectors to skip, writes */ + __be32 read_reinstruct; /* sectors to skip, reads */ + unsigned char spare[148]; /* Padding */ + __be16 rspeed; /* Disk rotational speed */ + __be16 pcylcount; /* Physical cylinder count */ + __be16 
sparecyl; /* extra sects per cylinder */ + __be16 obs1; /* gap1 */ + __be16 obs2; /* gap2 */ + __be16 ilfact; /* Interleave factor */ + __be16 ncyl; /* Data cylinder count */ + __be16 nacyl; /* Alt. cylinder count */ + __be16 ntrks; /* Tracks per cylinder */ + __be16 nsect; /* Sectors per track */ + __be16 obs3; /* bhead - Label head offset */ + __be16 obs4; /* ppart - Physical Partition */ + struct sun_partition { + __be32 start_cylinder; + __be32 num_sectors; + } partitions[8]; + __be16 magic; /* Magic number */ + __be16 csum; /* Label xor'd checksum */ + } * label; + struct sun_partition *p; + unsigned long spc; + char b[BDEVNAME_SIZE]; + int use_vtoc; + int nparts; + + label = read_part_sector(state, 0, &sect); + if (!label) + return -1; + + p = label->partitions; + if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { +/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", + bdevname(bdev, b), be16_to_cpu(label->magic)); */ + put_dev_sector(sect); + return 0; + } + /* Look at the checksum */ + ush = ((__be16 *) (label+1)) - 1; + for (csum = 0; ush >= ((__be16 *) label);) + csum ^= *ush--; + if (csum) { + printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + + /* Check to see if we can use the VTOC table */ + use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && + (be32_to_cpu(label->vtoc.version) == 1) && + (be16_to_cpu(label->vtoc.nparts) <= 8)); + + /* Use 8 partition entries if not specified in validated VTOC */ + nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8; + + /* + * So that old Linux-Sun partitions continue to work, + * alow the VTOC to be used under the additional condition ...
+ */ + use_vtoc = use_vtoc || !(label->vtoc.sanity || + label->vtoc.version || label->vtoc.nparts); + spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); + for (i = 0; i < nparts; i++, p++) { + unsigned long st_sector; + unsigned int num_sectors; + + st_sector = be32_to_cpu(p->start_cylinder) * spc; + num_sectors = be32_to_cpu(p->num_sectors); + if (num_sectors) { + put_partition(state, slot, st_sector, num_sectors); + state->parts[slot].flags = 0; + if (use_vtoc) { + if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) + state->parts[slot].flags |= ADDPART_FLAG_RAID; + else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) + state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; + } + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/sun.h b/block/partitions/sun.h new file mode 100644 index 00000000000..2424baa8319 --- /dev/null +++ b/block/partitions/sun.h @@ -0,0 +1,8 @@ +/* + * fs/partitions/sun.h + */ + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE + +int sun_partition(struct parsed_partitions *state); diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c new file mode 100644 index 00000000000..9627ccffc1c --- /dev/null +++ b/block/partitions/sysv68.c @@ -0,0 +1,95 @@ +/* + * fs/partitions/sysv68.c + * + * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be> + */ + +#include "check.h" +#include "sysv68.h" + +/* + * Volume ID structure: on first 256-bytes sector of disk + */ + +struct volumeid { + u8 vid_unused[248]; + u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ +}; + +/* + * config block: second 256-bytes sector on disk + */ + +struct dkconfig { + u8 ios_unused0[128]; + __be32 ios_slcblk; /* Slice table block number */ + __be16 ios_slccnt; /* Number of entries in slice table */ + u8 ios_unused1[122]; +}; + +/* + * combined volumeid and dkconfig block + */ + +struct dkblk0 { + struct volumeid dk_vid; + struct dkconfig 
dk_ios; +}; + +/* + * Slice Table Structure + */ + +struct slice { + __be32 nblocks; /* slice size (in blocks) */ + __be32 blkoff; /* block offset of slice */ +}; + + +int sysv68_partition(struct parsed_partitions *state) +{ + int i, slices; + int slot = 1; + Sector sect; + unsigned char *data; + struct dkblk0 *b; + struct slice *slice; + char tmp[64]; + + data = read_part_sector(state, 0, &sect); + if (!data) + return -1; + + b = (struct dkblk0 *)data; + if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { + put_dev_sector(sect); + return 0; + } + slices = be16_to_cpu(b->dk_ios.ios_slccnt); + i = be32_to_cpu(b->dk_ios.ios_slcblk); + put_dev_sector(sect); + + data = read_part_sector(state, i, &sect); + if (!data) + return -1; + + slices -= 1; /* last slice is the whole disk */ + snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + slice = (struct slice *)data; + for (i = 0; i < slices; i++, slice++) { + if (slot == state->limit) + break; + if (be32_to_cpu(slice->nblocks)) { + put_partition(state, slot, + be32_to_cpu(slice->blkoff), + be32_to_cpu(slice->nblocks)); + snprintf(tmp, sizeof(tmp), "(s%u)", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h new file mode 100644 index 00000000000..bf2f5ffa97a --- /dev/null +++ b/block/partitions/sysv68.h @@ -0,0 +1 @@ +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c new file mode 100644 index 00000000000..8dbaf9f77a9 --- /dev/null +++ b/block/partitions/ultrix.c @@ -0,0 +1,48 @@ +/* + * fs/partitions/ultrix.c + * + * Code extracted from drivers/block/genhd.c + * + * Re-organised Jul 1999 Russell King + */ + +#include "check.h" +#include "ultrix.h" + +int ultrix_partition(struct parsed_partitions *state) +{ + int i;
+ Sector sect; + unsigned char *data; + struct ultrix_disklabel { + s32 pt_magic; /* magic no. indicating part. info exits */ + s32 pt_valid; /* set by driver if pt is current */ + struct pt_info { + s32 pi_nblocks; /* no. of sectors */ + u32 pi_blkoff; /* block offset for start */ + } pt_part[8]; + } *label; + +#define PT_MAGIC 0x032957 /* Partition magic number */ +#define PT_VALID 1 /* Indicates if struct is valid */ + + data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect); + if (!data) + return -1; + + label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); + + if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { + for (i=0; i<8; i++) + if (label->pt_part[i].pi_nblocks) + put_partition(state, i+1, + label->pt_part[i].pi_blkoff, + label->pt_part[i].pi_nblocks); + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; + } else { + put_dev_sector(sect); + return 0; + } +} diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h new file mode 100644 index 00000000000..a3cc00b2bde --- /dev/null +++ b/block/partitions/ultrix.h @@ -0,0 +1,5 @@ +/* + * fs/partitions/ultrix.h + */ + +int ultrix_partition(struct parsed_partitions *state); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4f4230b79bb..14695c6221c 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -24,8 +24,10 @@ #include <linux/capability.h> #include <linux/completion.h> #include <linux/cdrom.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/times.h> +#include <linux/uio.h> #include <asm/uaccess.h> #include <scsi/scsi.h> @@ -203,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) if (capable(CAP_SYS_RAWIO)) return 0; - /* if there's no filter set, assume we're filtering everything out */ - if (!filter) - return -EPERM; - /* Anybody who can open the device can do a read-safe command */ if (test_bit(cmd[0], filter->read_ok)) return 0; @@ -231,7 +229,6 @@ static int
blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -284,7 +281,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr, fmode_t mode) { unsigned long start_time; - int writing = 0, ret = 0; + ssize_t ret = 0; + int writing = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -312,6 +310,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) return -ENOMEM; + blk_rq_set_block_pc(rq); if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { blk_put_request(rq); @@ -319,37 +318,18 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, } if (hdr->iovec_count) { - const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *sg_iov; - struct iovec *iov; - int i; - - sg_iov = kmalloc(size, GFP_KERNEL); - if (!sg_iov) { - ret = -ENOMEM; - goto out; - } + struct iovec *iov = NULL; - if (copy_from_user(sg_iov, hdr->dxferp, size)) { - kfree(sg_iov); - ret = -EFAULT; + ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, + 0, NULL, &iov); + if (ret < 0) { + kfree(iov); goto out; } - /* - * Sum up the vecs, making sure they don't overflow - */ - iov = (struct iovec *) sg_iov; - iov_data_len = 0; - for (i = 0; i < hdr->iovec_count; i++) { - if (iov_data_len + iov[i].iov_len < iov_data_len) { - kfree(sg_iov); - ret = -EINVAL; - goto out; - } - iov_data_len += iov[i].iov_len; - } + iov_data_len = ret; + ret = 0; /* SG_IO howto says that the shorter of the two wins */ if (hdr->dxfer_len < iov_data_len) { @@ -359,9 +339,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, + ret = 
blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, + hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(sg_iov); + kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); @@ -510,7 +491,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -543,7 +524,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_BLOCK_PC; + blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; rq->cmd[4] = data; @@ -565,7 +546,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod { int err; - if (!q || blk_get_queue(q)) + if (!q) return -ENXIO; switch (cmd) { @@ -686,11 +667,64 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod err = -ENOTTY; } - blk_put_queue(q); return err; } EXPORT_SYMBOL(scsi_cmd_ioctl); +int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) +{ + if (bd && bd == bd->bd_contains) + return 0; + + /* Actually none of these is particularly useful on a partition, + * but they are safe. + */ + switch (cmd) { + case SCSI_IOCTL_GET_IDLUN: + case SCSI_IOCTL_GET_BUS_NUMBER: + case SCSI_IOCTL_GET_PCI: + case SCSI_IOCTL_PROBE_HOST: + case SG_GET_VERSION_NUM: + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_RESERVED_SIZE: + case SG_SET_RESERVED_SIZE: + case SG_EMULATED_HOST: + return 0; + case CDROM_GET_CAPABILITY: + /* Keep this until we remove the printk below. udev sends it + * and we do not want to spam dmesg about it. CD-ROMs do + * not have partitions, so we get here only for disks. 
+ */ + return -ENOIOCTLCMD; + default: + break; + } + + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* In particular, rule out all resets and host-specific ioctls. */ + printk_ratelimited(KERN_WARNING + "%s: sending ioctl %x to a partition!\n", current->comm, cmd); + + return -ENOIOCTLCMD; +} +EXPORT_SYMBOL(scsi_verify_blk_ioctl); + +int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, + unsigned int cmd, void __user *arg) +{ + int ret; + + ret = scsi_verify_blk_ioctl(bd, cmd); + if (ret < 0) + return ret; + + return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); +} +EXPORT_SYMBOL(scsi_cmd_blk_ioctl); + static int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); |
