diff options
34 files changed, 1699 insertions, 333 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 630879cd9a4..48e0b21b005 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -17,6 +17,9 @@ HOWTO You can do a very simple testing of running two dd threads in two different cgroups. Here is what you can do. +- Enable Block IO controller + CONFIG_BLK_CGROUP=y + - Enable group scheduling in CFQ CONFIG_CFQ_GROUP_IOSCHED=y @@ -54,32 +57,52 @@ cgroups. Here is what you can do. Various user visible config options =================================== -CONFIG_CFQ_GROUP_IOSCHED - - Enables group scheduling in CFQ. Currently only 1 level of group - creation is allowed. - -CONFIG_DEBUG_CFQ_IOSCHED - - Enables some debugging messages in blktrace. Also creates extra - cgroup file blkio.dequeue. - -Config options selected automatically -===================================== -These config options are not user visible and are selected/deselected -automatically based on IO scheduler configuration. - CONFIG_BLK_CGROUP - - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. + - Block IO controller. CONFIG_DEBUG_BLK_CGROUP - - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. + - Debug help. Right now some additional stats file show up in cgroup + if this option is enabled. + +CONFIG_CFQ_GROUP_IOSCHED + - Enables group scheduling in CFQ. Currently only 1 level of group + creation is allowed. Details of cgroup files ======================= - blkio.weight - - Specifies per cgroup weight. - + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See blkio.weight_device). Currently allowed range of weights is from 100 to 1000. +- blkio.weight_device + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight as specified + by blkio.weight. + + Following is the format. + + #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device + Configure weight=300 on /dev/sdb (8:16) in this cgroup + # echo 8:16 300 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + + Configure weight=500 on /dev/sda (8:0) in this cgroup + # echo 8:0 500 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:0 500 + 8:16 300 + + Remove specific weight for /dev/sda in this cgroup + # echo 8:0 0 > blkio.weight_device + # cat blkio.weight_device + dev weight + 8:16 300 + - blkio.time - disk time allocated to cgroup per device in milliseconds. First two fields specify the major and minor number of the device and @@ -92,13 +115,105 @@ Details of cgroup files third field specifies the number of sectors transferred by the group to/from the device. +- blkio.io_service_bytes + - Number of bytes transferred to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of bytes. + +- blkio.io_serviced + - Number of IOs completed to/from the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. + +- blkio.io_service_time + - Total amount of time between request dispatch and request completion + for the IOs done by this cgroup. This is in nanoseconds to make it + meaningful for flash devices too. For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time spent by the IO scheduler idling for a + given cgroup in anticipation of a better request than the exising ones + from other queues/cgroups. This is in nanoseconds. If this is read + when the cgroup is in an idling state, the stat will only report the + idle_time accumulated till the last idle period and will not include + the current delta. + - blkio.dequeue - - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This gives the statistics about how many a times a group was dequeued from service tree of the device. First two fields specify the major and minor number of the device and third field specifies the number of times a group was dequeued from a particular device. +- blkio.reset_stats + - Writing an int to this file will result in resetting all the stats + for that cgroup. + CFQ sysfs tunable ================= /sys/block/<disk>/queue/iosched/group_isolation diff --git a/block/Kconfig b/block/Kconfig index f9e89f4d94b..9be0b56eaee 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_CGROUP - tristate "Block cgroup support" - depends on CGROUPS - depends on CFQ_GROUP_IOSCHED - default n - ---help--- - Generic block IO controller cgroup interface. This is the common - cgroup interface which should be used by various IO controlling - policies. - - Currently, CFQ IO scheduler uses it to recognize task groups and - control disk bandwidth allocation (proportional time slice allocation) - to such task groups. - -config DEBUG_BLK_CGROUP - bool - depends on BLK_CGROUP - default n - ---help--- - Enable some debugging help. Currently it stores the cgroup path - in the blk group which can be used by cfq for tracing various - group related activity. - endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index fc71cf071fb..3199b76f795 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,7 +23,8 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - select BLK_CGROUP if CFQ_GROUP_IOSCHED + # If BLK_CGROUP is a module, CFQ has to be built as module. + depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -33,22 +34,15 @@ config IOSCHED_CFQ This is the default I/O scheduler. + Note: If BLK_CGROUP=m, then CFQ can be built only as module. + config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && CGROUPS + depends on IOSCHED_CFQ && BLK_CGROUP default n ---help--- Enable group IO scheduling in CFQ. -config DEBUG_CFQ_IOSCHED - bool "Debug CFQ Scheduling" - depends on CFQ_GROUP_IOSCHED - select DEBUG_BLK_CGROUP - default n - ---help--- - Enable CFQ IO scheduling debugging in CFQ. Currently it makes - blktrace output more verbose. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/Makefile b/block/Makefile index cb2d515ebd6..0bb499a739c 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 6d88544b677..0d710c9d403 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err) set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags); } - - complete(bio->bi_private); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; - int ret; + int ret = 0; if (bdev->bd_disk == NULL) return -ENXIO; @@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) if (!q) return -ENXIO; - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; bio->bi_bdev = bdev; - submit_bio(WRITE_BARRIER, bio); - - wait_for_completion(&wait); + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + } - ret = 0; if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) @@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); - -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - if (bio->bi_private) - complete(bio->bi_private); - __free_page(bio_page(bio)); - - bio_put(bio); -} - -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: DISCARD_FL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. - */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q = bdev_get_queue(bdev); - int type = flags & DISCARD_FL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; - struct bio *bio; - struct page *page; - int ret = 0; - - if (!q) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; - unsigned int max_discard_sectors = - min(q->limits.max_discard_sectors, UINT_MAX >> 9); - - bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - if (flags & DISCARD_FL_WAIT) - bio->bi_private = &wait; - - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. - */ - page = alloc_page(gfp_mask | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. - */ - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } - - bio_get(bio); - submit_bio(type, bio); - - if (flags & DISCARD_FL_WAIT) - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; -} -EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5fe03def34b..d02bbf88de1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,8 +15,12 @@ #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/blkdev.h> #include <linux/slab.h> #include "blk-cgroup.h" +#include <linux/genhd.h> + +#define MAX_KEY_LEN 100 static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); @@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = { }; EXPORT_SYMBOL_GPL(blkio_subsys); +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + list_add(&pn->node, &blkcg->policy_list); +} + +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) +{ + list_del(&pn->node); +} + +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; +} + struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors) +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] += add; + else + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; + else + stat[BLKIO_STAT_ASYNC] += add; +} + +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; + } +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. */ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) +{ + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_end_empty_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); +} + +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); + +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + +void blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} +#endif + +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); + +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->sectors += bytes >> 9; + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, + sync); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, io_start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) { - blkg->time += time; - blkg->sectors += sectors; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, unsigned long flags; spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); -#endif blkg->dev = dev; } |