|  |  |  |
|---|---|---|
| author | Jiri Kosina <jkosina@suse.cz> | 2010-06-16 18:08:13 +0200 |
| committer | Jiri Kosina <jkosina@suse.cz> | 2010-06-16 18:08:13 +0200 |
| commit | f1bbbb6912662b9f6070c5bfc4ca9eb1f06a9d5b (patch) | |
| tree | c2c130a74be25b0b2dff992e1a195e2728bdaadd /block | |
| parent | fd0961ff67727482bb20ca7e8ea97b83e9de2ddb (diff) | |
| parent | 7e27d6e778cd87b6f2415515d7127eba53fe5d02 (diff) | |
Merge branch 'master' into for-next
Diffstat (limited to 'block')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | block/Kconfig | 23 |
| -rw-r--r-- | block/Kconfig.iosched | 16 |
| -rw-r--r-- | block/Makefile | 2 |
| -rw-r--r-- | block/blk-barrier.c | 147 |
| -rw-r--r-- | block/blk-cgroup.c | 791 |
| -rw-r--r-- | block/blk-cgroup.h | 178 |
| -rw-r--r-- | block/blk-core.c | 49 |
| -rw-r--r-- | block/blk-lib.c | 233 |
| -rw-r--r-- | block/cfq-iosched.c | 182 |
| -rw-r--r-- | block/elevator.c | 19 |
| -rw-r--r-- | block/genhd.c | 2 |
| -rw-r--r-- | block/ioctl.c | 2 |
12 files changed, 1349 insertions, 295 deletions
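One interface change in this merge is worth calling out before the patch body: blkdev_issue_flush() gains a gfp mask and a flags word, and it only waits for completion when the caller asks it to. The sketch below is illustrative only; the BLKDEV_IFL_WAIT flag name is assumed from the companion include/linux/blkdev.h change, which is outside the block/ diff shown here.

```c
/*
 * Minimal sketch of a caller using the reworked blkdev_issue_flush().
 * BLKDEV_IFL_WAIT is assumed from the matching include/linux/blkdev.h
 * change (not part of the block/ diffstat above).
 */
#include <linux/blkdev.h>

static int flush_and_report(struct block_device *bdev)
{
	sector_t error_sector = 0;
	int ret;

	/* Pass a wait flag so the helper blocks until the barrier completes. */
	ret = blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
				 BLKDEV_IFL_WAIT);
	if (ret == -EOPNOTSUPP)
		pr_info("flush barriers not supported by this device\n");
	else if (ret)
		pr_err("flush failed, last error near sector %llu\n",
		       (unsigned long long)error_sector);
	return ret;
}
```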
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94b..9be0b56eaee 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
-config BLK_CGROUP
-	tristate "Block cgroup support"
-	depends on CGROUPS
-	depends on CFQ_GROUP_IOSCHED
-	default n
-	---help---
-	Generic block IO controller cgroup interface. This is the common
-	cgroup interface which should be used by various IO controlling
-	policies.
-
-	Currently, CFQ IO scheduler uses it to recognize task groups and
-	control disk bandwidth allocation (proportional time slice allocation)
-	to such task groups.
-
-config DEBUG_BLK_CGROUP
-	bool
-	depends on BLK_CGROUP
-	default n
-	---help---
-	Enable some debugging help. Currently it stores the cgroup path
-	in the blk group which can be used by cfq for tracing various
-	group related activity.
-
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb..3199b76f795 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	select BLK_CGROUP if CFQ_GROUP_IOSCHED
+	# If BLK_CGROUP is a module, CFQ has to be built as module.
+	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
+	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
-	depends on IOSCHED_CFQ && CGROUPS
+	depends on IOSCHED_CFQ && BLK_CGROUP
 	default n
 	---help---
 	  Enable group IO scheduling in CFQ.
 
-config DEBUG_CFQ_IOSCHED
-	bool "Debug CFQ Scheduling"
-	depends on CFQ_GROUP_IOSCHED
-	select DEBUG_BLK_CGROUP
-	default n
-	---help---
-	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
-	  blktrace output more verbose.
-
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6..0bb499a739c 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677..0d710c9d403 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
 			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
-
-	complete(bio->bi_private);
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
 }
 
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:		blockdev to issue flush for
+ * @gfp_mask:		memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
+ * @flags:		BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to.
+ *    wish to. If WAIT flag is not passed then caller may check only what
+ *    request was pushed in some internal queue for later handling.
*/ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; - int ret; + int ret = 0; if (bdev->bd_disk == NULL) return -ENXIO; @@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) if (!q) return -ENXIO; - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; bio->bi_bdev = bdev; - submit_bio(WRITE_BARRIER, bio); - - wait_for_completion(&wait); + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + } - ret = 0; if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) @@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); - -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - if (bio->bi_private) - complete(bio->bi_private); - __free_page(bio_page(bio)); - - bio_put(bio); -} - -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: DISCARD_FL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. - */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q = bdev_get_queue(bdev); - int type = flags & DISCARD_FL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; - struct bio *bio; - struct page *page; - int ret = 0; - - if (!q) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; - unsigned int max_discard_sectors = - min(q->limits.max_discard_sectors, UINT_MAX >> 9); - - bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - if (flags & DISCARD_FL_WAIT) - bio->bi_private = &wait; - - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. - */ - page = alloc_page(gfp_mask | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. 
- */ - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } - - bio_get(bio); - submit_bio(type, bio); - - if (flags & DISCARD_FL_WAIT) - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; -} -EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2cc682b860e..a6809645d21 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,8 +15,12 @@ #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/blkdev.h> #include <linux/slab.h> #include "blk-cgroup.h" +#include <linux/genhd.h> + +#define MAX_KEY_LEN 100 static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); @@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = { }; EXPORT_SYMBOL_GPL(blkio_subsys); +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + list_add(&pn->node, &blkcg->policy_list); +} + +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) +{ + list_del(&pn->node); +} + +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; +} + struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors) +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] += add; + else + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; + else + stat[BLKIO_STAT_ASYNC] += add; +} + +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; + } +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. 
*/ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) +{ + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_end_empty_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); +} + +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); + +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + +void blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. 
+ */ + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} +#endif + +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); + +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) { - blkg->time += time; - blkg->sectors += sectors; + struct blkio_group_stats *stats; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->sectors += bytes >> 9; + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, + sync); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, io_start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void 
blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, unsigned long flags; spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); -#endif blkg->dev = dev; } EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); @@ -101,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg) rcu_read_lock(); css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (!css) - goto out; - - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; + if (css) { + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); } - spin_unlock_irqrestore(&blkcg->lock, flags); -out: + rcu_read_unlock(); return ret; } @@ -154,6 +428,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_group *blkg; struct hlist_node *n; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; @@ -162,7 +437,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev); + + if (pn) + continue; + list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); @@ -172,13 +453,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return 0; } -#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int +blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct blkio_group_stats *stats; + struct hlist_node *n; + uint64_t queued[BLKIO_STAT_TOTAL]; + int i; +#ifdef CONFIG_DEBUG_BLK_CGROUP + bool idling, waiting, empty; + unsigned long long now = sched_clock(); +#endif + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkg->stats_lock); + stats = &blkg->stats; +#ifdef CONFIG_DEBUG_BLK_CGROUP + idling = blkio_blkg_idling(stats); + waiting = blkio_blkg_waiting(stats); + empty = blkio_blkg_empty(stats); +#endif + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; + memset(stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (idling) { + blkio_mark_blkg_idling(stats); + stats->start_idle_time = now; + } + if (waiting) { + blkio_mark_blkg_waiting(stats); + stats->start_group_wait_time = now; + } + if (empty) { + blkio_mark_blkg_empty(stats); + stats->start_empty_time = now; + } +#endif + spin_unlock(&blkg->stats_lock); + } + spin_unlock_irq(&blkcg->lock); 
+ return 0; +} + +static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, + int chars_left, bool diskname_only) +{ + snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); + chars_left -= strlen(str); + if (chars_left <= 0) { + printk(KERN_WARNING + "Possibly incorrect cgroup stat display format"); + return; + } + if (diskname_only) + return; + switch (type) { + case BLKIO_STAT_READ: + strlcat(str, " Read", chars_left); + break; + case BLKIO_STAT_WRITE: + strlcat(str, " Write", chars_left); + break; + case BLKIO_STAT_SYNC: + strlcat(str, " Sync", chars_left); + break; + case BLKIO_STAT_ASYNC: + strlcat(str, " Async", chars_left); + break; + case BLKIO_STAT_TOTAL: + strlcat(str, " Total", chars_left); + break; + default: + strlcat(str, " Invalid", chars_left); + } +} + +static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, + struct cgroup_map_cb *cb, dev_t dev) +{ + blkio_get_key_name(0, dev, str, chars_left, true); + cb->fill(cb, str, val); + return val; +} + +/* This should be called with blkg->stats_lock held */ +static uint64_t blkio_get_stat(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) +{ + uint64_t disk_total; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.time, cb, dev); + if (type == BLKIO_STAT_SECTORS) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.sectors, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { + uint64_t sum = blkg->stats.avg_queue_size_sum; + uint64_t samples = blkg->stats.avg_queue_size_samples; + if (samples) + do_div(sum, samples); + else + sum = 0; + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); + } + if (type == BLKIO_STAT_GROUP_WAIT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.group_wait_time, cb, dev); + if (type == BLKIO_STAT_IDLE_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.idle_time, cb, dev); + if (type == BLKIO_STAT_EMPTY_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.empty_time, cb, dev); + if (type == BLKIO_STAT_DEQUEUE) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.dequeue, cb, dev); +#endif + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + } + disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + + blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + +#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct seq_file *m) \ + struct cftype *cftype, struct cgroup_map_cb *cb) \ { \ struct blkio_cgroup *blkcg; \ struct blkio_group *blkg; \ struct hlist_node *n; \ + uint64_t cgroup_total = 0; \ \ if (!cgroup_lock_live_group(cgroup)) \ return -ENODEV; \ @@ -186,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ blkcg = cgroup_to_blkio_cgroup(cgroup); \ rcu_read_lock(); \ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) \ - seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ - MINOR(blkg->dev), blkg->__VAR); \ + if (blkg->dev) { \ + 
spin_lock_irq(&blkg->stats_lock); \ + cgroup_total += blkio_get_stat(blkg, cb, \ + blkg->dev, type); \ + spin_unlock_irq(&blkg->stats_lock); \ + } \ } \ + if (show_total) \ + cb->fill(cb, "Total", cgroup_total); \ rcu_read_unlock(); \ cgroup_unlock(); \ return 0; \ } -SHOW_FUNCTION_PER_GROUP(time); -SHOW_FUNCTION_PER_GROUP(sectors); +SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); +SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); +SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); +SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); +SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue); +SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); +SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); +SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); +SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); #endif #undef SHOW_FUNCTION_PER_GROUP -#ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) +static int blkio_check_dev_num(dev_t dev) { - blkg->dequeue += dequeue; + int part = 0; + struct gendisk *disk; + + disk = get_gendisk(dev, &part); + if (!disk || part) + return -ENODEV; + + return 0; +} + +static int blkio_policy_parse_and_set(char *buf, + struct blkio_policy_node *newpn) +{ + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + int ret; + unsigned long major, minor, temp; + int i = 0; + dev_t dev; + + memset(s, 0, sizeof(s)); + + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 3) + break; + } + + if (i != 2) + return -EINVAL; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + return -EINVAL; + + minor_s = s[0]; + if (!minor_s) + return -EINVAL; + + ret = strict_strtoul(major_s, 10, &major); + if (ret) + return -EINVAL; + + ret = strict_strtoul(minor_s, 10, &minor); + if (ret) + return -EINVAL; + + dev = MKDEV(major, minor); + + ret = blkio_check_dev_num(dev); + if (ret) + return ret; + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; + + newpn->weight = temp; + + return 0; +} + +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev); + if (pn) + return pn->weight; + else + return blkcg->weight; +} +EXPORT_SYMBOL_GPL(blkcg_get_weight); + + +static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + int ret = 0; + char *buf; + struct blkio_policy_node *newpn, *pn; + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + int keep_newpn = 0; + struct hlist_node *n; + struct blkio_policy_type *blkiop; + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = blkio_policy_parse_and_set(buf, newpn); + if (ret) + goto free_newpn; + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + 
spin_lock_irq(&blkcg->lock); + + pn = blkio_policy_search_node(blkcg, newpn->dev); + if (!pn) { + if (newpn->weight != 0) { + blkio_policy_insert_node(blkcg, newpn); + keep_newpn = 1; + } + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + + if (newpn->weight == 0) { + /* weight == 0 means deleteing a specific weight */ + blkio_policy_delete_node(pn); + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + spin_unlock_irq(&blkcg->lock); + + pn->weight = newpn->weight; + +update_io_group: + /* update weight for each cfqg */ + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (newpn->dev == blkg->dev) { + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + newpn->weight ? + newpn->weight : + blkcg->weight); + } + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + struct blkio_policy_node *pn; + + seq_printf(m, "dev\tweight\n"); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + if (!list_empty(&blkcg->policy_list)) { + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + } + spin_unlock_irq(&blkcg->lock); + } |
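The per-device weight handlers added above consume input in the "major:minor weight" form parsed by blkio_policy_parse_and_set(), with a weight of 0 removing the per-device rule so the group falls back to the cgroup-wide weight. The userspace sketch below is an assumption-laden illustration: the cgroup directory path and the blkio.weight_device file name are inferred from these handlers and do not appear in the truncated diff.

```c
/*
 * Userspace sketch for the per-device weight interface added in
 * blk-cgroup.c above. The cgroup directory and the file name
 * "blkio.weight_device" are assumptions; adjust for your system.
 */
#include <stdio.h>

static int set_blkio_device_weight(const char *cgroup_dir,
				   unsigned int major, unsigned int minor,
				   unsigned int weight)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/blkio.weight_device", cgroup_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;

	/*
	 * Format expected by blkio_policy_parse_and_set(): "major:minor weight".
	 * Writing a weight of 0 deletes the rule for that device.
	 */
	fprintf(f, "%u:%u %u\n", major, minor, weight);
	return fclose(f);
}
```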