aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/blkio-controller.txt151
-rw-r--r--block/Kconfig23
-rw-r--r--block/Kconfig.iosched16
-rw-r--r--block/Makefile2
-rw-r--r--block/blk-barrier.c147
-rw-r--r--block/blk-cgroup.c727
-rw-r--r--block/blk-cgroup.h178
-rw-r--r--block/blk-core.c13
-rw-r--r--block/blk-lib.c233
-rw-r--r--block/cfq-iosched.c80
-rw-r--r--block/elevator.c9
-rw-r--r--block/genhd.c1
-rw-r--r--block/ioctl.c2
-rw-r--r--drivers/block/drbd/drbd_int.h3
-rw-r--r--drivers/block/drbd/drbd_receiver.c3
-rw-r--r--fs/block_dev.c257
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/ext3/fsync.c3
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/gfs2/rgrp.c5
-rw-r--r--fs/jbd2/checkpoint.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
-rw-r--r--include/linux/backing-dev.h3
-rw-r--r--include/linux/blkdev.h62
-rw-r--r--include/linux/elevator.h6
-rw-r--r--include/linux/fs.h1
-rw-r--r--include/linux/writeback.h4
-rw-r--r--init/Kconfig27
-rw-r--r--kernel/sched_clock.c1
-rw-r--r--mm/page-writeback.c39
-rw-r--r--mm/swapfile.c9
34 files changed, 1699 insertions, 333 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a4..48e0b21b005 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
You can do a very simple testing of running two dd threads in two different
cgroups. Here is what you can do.
+- Enable Block IO controller
+ CONFIG_BLK_CGROUP=y
+
- Enable group scheduling in CFQ
CONFIG_CFQ_GROUP_IOSCHED=y
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
Various user visible config options
===================================
-CONFIG_CFQ_GROUP_IOSCHED
- - Enables group scheduling in CFQ. Currently only 1 level of group
- creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
- - Enables some debugging messages in blktrace. Also creates extra
- cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
CONFIG_BLK_CGROUP
- - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+ - Block IO controller.
CONFIG_DEBUG_BLK_CGROUP
- - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+ - Debug help. Right now some additional stats file show up in cgroup
+ if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+ - Enables group scheduling in CFQ. Currently only 1 level of group
+ creation is allowed.
Details of cgroup files
=======================
- blkio.weight
- - Specifies per cgroup weight.
-
+ - Specifies per cgroup weight. This is default weight of the group
+ on all the devices until and unless overridden by per device rule.
+ (See blkio.weight_device).
Currently allowed range of weights is from 100 to 1000.
+- blkio.weight_device
+ - One can specify per cgroup per device rules using this interface.
+ These rules override the default value of group weight as specified
+ by blkio.weight.
+
+ Following is the format.
+
+ #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+ Configure weight=300 on /dev/sdb (8:16) in this cgroup
+ # echo 8:16 300 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
+ Configure weight=500 on /dev/sda (8:0) in this cgroup
+ # echo 8:0 500 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:0 500
+ 8:16 300
+
+ Remove specific weight for /dev/sda in this cgroup
+ # echo 8:0 0 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
- blkio.time
- disk time allocated to cgroup per device in milliseconds. First
two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
third field specifies the number of sectors transferred by the
group to/from the device.
+- blkio.io_service_bytes
+ - Number of bytes transferred to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of bytes.
+
+- blkio.io_serviced
+ - Number of IOs completed to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of IOs.
+
+- blkio.io_service_time
+ - Total amount of time between request dispatch and request completion
+ for the IOs done by this cgroup. This is in nanoseconds to make it
+ meaningful for flash devices too. For devices with queue depth of 1,
+ this time represents the actual service time. When queue_depth > 1,
+ that is no longer true as requests may be served out of order. This
+ may cause the service time for a given IO to include the service time
+ of multiple IOs when served out of order which may result in total
+ io_service_time > actual time elapsed. This time is further divided by
+ the type of operation - read or write, sync or async. First two fields
+ specify the major and minor number of the device, third field
+ specifies the operation type and the fourth field specifies the
+ io_service_time in ns.
+
+- blkio.io_wait_time
+ - Total amount of time the IOs for this cgroup spent waiting in the
+ scheduler queues for service. This can be greater than the total time
+ elapsed since it is cumulative io_wait_time for all IOs. It is not a
+ measure of total time the cgroup spent waiting but rather a measure of
+ the wait_time for its individual IOs. For devices with queue_depth > 1
+ this metric does not include the time spent waiting for service once
+ the IO is dispatched to the device but till it actually gets serviced
+ (there might be a time lag here due to re-ordering of requests by the
+ device). This is in nanoseconds to make it meaningful for flash
+ devices too. This time is further divided by the type of operation -
+ read or write, sync or async. First two fields specify the major and
+ minor number of the device, third field specifies the operation type
+ and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+ - Total number of bios/requests merged into requests belonging to this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.io_queued
+ - Total number of requests queued up at any given instant for this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.avg_queue_size
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ The average queue size for this cgroup over the entire time of this
+ cgroup's existence. Queue size samples are taken each time one of the
+ queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time the cgroup had to wait since it became busy
+ (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+ its queues. This is different from the io_wait_time which is the
+ cumulative total of the amount of time spent by each IO in that cgroup
+ waiting in the scheduler queue. This is in nanoseconds. If this is
+ read when the cgroup is in a waiting (for timeslice) state, the stat
+ will only report the group_wait_time accumulated till the last time it
+ got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time a cgroup spends without any pending
+ requests when not being served, i.e., it does not include any time
+ spent idling for one of the queues of the cgroup. This is in
+ nanoseconds. If this is read when the cgroup is in an empty state,
+ the stat will only report the empty_time accumulated till the last
+ time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time spent by the IO scheduler idling for a
+ given cgroup in anticipation of a better request than the exising ones
+ from other queues/cgroups. This is in nanoseconds. If this is read
+ when the cgroup is in an idling state, the stat will only report the
+ idle_time accumulated till the last idle period and will not include
+ the current delta.
+
- blkio.dequeue
- - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
gives the statistics about how many a times a group was dequeued
from service tree of the device. First two fields specify the major
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.
+- blkio.reset_stats
+ - Writing an int to this file will result in resetting all the stats
+ for that cgroup.
+
CFQ sysfs tunable
=================
/sys/block/<disk>/queue/iosched/group_isolation
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94b..9be0b56eaee 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
-config BLK_CGROUP
- tristate "Block cgroup support"
- depends on CGROUPS
- depends on CFQ_GROUP_IOSCHED
- default n
- ---help---
- Generic block IO controller cgroup interface. This is the common
- cgroup interface which should be used by various IO controlling
- policies.
-
- Currently, CFQ IO scheduler uses it to recognize task groups and
- control disk bandwidth allocation (proportional time slice allocation)
- to such task groups.
-
-config DEBUG_BLK_CGROUP
- bool
- depends on BLK_CGROUP
- default n
- ---help---
- Enable some debugging help. Currently it stores the cgroup path
- in the blk group which can be used by cfq for tracing various
- group related activity.
-
endif # BLOCK
config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb..3199b76f795 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
- select BLK_CGROUP if CFQ_GROUP_IOSCHED
+ # If BLK_CGROUP is a module, CFQ has to be built as module.
+ depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
This is the default I/O scheduler.
+ Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
- depends on IOSCHED_CFQ && CGROUPS
+ depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
-config DEBUG_CFQ_IOSCHED
- bool "Debug CFQ Scheduling"
- depends on CFQ_GROUP_IOSCHED
- select DEBUG_BLK_CGROUP
- default n
- ---help---
- Enable CFQ IO scheduling debugging in CFQ. Currently it makes
- blktrace output more verbose.
-
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6..0bb499a739c 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677..0d710c9d403 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
}
-
- complete(bio->bi_private);
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
+ * @gfp_mask: memory allocation flags (for bio_alloc)
* @error_sector: error sector
+ * @flags: BLKDEV_IFL_* flags to control behaviour
*
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to.
+ * wish to. If WAIT flag is not passed then caller may check only what
+ * request was pushed in some internal queue for later handling.
*/
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+ sector_t *error_sector, unsigned long flags)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
- int ret;
+ int ret = 0;
if (bdev->bd_disk == NULL)
return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
if (!q)
return -ENXIO;
- bio = bio_alloc(GFP_KERNEL, 0);
+ bio = bio_alloc(gfp_mask, 0);
bio->bi_end_io = bio_end_empty_barrier;
- bio->bi_private = &wait;
bio->bi_bdev = bdev;
- submit_bio(WRITE_BARRIER, bio);
-
- wait_for_completion(&wait);
+ if (test_bit(BLKDEV_WAIT, &flags))
+ bio->bi_private = &wait;
- /*
- * The driver must store the error location in ->bi_sector, if
- * it supports it. For non-stacked drivers, this should be copied
- * from blk_rq_pos(rq).
- */
- if (error_sector)
- *error_sector = bio->bi_sector;
+ bio_get(bio);
+ submit_bio(WRITE_BARRIER, bio);
+ if (test_bit(BLKDEV_WAIT, &flags)) {
+ wait_for_completion(&wait);
+ /*
+ * The driver must store the error location in ->bi_sector, if
+ * it supports it. For non-stacked drivers, this should be
+ * copied from blk_rq_pos(rq).
+ */
+ if (error_sector)
+ *error_sector = bio->bi_sector;
+ }
- ret = 0;
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
-
- if (bio->bi_private)
- complete(bio->bi_private);
- __free_page(bio_page(bio));
-
- bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev: blockdev to issue discard for
- * @sector: start sector
- * @nr_sects: number of sectors to discard
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: DISCARD_FL_* flags to control behaviour
- *
- * Description:
- * Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- struct request_queue *q = bdev_get_queue(bdev);
- int type = flags & DISCARD_FL_BARRIER ?
- DISCARD_BARRIER : DISCARD_NOBARRIER;
- struct bio *bio;
- struct page *page;
- int ret = 0;
-
- if (!q)
- return -ENXIO;
-
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
-
- while (nr_sects && !ret) {
- unsigned int sector_size = q->limits.logical_block_size;
- unsigned int max_discard_sectors =
- min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
- bio = bio_alloc(gfp_mask, 1);
- if (!bio)
- goto out;
- bio->bi_sector = sector;
- bio->bi_end_io = blkdev_discard_end_io;
- bio->bi_bdev = bdev;
- if (flags & DISCARD_FL_WAIT)
- bio->bi_private = &wait;
-
- /*
- * Add a zeroed one-sector payload as that's what
- * our current implementations need. If we'll ever need
- * more the interface will need revisiting.
- */
- page = alloc_page(gfp_mask | __GFP_ZERO);
- if (!page)
- goto out_free_bio;
- if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
- goto out_free_page;
-
- /*
- * And override the bio size - the way discard works we
- * touch many more blocks on disk than the actual payload
- * length.
- */
- if (nr_sects > max_discard_sectors) {
- bio->bi_size = max_discard_sectors << 9;
- nr_sects -= max_discard_sectors;
- sector += max_discard_sectors;
- } else {
- bio->bi_size = nr_sects << 9;
- nr_sects = 0;
- }
-
- bio_get(bio);
- submit_bio(type, bio);
-
- if (flags & DISCARD_FL_WAIT)
- wait_for_completion(&wait);
-
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
- else if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
- bio_put(bio);
- }
- return ret;
-out_free_page:
- __free_page(page);
-out_free_bio:
- bio_put(bio);
-out:
- return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5fe03def34b..d02bbf88de1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,8 +15,12 @@
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
+#include <linux/genhd.h>
+
+#define MAX_KEY_LEN 100
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
};
EXPORT_SYMBOL_GPL(blkio_subsys);
+static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
+ struct blkio_policy_node *pn)
+{
+ list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */
+static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
+{
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
+}
+
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors)
+/*
+ * Add to the appropriate stat variable depending on the request type.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
+ bool sync)
+{
+ if (direction)
+ stat[BLKIO_STAT_WRITE] += add;
+ else
+ stat[BLKIO_STAT_READ] += add;
+ if (sync)
+ stat[BLKIO_STAT_SYNC] += add;
+ else
+ stat[BLKIO_STAT_ASYNC] += add;
+}
+
+/*
+ * Decrements the appropriate stat variable if non-zero depending on the
+ * request type. Panics on value being zero.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
+{
+ if (direction) {
+ BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
+ stat[BLKIO_STAT_WRITE]--;
+ } else {
+ BUG_ON(stat[BLKIO_STAT_READ] == 0);
+ stat[BLKIO_STAT_READ]--;
+ }
+ if (sync) {
+ BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
+ stat[BLKIO_STAT_SYNC]--;
+ } else {
+ BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
+ stat[BLKIO_STAT_ASYNC]--;
+ }
+}
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg)
+{
+ if (blkio_blkg_waiting(&blkg->stats))
+ return;
+ if (blkg == curr_blkg)
+ return;
+ blkg->stats.start_group_wait_time = sched_clock();
+ blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+ unsigned long long now;
+
+ if (!blkio_blkg_waiting(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_group_wait_time))
+ stats->group_wait_time += now - stats->start_group_wait_time;
+ blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+ unsigned long long now;
+
+ if (!blkio_blkg_empty(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_empty_time))
+ stats->empty_time += now - stats->start_empty_time;
+ blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ BUG_ON(blkio_blkg_idling(&blkg->stats));
+ blkg->stats.start_idle_time = sched_clock();
+ blkio_mark_blkg_idling(&blkg->stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ unsigned long long now;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ if (blkio_blkg_idling(stats)) {
+ now = sched_clock();
+ if (time_after64(now, stats->start_idle_time))
+ stats->idle_time += now - stats->start_idle_time;
+ blkio_clear_blkg_idling(stats);
+ }
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ stats->avg_queue_size_sum +=
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
+ stats->avg_queue_size_samples++;
+ blkio_update_group_wait_time(stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
+
+void blkiocg_set_start_empty_time(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+
+ if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ return;
+ }
+
+ /*
+ * group is already marked empty. This can happen if cfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if(blkio_blkg_empty(stats)) {
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ return;
+ }
+
+ stats->start_empty_time = sched_clock();
+ blkio_mark_blkg_empty(stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue)
+{
+ blkg->stats.dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
+#endif
+
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction,
+ bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
+ sync);
+ blkio_end_empty_time(&blkg->stats);
+ blkio_set_start_group_wait_time(blkg, curr_blkg);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
+
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
+ direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
+
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkg->stats.time += time;
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync)
+{
+ struct blkio_group_stats *stats;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ stats->sectors += bytes >> 9;
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
+ sync);
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
+ direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
+
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+{
+ struct blkio_group_stats *stats;
+ unsigned long flags;
+ unsigned long long now = sched_clock();
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ if (time_after64(now, io_start_time))
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
+ now - io_start_time, direction, sync);
+ if (time_after64(io_start_time, start_time))
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
+ io_start_time - start_time, direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
+
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+ bool sync)
{
- blkg->time += time;
- blkg->sectors += sectors;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
+ sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev)
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
unsigned long flags;
spin_lock_irqsave(&blkcg->lock, flags);
+ spin_lock_init(&blkg->stats_lock);
rcu_assign_pointer(blkg->key, key);
blkg->blkcg_id = css_id(&blkcg->css);
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
spin_unlock_irqrestore(&blkcg->lock, flags);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Need to take css reference ? */
cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-#endif
blkg->dev = dev;
}