aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-05-21 15:25:33 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-21 15:25:33 -0700
commit6e80e8ed5eb92d0112674aabe82951266a6a1051 (patch)
tree4913d191cd088f355b92109af5ffa7d75e15ae4a
parent6969a434737dd82f7343e3fcd529bc320508d9fc (diff)
parentee9a3607fb03e804ddf624544105f4e34260c380 (diff)
Merge branch 'for-2.6.35' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.35' of git://git.kernel.dk/linux-2.6-block: (86 commits) pipe: set lower and upper limit on max pages in the pipe page array pipe: add support for shrinking and growing pipes drbd: This is now equivalent to drbd release 8.3.8rc1 drbd: Do not free p_uuid early, this is done in the exit code of the receiver drbd: Null pointer deref fix to the large "multi bio rewrite" drbd: Fix: Do not detach, if a bio with a barrier fails drbd: Ensure to not trigger late-new-UUID creation multiple times drbd: Do not Oops when C_STANDALONE when uuid gets generated writeback: fix mixed up arguments to bdi_start_writeback() writeback: fix problem with !CONFIG_BLOCK compilation block: improve automatic native capacity unlocking block: use struct parsed_partitions *state universally in partition check code block,ide: simplify bdops->set_capacity() to ->unlock_native_capacity() block: restart partition scan after resizing a device buffer: make invalidate_bdev() drain all percpu LRU add caches block: remove all rcu head initializations writeback: fixups for !dirty_writeback_centisecs writeback: bdi_writeback_task() must set task state before calling schedule() writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync drivers/block/drbd: Use kzalloc ...
-rw-r--r--Documentation/cgroups/blkio-controller.txt151
-rw-r--r--block/Kconfig23
-rw-r--r--block/Kconfig.iosched16
-rw-r--r--block/Makefile2
-rw-r--r--block/blk-barrier.c147
-rw-r--r--block/blk-cgroup.c791
-rw-r--r--block/blk-cgroup.h178
-rw-r--r--block/blk-core.c31
-rw-r--r--block/blk-lib.c233
-rw-r--r--block/cfq-iosched.c81
-rw-r--r--block/elevator.c11
-rw-r--r--block/genhd.c2
-rw-r--r--block/ioctl.c2
-rw-r--r--drivers/block/Kconfig22
-rw-r--r--drivers/block/drbd/drbd_bitmap.c21
-rw-r--r--drivers/block/drbd/drbd_int.h151
-rw-r--r--drivers/block/drbd/drbd_main.c158
-rw-r--r--drivers/block/drbd/drbd_nl.c52
-rw-r--r--drivers/block/drbd/drbd_proc.c19
-rw-r--r--drivers/block/drbd/drbd_receiver.c666
-rw-r--r--drivers/block/drbd/drbd_req.c40
-rw-r--r--drivers/block/drbd/drbd_strings.c2
-rw-r--r--drivers/block/drbd/drbd_worker.c206
-rw-r--r--drivers/block/drbd/drbd_wrappers.h16
-rw-r--r--drivers/ide/ide-disk.c40
-rw-r--r--drivers/ide/ide-gd.c11
-rw-r--r--fs/block_dev.c257
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/ext3/fsync.c3
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/fs-writeback.c98
-rw-r--r--fs/gfs2/rgrp.c5
-rw-r--r--fs/jbd2/checkpoint.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/partitions/acorn.c68
-rw-r--r--fs/partitions/acorn.h10
-rw-r--r--fs/partitions/amiga.c13
-rw-r--r--fs/partitions/amiga.h2
-rw-r--r--fs/partitions/atari.c8
-rw-r--r--fs/partitions/atari.h2
-rw-r--r--fs/partitions/check.c84
-rw-r--r--fs/partitions/check.h12
-rw-r--r--fs/partitions/efi.c91
-rw-r--r--fs/partitions/efi.h2
-rw-r--r--fs/partitions/ibm.c21
-rw-r--r--fs/partitions/ibm.h2
-rw-r--r--fs/partitions/karma.c4
-rw-r--r--fs/partitions/karma.h2
-rw-r--r--fs/partitions/ldm.c89
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c11
-rw-r--r--fs/partitions/mac.h2
-rw-r--r--fs/partitions/msdos.c85
-rw-r--r--fs/partitions/msdos.h2
-rw-r--r--fs/partitions/osf.c4
-rw-r--r--fs/partitions/osf.h2
-rw-r--r--fs/partitions/sgi.c6
-rw-r--r--fs/partitions/sgi.h2
-rw-r--r--fs/partitions/sun.c6
-rw-r--r--fs/partitions/sun.h2
-rw-r--r--fs/partitions/sysv68.c6
-rw-r--r--fs/partitions/sysv68.h2
-rw-r--r--fs/partitions/ultrix.c4
-rw-r--r--fs/partitions/ultrix.h2
-rw-r--r--fs/pipe.c122
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/splice.c151
-rw-r--r--fs/sync.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
-rw-r--r--include/linux/backing-dev.h6
-rw-r--r--include/linux/blkdev.h70
-rw-r--r--include/linux/drbd.h5
-rw-r--r--include/linux/drbd_limits.h16
-rw-r--r--include/linux/drbd_nl.h5
-rw-r--r--include/linux/elevator.h6
-rw-r--r--include/linux/fcntl.h6
-rw-r--r--include/linux/fs.h1
-rw-r--r--include/linux/ide.h2
-rw-r--r--include/linux/pipe_fs_i.h13
-rw-r--r--include/linux/splice.h7
-rw-r--r--include/linux/writeback.h18
-rw-r--r--init/Kconfig27
-rw-r--r--kernel/relay.c15
-rw-r--r--kernel/sched_clock.c1
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/trace/trace.c60
-rw-r--r--mm/backing-dev.c15
-rw-r--r--mm/page-writeback.c44
-rw-r--r--mm/swapfile.c9
-rw-r--r--net/core/skbuff.c38
93 files changed, 3423 insertions, 1241 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a4..48e0b21b005 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
You can do a very simple testing of running two dd threads in two different
cgroups. Here is what you can do.
+- Enable Block IO controller
+ CONFIG_BLK_CGROUP=y
+
- Enable group scheduling in CFQ
CONFIG_CFQ_GROUP_IOSCHED=y
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
Various user visible config options
===================================
-CONFIG_CFQ_GROUP_IOSCHED
- - Enables group scheduling in CFQ. Currently only 1 level of group
- creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
- - Enables some debugging messages in blktrace. Also creates extra
- cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
CONFIG_BLK_CGROUP
- - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+ - Block IO controller.
CONFIG_DEBUG_BLK_CGROUP
- - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+ - Debug help. Right now some additional stats file show up in cgroup
+ if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+ - Enables group scheduling in CFQ. Currently only 1 level of group
+ creation is allowed.
Details of cgroup files
=======================
- blkio.weight
- - Specifies per cgroup weight.
-
+ - Specifies per cgroup weight. This is default weight of the group
+ on all the devices until and unless overridden by per device rule.
+ (See blkio.weight_device).
Currently allowed range of weights is from 100 to 1000.
+- blkio.weight_device
+ - One can specify per cgroup per device rules using this interface.
+ These rules override the default value of group weight as specified
+ by blkio.weight.
+
+ Following is the format.
+
+ #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+ Configure weight=300 on /dev/sdb (8:16) in this cgroup
+ # echo 8:16 300 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
+ Configure weight=500 on /dev/sda (8:0) in this cgroup
+ # echo 8:0 500 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:0 500
+ 8:16 300
+
+ Remove specific weight for /dev/sda in this cgroup
+ # echo 8:0 0 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
- blkio.time
- disk time allocated to cgroup per device in milliseconds. First
two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
third field specifies the number of sectors transferred by the
group to/from the device.
+- blkio.io_service_bytes
+ - Number of bytes transferred to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of bytes.
+
+- blkio.io_serviced
+ - Number of IOs completed to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of IOs.
+
+- blkio.io_service_time
+ - Total amount of time between request dispatch and request completion
+ for the IOs done by this cgroup. This is in nanoseconds to make it
+ meaningful for flash devices too. For devices with queue depth of 1,
+ this time represents the actual service time. When queue_depth > 1,
+ that is no longer true as requests may be served out of order. This
+ may cause the service time for a given IO to include the service time
+ of multiple IOs when served out of order which may result in total
+ io_service_time > actual time elapsed. This time is further divided by
+ the type of operation - read or write, sync or async. First two fields
+ specify the major and minor number of the device, third field
+ specifies the operation type and the fourth field specifies the
+ io_service_time in ns.
+
+- blkio.io_wait_time
+ - Total amount of time the IOs for this cgroup spent waiting in the
+ scheduler queues for service. This can be greater than the total time
+ elapsed since it is cumulative io_wait_time for all IOs. It is not a
+ measure of total time the cgroup spent waiting but rather a measure of
+ the wait_time for its individual IOs. For devices with queue_depth > 1
+ this metric does not include the time spent waiting for service once
+ the IO is dispatched to the device but till it actually gets serviced
+ (there might be a time lag here due to re-ordering of requests by the
+ device). This is in nanoseconds to make it meaningful for flash
+ devices too. This time is further divided by the type of operation -
+ read or write, sync or async. First two fields specify the major and
+ minor number of the device, third field specifies the operation type
+ and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+ - Total number of bios/requests merged into requests belonging to this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.io_queued
+ - Total number of requests queued up at any given instant for this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.avg_queue_size
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ The average queue size for this cgroup over the entire time of this
+ cgroup's existence. Queue size samples are taken each time one of the
+ queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time the cgroup had to wait since it became busy
+ (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+ its queues. This is different from the io_wait_time which is the
+ cumulative total of the amount of time spent by each IO in that cgroup
+ waiting in the scheduler queue. This is in nanoseconds. If this is
+ read when the cgroup is in a waiting (for timeslice) state, the stat
+ will only report the group_wait_time accumulated till the last time it
+ got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time a cgroup spends without any pending
+ requests when not being served, i.e., it does not include any time
+ spent idling for one of the queues of the cgroup. This is in
+ nanoseconds. If this is read when the cgroup is in an empty state,
+ the stat will only report the empty_time accumulated till the last
+ time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time spent by the IO scheduler idling for a
+ given cgroup in anticipation of a better request than the exising ones
+ from other queues/cgroups. This is in nanoseconds. If this is read
+ when the cgroup is in an idling state, the stat will only report the
+ idle_time accumulated till the last idle period and will not include
+ the current delta.
+
- blkio.dequeue
- - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
gives the statistics about how many a times a group was dequeued
from service tree of the device. First two fields specify the major
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.
+- blkio.reset_stats
+ - Writing an int to this file will result in resetting all the stats
+ for that cgroup.
+
CFQ sysfs tunable
=================
/sys/block/<disk>/queue/iosched/group_isolation
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94b..9be0b56eaee 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
-config BLK_CGROUP
- tristate "Block cgroup support"
- depends on CGROUPS
- depends on CFQ_GROUP_IOSCHED
- default n
- ---help---
- Generic block IO controller cgroup interface. This is the common
- cgroup interface which should be used by various IO controlling
- policies.
-
- Currently, CFQ IO scheduler uses it to recognize task groups and
- control disk bandwidth allocation (proportional time slice allocation)
- to such task groups.
-
-config DEBUG_BLK_CGROUP
- bool
- depends on BLK_CGROUP
- default n
- ---help---
- Enable some debugging help. Currently it stores the cgroup path
- in the blk group which can be used by cfq for tracing various
- group related activity.
-
endif # BLOCK
config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb..3199b76f795 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
- select BLK_CGROUP if CFQ_GROUP_IOSCHED
+ # If BLK_CGROUP is a module, CFQ has to be built as module.
+ depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
This is the default I/O scheduler.
+ Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
- depends on IOSCHED_CFQ && CGROUPS
+ depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
-config DEBUG_CFQ_IOSCHED
- bool "Debug CFQ Scheduling"
- depends on CFQ_GROUP_IOSCHED
- select DEBUG_BLK_CGROUP
- default n
- ---help---
- Enable CFQ IO scheduling debugging in CFQ. Currently it makes
- blktrace output more verbose.
-
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6..0bb499a739c 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677..0d710c9d403 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
}
-
- complete(bio->bi_private);
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
+ * @gfp_mask: memory allocation flags (for bio_alloc)
* @error_sector: error sector
+ * @flags: BLKDEV_IFL_* flags to control behaviour
*
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to.
+ * wish to. If WAIT flag is not passed then caller may check only what
+ * request was pushed in some internal queue for later handling.
*/
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struc