1 files changed, 70 insertions, 54 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index d6da611f8f6..cd556b91478 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -28,16 +28,19 @@ cgroups. Here is what you can do.
 - Enable group scheduling in CFQ
 	CONFIG_CFQ_GROUP_IOSCHED=y
 
-- Compile and boot into kernel and mount IO controller (blkio).
+- Compile and boot into kernel and mount IO controller (blkio); see
+  cgroups.txt, Why are cgroups needed?.
 
-	mount -t cgroup -o blkio none /cgroup
+	mount -t tmpfs cgroup_root /sys/fs/cgroup
+	mkdir /sys/fs/cgroup/blkio
+	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
 
 - Create two cgroups
-	mkdir -p /cgroup/test1/ /cgroup/test2
+	mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
 
 - Set weights of group test1 and test2
-	echo 1000 > /cgroup/test1/blkio.weight
-	echo 500 > /cgroup/test2/blkio.weight
+	echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
+	echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
 
 - Create two same size files (say 512MB each) on same disk (file1, file2) and
   launch two dd threads in different cgroup to read those files.
@@ -46,12 +49,12 @@ cgroups. Here is what you can do.
 	echo 3 > /proc/sys/vm/drop_caches
 
 	dd if=/mnt/sdb/zerofile1 of=/dev/null &
-	echo $! > /cgroup/test1/tasks
-	cat /cgroup/test1/tasks
+	echo $! > /sys/fs/cgroup/blkio/test1/tasks
+	cat /sys/fs/cgroup/blkio/test1/tasks
 
 	dd if=/mnt/sdb/zerofile2 of=/dev/null &
-	echo $! > /cgroup/test2/tasks
-	cat /cgroup/test2/tasks
+	echo $! > /sys/fs/cgroup/blkio/test2/tasks
+	cat /sys/fs/cgroup/blkio/test2/tasks
 
 - At macro level, first dd should finish first. To get more precise data, keep
   on looking at (with the help of script), at blkio.disk_time and
@@ -68,13 +71,13 @@ Throttling/Upper Limit policy
 - Enable throttling in block layer
 	CONFIG_BLK_DEV_THROTTLING=y
 
-- Mount blkio controller
-        mount -t cgroup -o blkio none /cgroup/blkio
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
+        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
 
 - Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <byes_per_second>".
+  for policy is "<major>:<minor>  <bytes_per_second>".
 
-        echo "8:16  1048576" > /cgroup/blkio/blkio.read_bps_device
+        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
 
   Above will put a limit of 1MB/second on reads happening for root group
   on device having major/minor number 8:16.
@@ -87,7 +90,37 @@ Throttling/Upper Limit policy
         1024+0 records out
         4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
 
- Limits for writes can be put using blkio.write_bps_device file.
+ Limits for writes can be put using blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows.
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly.  For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following.
+
+				pivot
+			     /  /   \  \
+			root  test1 test2  test3
 
 Various user visible config options
 ===================================
@@ -113,7 +146,7 @@ Proportional weight policy files
 	- Specifies per cgroup weight. This is default weight of the group
 	  on all the devices until and unless overridden by per device rule.
 	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 100 to 1000.
+	  Currently allowed range of weights is from 10 to 1000.
 
 - blkio.weight_device
 	- One can specify per cgroup per device rules using this interface.
@@ -122,7 +155,7 @@ Proportional weight policy files
 
 	  Following is the format.
 
-	  #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+	  # echo dev_maj:dev_minor weight > blkio.weight_device
 	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
 	  # echo 8:16 300 > blkio.weight_device
 	  # cat blkio.weight_device
@@ -142,6 +175,12 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight tasks in the given cgroup has while
+          competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -236,7 +275,7 @@ Proportional weight policy files
 - blkio.idle_time
 	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
 	  This is the amount of time spent by the IO scheduler idling for a
-	  given cgroup in anticipation of a better request than the exising ones
+	  given cgroup in anticipation of a better request than the existing ones
 	  from other queues/cgroups. This is in nanoseconds. If this is read
 	  when the cgroup is in an idling state, the stat will only report the
 	  idle_time accumulated till the last idle period and will not include
@@ -249,38 +288,43 @@ Proportional weight policy files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
 Throttling/Upper limit policy files
 -----------------------------------
 - blkio.throttle.read_bps_device
 	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in bytes per second. Rules are per deivce. Following is
+	  specified in bytes per second. Rules are per device. Following is
 	  the format.
 
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.read_bps_device
+  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
 
 - blkio.throttle.write_bps_device
 	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in bytes per second. Rules are per deivce. Following is
+	  specified in bytes per second. Rules are per device. Following is
 	  the format.
 
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.write_bps_device
+  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
 
 - blkio.throttle.read_iops_device
 	- Specifies upper limit on READ rate from the device. IO rate is
-	  specified in IO per second. Rules are per deivce. Following is
+	  specified in IO per second. Rules are per device. Following is
 	  the format.
 
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.read_iops_device
+  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
 
 - blkio.throttle.write_iops_device
 	- Specifies upper limit on WRITE rate to the device. IO rate is
-	  specified in io per second. Rules are per deivce. Following is
+	  specified in io per second. Rules are per device. Following is
 	  the format.
 
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.write_iops_device
+  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
 
 Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjectd to both the constraints.
+      subjected to both the constraints.
 
 - blkio.throttle.io_serviced
 	- Number of IOs (bio) completed to/from the disk by the group (as
@@ -316,34 +360,6 @@ Common files among various policies
 
 CFQ sysfs tunable
 =================
-/sys/block/<disk>/queue/iosched/group_isolation
------------------------------------------------
-
-If group_isolation=1, it provides stronger isolation between groups at the
-expense of throughput. By default group_isolation is 0. In general that
-means that if group_isolation=0, expect fairness for sequential workload
-only. Set group_isolation=1 to see fairness for random IO workload also.
-
-Generally CFQ will put random seeky workload in sync-noidle category. CFQ
-will disable idling on these queues and it does a collective idling on group
-of such queues. Generally these are slow moving queues and if there is a
-sync-noidle service tree in each group, that group gets exclusive access to
-disk for certain period. That means it will bring the throughput down if
-group does not have enough IO to drive deeper queue depths and utilize disk
-capacity to the fullest in the slice allocated to it. But the flip side is
-that even a random reader should get better latencies and overall throughput
-if there are lots of sequential readers/sync-idle workload running in the
-system.
-
-If group_isolation=0, then CFQ automatically moves all the random seeky queues
-in the root group. That means there will be no service differentiation for
-that kind of workload. This leads to better throughput as we do collective
-idling on root sync-noidle tree.
-
-By default one should run with group_isolation=0. If that is not sufficient
-and one wants stronger isolation between groups, then set group_isolation=1
-but this will come at cost of reduced throughput.
-
 /sys/block/<disk>/queue/iosched/slice_idle
 ------------------------------------------
 On a faster hardware CFQ can be slow, especially with sequential workload.