From 3bf10fea3ba3804090e82aa59fabf25f43fa74c2 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 3 Oct 2012 16:56:56 -0400
Subject: cfq-iosched: Properly name all references to IO class

Currently CFQ has three IO classes, RT, BE and IDLE. At many a places we
are calling workloads belonging to these classes as "prio". This gets
very confusing as one starts to associate it with ioprio.

So this patch just does bunch of renaming so that reading code becomes
easier. All reference to RT, BE and IDLE workload are done using keyword
"class" and all references to subclass, SYNC, SYNC-IDLE, ASYNC are made
using keyword "type".

This makes me feel much better while I am reading the code. There is no
functionality change due to this patch.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 67 +++++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e62e9205b80..7646dfddba0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -155,7 +155,7 @@ struct cfq_queue {
  * First index in the service_trees.
  * IDLE is handled separately, so it has negative index
  */
-enum wl_prio_t {
+enum wl_class_t {
 	BE_WORKLOAD = 0,
 	RT_WORKLOAD = 1,
 	IDLE_WORKLOAD = 2,
@@ -250,7 +250,7 @@ struct cfq_group {
 
 	unsigned long saved_workload_slice;
 	enum wl_type_t saved_workload;
-	enum wl_prio_t saved_serving_prio;
+	enum wl_class_t saved_serving_class;
 
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
@@ -280,7 +280,7 @@ struct cfq_data {
 	/*
 	 * The priority currently being served
 	 */
-	enum wl_prio_t serving_prio;
+	enum wl_class_t serving_class;
 	enum wl_type_t serving_type;
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
@@ -354,16 +354,16 @@ struct cfq_data {
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
-					    enum wl_prio_t prio,
+					    enum wl_class_t class,
 					    enum wl_type_t type)
 {
 	if (!cfqg)
 		return NULL;
 
-	if (prio == IDLE_WORKLOAD)
+	if (class == IDLE_WORKLOAD)
 		return &cfqg->service_tree_idle;
 
-	return &cfqg->service_trees[prio][type];
+	return &cfqg->service_trees[class][type];
 }
 
 enum cfqq_state_flags {
@@ -732,7 +732,7 @@ static inline bool iops_mode(struct cfq_data *cfqd)
 		return false;
 }
 
-static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
+static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
 {
 	if (cfq_class_idle(cfqq))
 		return IDLE_WORKLOAD;
@@ -751,16 +751,16 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 	return SYNC_WORKLOAD;
 }
 
-static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
+static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
 					struct cfq_data *cfqd,
 					struct cfq_group *cfqg)
 {
-	if (wl == IDLE_WORKLOAD)
+	if (wl_class == IDLE_WORKLOAD)
 		return cfqg->service_tree_idle.count;
 
-	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
-		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
-		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
+	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count
+		+ cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count
+		+ cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
 }
 
 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
@@ -1304,7 +1304,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		cfqg->saved_workload_slice = cfqd->workload_expires
 						- jiffies;
 		cfqg->saved_workload = cfqd->serving_type;
-		cfqg->saved_serving_prio = cfqd->serving_prio;
+		cfqg->saved_serving_class = cfqd->serving_class;
 	} else
 		cfqg->saved_workload_slice = 0;
 
@@ -1616,7 +1616,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	int left;
 	int new_cfqq = 1;
 
-	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
+	service_tree = service_tree_for(cfqq->cfqg, cfqq_class(cfqq),
 						cfqq_type(cfqq));
 	if (cfq_class_idle(cfqq)) {
 		rb_key = CFQ_IDLE_DELAY;
@@ -2030,8 +2030,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 {
 	if (cfqq) {
-		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
-				cfqd->serving_prio, cfqd->serving_type);
+		cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
+				cfqd->serving_class, cfqd->serving_type);
 		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
@@ -2118,7 +2118,7 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *service_tree =
-		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
+		service_tree_for(cfqd->serving_group, cfqd->serving_class,
 					cfqd->serving_type);
 
 	if (!cfqd->rq_queued)
@@ -2285,7 +2285,7 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 
 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	enum wl_prio_t prio = cfqq_prio(cfqq);
+	enum wl_class_t wl_class = cfqq_class(cfqq);
 	struct cfq_rb_root *service_tree = cfqq->service_tree;
 
 	BUG_ON(!service_tree);
@@ -2295,7 +2295,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		return false;
 
 	/* We never do for idle class queues. */
-	if (prio == IDLE_WORKLOAD)
+	if (wl_class == IDLE_WORKLOAD)
 		return false;
 
 	/* We do for queues that were marked with idle window flag. */
@@ -2495,7 +2495,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 }
 
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
-				struct cfq_group *cfqg, enum wl_prio_t prio)
+			struct cfq_group *cfqg, enum wl_class_t wl_class)
 {
 	struct cfq_queue *queue;
 	int i;
@@ -2505,7 +2505,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		/* select the one with lowest rb_key */
-		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
+		queue = cfq_rb_first(service_tree_for(cfqg, wl_class, i));
 		if (queue &&
 		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
 			lowest_key = queue->rb_key;
@@ -2523,20 +2523,20 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
-	enum wl_prio_t original_prio = cfqd->serving_prio;
+	enum wl_class_t original_class = cfqd->serving_class;
 
 	/* Choose next priority. RT > BE > IDLE */
 	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_prio = RT_WORKLOAD;
+		cfqd->serving_class = RT_WORKLOAD;
 	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_prio = BE_WORKLOAD;
+		cfqd->serving_class = BE_WORKLOAD;
 	else {
-		cfqd->serving_prio = IDLE_WORKLOAD;
+		cfqd->serving_class = IDLE_WORKLOAD;
 		cfqd->workload_expires = jiffies + 1;
 		return;
 	}
 
-	if (original_prio != cfqd->serving_prio)
+	if (original_class != cfqd->serving_class)
 		goto new_workload;
 
 	/*
@@ -2544,7 +2544,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
 	 * expiration time
 	 */
-	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+	st = service_tree_for(cfqg, cfqd->serving_class, cfqd->serving_type);
 	count = st->count;
 
 	/*
@@ -2556,8 +2556,8 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 new_workload:
 	/* otherwise select new workload type */
 	cfqd->serving_type =
-		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
-	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+		cfq_choose_wl(cfqd, cfqg, cfqd->serving_class);
+	st = service_tree_for(cfqg, cfqd->serving_class, cfqd->serving_type);
 	count = st->count;
 
 	/*
@@ -2568,8 +2568,9 @@ new_workload:
 	group_slice = cfq_group_slice(cfqd, cfqg);
 
 	slice = group_slice * count /
-		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
-		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
+		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_class],
+		      cfq_group_busy_queues_wl(cfqd->serving_class, cfqd,
+					cfqg));
 
 	if (cfqd->serving_type == ASYNC_WORKLOAD) {
 		unsigned int tmp;
@@ -2620,7 +2621,7 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
 	if (cfqg->saved_workload_slice) {
 		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
 		cfqd->serving_type = cfqg->saved_workload;
-		cfqd->serving_prio = cfqg->saved_serving_prio;
+		cfqd->serving_class = cfqg->saved_serving_class;
 	} else
 		cfqd->workload_expires = jiffies - 1;
 
@@ -3645,7 +3646,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			service_tree = cfqq->service_tree;
 		else
 			service_tree = service_tree_for(cfqq->cfqg,
-				cfqq_prio(cfqq), cfqq_type(cfqq));
+				cfqq_class(cfqq), cfqq_type(cfqq));
 		service_tree->ttime.last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
-- 
cgit v1.2.3-18-g5258


From 4d2ceea4cb86060b03b2aa4826b365320bc78651 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 3 Oct 2012 16:56:57 -0400
Subject: cfq-iosched: More renaming to better represent wl_class and wl_type

Some more renaming. Again making the code uniform w.r.t use of
wl_class/class to represent IO class (RT, BE, IDLE) and using
wl_type/type to represent subclass (SYNC, SYNC-IDLE, ASYNC).

At places this patch shortens the string "workload" to "wl".
Renamed "saved_workload" to "saved_wl_type". Renamed
"saved_serving_class" to "saved_wl_class".

For uniformity with "saved_wl_*" variables, renamed "serving_class"
to "serving_wl_class" and renamed "serving_type" to "serving_wl_type".

Again, just trying to improve upon code uniformity and improve
readability. No functional change.

v2:
- Restored the usage of keyword "service" based on Jeff Moyer's feedback.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 64 +++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7646dfddba0..8f890bfba8f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -248,9 +248,9 @@ struct cfq_group {
 	struct cfq_rb_root service_trees[2][3];
 	struct cfq_rb_root service_tree_idle;
 
-	unsigned long saved_workload_slice;
-	enum wl_type_t saved_workload;
-	enum wl_class_t saved_serving_class;
+	unsigned long saved_wl_slice;
+	enum wl_type_t saved_wl_type;
+	enum wl_class_t saved_wl_class;
 
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
@@ -280,8 +280,8 @@ struct cfq_data {
 	/*
 	 * The priority currently being served
 	 */
-	enum wl_class_t serving_class;
-	enum wl_type_t serving_type;
+	enum wl_class_t serving_wl_class;
+	enum wl_type_t serving_wl_type;
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 
@@ -1241,7 +1241,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfq_group_service_tree_del(st, cfqg);
-	cfqg->saved_workload_slice = 0;
+	cfqg->saved_wl_slice = 0;
 	cfqg_stats_update_dequeue(cfqg);
 }
 
@@ -1301,12 +1301,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
-		cfqg->saved_workload_slice = cfqd->workload_expires
+		cfqg->saved_wl_slice = cfqd->workload_expires
 						- jiffies;
-		cfqg->saved_workload = cfqd->serving_type;
-		cfqg->saved_serving_class = cfqd->serving_class;
+		cfqg->saved_wl_type = cfqd->serving_wl_type;
+		cfqg->saved_wl_class = cfqd->serving_wl_class;
 	} else
-		cfqg->saved_workload_slice = 0;
+		cfqg->saved_wl_slice = 0;
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
@@ -2031,7 +2031,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 {
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
-				cfqd->serving_class, cfqd->serving_type);
+				cfqd->serving_wl_class, cfqd->serving_wl_type);
 		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
@@ -2118,8 +2118,8 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *service_tree =
-		service_tree_for(cfqd->serving_group, cfqd->serving_class,
-					cfqd->serving_type);
+		service_tree_for(cfqd->serving_group, cfqd->serving_wl_class,
+						cfqd->serving_wl_type);
 
 	if (!cfqd->rq_queued)
 		return NULL;
@@ -2523,20 +2523,20 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
-	enum wl_class_t original_class = cfqd->serving_class;
+	enum wl_class_t original_class = cfqd->serving_wl_class;
 
 	/* Choose next priority. RT > BE > IDLE */
 	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_class = RT_WORKLOAD;
+		cfqd->serving_wl_class = RT_WORKLOAD;
 	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_class = BE_WORKLOAD;
+		cfqd->serving_wl_class = BE_WORKLOAD;
 	else {
-		cfqd->serving_class = IDLE_WORKLOAD;
+		cfqd->serving_wl_class = IDLE_WORKLOAD;
 		cfqd->workload_expires = jiffies + 1;
 		return;
 	}
 
-	if (original_class != cfqd->serving_class)
+	if (original_class != cfqd->serving_wl_class)
 		goto new_workload;
 
 	/*
@@ -2544,7 +2544,8 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
 	 * expiration time
 	 */
-	st = service_tree_for(cfqg, cfqd->serving_class, cfqd->serving_type);
+	st = service_tree_for(cfqg, cfqd->serving_wl_class,
+					cfqd->serving_wl_type);
 	count = st->count;
 
 	/*
@@ -2555,9 +2556,10 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 new_workload:
 	/* otherwise select new workload type */
-	cfqd->serving_type =
-		cfq_choose_wl(cfqd, cfqg, cfqd->serving_class);
-	st = service_tree_for(cfqg, cfqd->serving_class, cfqd->serving_type);
+	cfqd->serving_wl_type = cfq_choose_wl(cfqd, cfqg,
+					cfqd->serving_wl_class);
+	st = service_tree_for(cfqg, cfqd->serving_wl_class,
+					cfqd->serving_wl_type);
 	count = st->count;
 
 	/*
@@ -2568,11 +2570,11 @@ new_workload:
 	group_slice = cfq_group_slice(cfqd, cfqg);
 
 	slice = group_slice * count /
-		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_class],
-		      cfq_group_busy_queues_wl(cfqd->serving_class, cfqd,
+		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
+		      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
 					cfqg));
 
-	if (cfqd->serving_type == ASYNC_WORKLOAD) {
+	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
 		unsigned int tmp;
 
 		/*
@@ -2618,10 +2620,10 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
 	cfqd->serving_group = cfqg;
 
 	/* Restore the workload type data */
-	if (cfqg->saved_workload_slice) {
-		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
-		cfqd->serving_type = cfqg->saved_workload;
-		cfqd->serving_class = cfqg->saved_serving_class;
+	if (cfqg->saved_wl_slice) {
+		cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
+		cfqd->serving_wl_type = cfqg->saved_wl_type;
+		cfqd->serving_wl_class = cfqg->saved_wl_class;
 	} else
 		cfqd->workload_expires = jiffies - 1;
 
@@ -3404,7 +3406,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		return true;
 
 	/* Allow preemption only if we are idling on sync-noidle tree */
-	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
 	    new_cfqq->service_tree->count == 2 &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3456,7 +3458,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * doesn't happen
 	 */
 	if (old_type != cfqq_type(cfqq))
-		cfqq->cfqg->saved_workload_slice = 0;
+		cfqq->cfqg->saved_wl_slice = 0;
 
 	/*
 	 * Put the new queue at the front of the of the current list,
-- 
cgit v1.2.3-18-g5258


From 34b98d03bd6e3f3c67af1e4933aaf19887a61192 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 3 Oct 2012 16:56:58 -0400
Subject: cfq-iosched: Rename "service_tree" to "st" at some places

At quite a few places we use the keyword "service_tree". At some places,
especially local variables, I have abbreviated it to "st".

Also at couple of places moved binary operator "+" from beginning of line
to end of previous line, as per Tejun's feedback.

v2:
 Reverted most of the service tree name change based on Jeff Moyer's feedback.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 77 +++++++++++++++++++++++++----------------------------
 1 file changed, 36 insertions(+), 41 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 8f890bfba8f..db4a1a52c3d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -353,7 +353,7 @@ struct cfq_data {
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
-static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
+static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
 					    enum wl_class_t class,
 					    enum wl_type_t type)
 {
@@ -758,16 +758,16 @@ static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
 	if (wl_class == IDLE_WORKLOAD)
 		return cfqg->service_tree_idle.count;
 
-	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count
-		+ cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count
-		+ cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
+	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
+		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
+		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
 }
 
 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 					struct cfq_group *cfqg)
 {
-	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
-		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
+	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
+		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
 }
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
@@ -1612,15 +1612,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct rb_node **p, *parent;
 	struct cfq_queue *__cfqq;
 	unsigned long rb_key;
-	struct cfq_rb_root *service_tree;
+	struct cfq_rb_root *st;
 	int left;
 	int new_cfqq = 1;
 
-	service_tree = service_tree_for(cfqq->cfqg, cfqq_class(cfqq),
-						cfqq_type(cfqq));
+	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
 	if (cfq_class_idle(cfqq)) {
 		rb_key = CFQ_IDLE_DELAY;
-		parent = rb_last(&service_tree->rb);
+		parent = rb_last(&st->rb);
 		if (parent && parent != &cfqq->rb_node) {
 			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 			rb_key += __cfqq->rb_key;
@@ -1638,7 +1637,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->slice_resid = 0;
 	} else {
 		rb_key = -HZ;
-		__cfqq = cfq_rb_first(service_tree);
+		__cfqq = cfq_rb_first(st);
 		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
 	}
 
@@ -1647,8 +1646,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		/*
 		 * same position, nothing more to do
 		 */
-		if (rb_key == cfqq->rb_key &&
-		    cfqq->service_tree == service_tree)
+		if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
 			return;
 
 		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
@@ -1657,8 +1655,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	left = 1;
 	parent = NULL;
-	cfqq->service_tree = service_tree;
-	p = &service_tree->rb.rb_node;
+	cfqq->service_tree = st;
+	p = &st->rb.rb_node;
 	while (*p) {
 		struct rb_node **n;
 
@@ -1679,12 +1677,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	}
 
 	if (left)
-		service_tree->left = &cfqq->rb_node;
+		st->left = &cfqq->rb_node;
 
 	cfqq->rb_key = rb_key;
 	rb_link_node(&cfqq->rb_node, parent, p);
-	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
-	service_tree->count++;
+	rb_insert_color(&cfqq->rb_node, &st->rb);
+	st->count++;
 	if (add_front || !new_cfqq)
 		return;
 	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
@@ -2117,19 +2115,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
  */
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
-	struct cfq_rb_root *service_tree =
-		service_tree_for(cfqd->serving_group, cfqd->serving_wl_class,
-						cfqd->serving_wl_type);
+	struct cfq_rb_root *st = st_for(cfqd->serving_group,
+			cfqd->serving_wl_class, cfqd->serving_wl_type);
 
 	if (!cfqd->rq_queued)
 		return NULL;
 
 	/* There is nothing to dispatch */
-	if (!service_tree)
+	if (!st)
 		return NULL;
-	if (RB_EMPTY_ROOT(&service_tree->rb))
+	if (RB_EMPTY_ROOT(&st->rb))
 		return NULL;
-	return cfq_rb_first(service_tree);
+	return cfq_rb_first(st);
 }
 
 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
@@ -2286,10 +2283,10 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	enum wl_class_t wl_class = cfqq_class(cfqq);
-	struct cfq_rb_root *service_tree = cfqq->service_tree;
+	struct cfq_rb_root *st = cfqq->service_tree;
 
-	BUG_ON(!service_tree);
-	BUG_ON(!service_tree->count);
+	BUG_ON(!st);
+	BUG_ON(!st->count);
 
 	if (!cfqd->cfq_slice_idle)
 		return false;
@@ -2307,11 +2304,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * Otherwise, we do only if they are the last ones
 	 * in their service tree.
 	 */
-	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
-	   !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
+	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
+	   !cfq_io_thinktime_big(cfqd, &st->ttime, false))
 		return true;
-	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
-			service_tree->count);
+	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
 	return false;
 }
 
@@ -2505,7 +2501,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		/* select the one with lowest rb_key */
-		queue = cfq_rb_first(service_tree_for(cfqg, wl_class, i));
+		queue = cfq_rb_first(st_for(cfqg, wl_class, i));
 		if (queue &&
 		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
 			lowest_key = queue->rb_key;
@@ -2544,8 +2540,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
 	 * expiration time
 	 */
-	st = service_tree_for(cfqg, cfqd->serving_wl_class,
-					cfqd->serving_wl_type);
+	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
 	count = st->count;
 
 	/*
@@ -2558,8 +2553,7 @@ new_workload:
 	/* otherwise select new workload type */
 	cfqd->serving_wl_type = cfq_choose_wl(cfqd, cfqg,
 					cfqd->serving_wl_class);
-	st = service_tree_for(cfqg, cfqd->serving_wl_class,
-					cfqd->serving_wl_type);
+	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
 	count = st->count;
 
 	/*
@@ -3640,16 +3634,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 	if (sync) {
-		struct cfq_rb_root *service_tree;
+		struct cfq_rb_root *st;
 
 		RQ_CIC(rq)->ttime.last_end_request = now;
 
 		if (cfq_cfqq_on_rr(cfqq))
-			service_tree = cfqq->service_tree;
+			st = cfqq->service_tree;
 		else
-			service_tree = service_tree_for(cfqq->cfqg,
-				cfqq_class(cfqq), cfqq_type(cfqq));
-		service_tree->ttime.last_end_request = now;
+			st = st_for(cfqq->cfqg, cfqq_class(cfqq),
+					cfqq_type(cfqq));
+
+		st->ttime.last_end_request = now;
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
-- 
cgit v1.2.3-18-g5258


From 6d816ec7c83871da7c2472af7479d2438e641052 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 3 Oct 2012 16:56:59 -0400
Subject: cfq-iosched: Rename few functions related to selecting workload

choose_service_tree() selects/sets both wl_class and wl_type.  Rename it to
choose_wl_class_and_type() to make it very clear.

cfq_choose_wl() only selects and sets wl_type. It is easy to confuse
it with choose_st(). So rename it to cfq_choose_wl_type() to make
it clear what does it do.

Just renaming. No functionality change.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index db4a1a52c3d..e34e142b6db 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2490,7 +2490,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 	}
 }
 
-static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
+static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
 			struct cfq_group *cfqg, enum wl_class_t wl_class)
 {
 	struct cfq_queue *queue;
@@ -2513,7 +2513,8 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 	return cur_best;
 }
 
-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void
+choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	unsigned slice;
 	unsigned count;
@@ -2551,7 +2552,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 new_workload:
 	/* otherwise select new workload type */
-	cfqd->serving_wl_type = cfq_choose_wl(cfqd, cfqg,
+	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
 					cfqd->serving_wl_class);
 	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
 	count = st->count;
@@ -2621,7 +2622,7 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
 	} else
 		cfqd->workload_expires = jiffies - 1;
 
-	choose_service_tree(cfqd, cfqg);
+	choose_wl_class_and_type(cfqd, cfqg);
 }
 
 /*
-- 
cgit v1.2.3-18-g5258


From 1f23f12151ab508728dd5fca12180e2fce6c6949 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 3 Oct 2012 16:57:00 -0400
Subject: cfq-iosched: Get rid of unnecessary local variable

Use of local varibale "n" seems to be unnecessary. Remove it. This brings
it inline with function __cfq_group_st_add(), which is also doing the
similar operation of adding a group to a rb tree.

No functionality change here.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e34e142b6db..5ad4cae1beb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1658,8 +1658,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfqq->service_tree = st;
 	p = &st->rb.rb_node;
 	while (*p) {
-		struct rb_node **n;
-
 		parent = *p;
 		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
@@ -1667,13 +1665,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * sort by key, that represents service time.
 		 */
 		if (time_before(rb_key, __cfqq->rb_key))
-			n = &(*p)->rb_left;
+			p = &parent->rb_left;
 		else {
-			n = &(*p)->rb_right;
+			p = &parent->rb_right;
 			left = 0;
 		}
-
-		p = n;
 	}
 
 	if (left)
-- 
cgit v1.2.3-18-g5258


From b226e5c411759eec29308f0ea38e918aa695dc7f Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 3 Oct 2012 16:57:01 -0400
Subject: cfq-iosched: Print sync-noidle information in blktrace messages

Currently we attach a character "S" or "A" to the cfqq<pid>, to represent
whether queues is sync or async. Add one more character "N" to represent
whether it is sync-noidle queue or sync queue. So now three different
type of queues will look as follows.

cfq1234S   --> sync queus
cfq1234SN  --> sync noidle queue
cfq1234A   --> Async queue

Previously S/A classification was being printed only if group scheduling
was enabled. This patch also makes sure that this classification is
displayed even if group idling is disabled.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/cfq-iosched.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5ad4cae1beb..bc076f45ca4 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -586,8 +586,9 @@ static inline void cfqg_put(struct cfq_group *cfqg)
 	char __pbuf[128];						\
 									\
 	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
-	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
-			  cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
+	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
+			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
+			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 			  __pbuf, ##args);				\
 } while (0)
 
@@ -675,7 +676,10 @@ static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
-	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
+	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
+			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
+			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
+				##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
 
 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
-- 
cgit v1.2.3-18-g5258


From e71357e118bdd4057e3bc020b9d80fecdd08f588 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:10 -0800
Subject: cfq-iosched: add leaf_weight

cfq blkcg is about to grow proper hierarchy handling, where a child
blkg's weight would nest inside the parent's.  This makes tasks in a
blkg to compete against both tasks in the sibling blkgs and the tasks
of child blkgs.

We're gonna use the existing weight as the group weight which decides
the blkg's weight against its siblings.  This patch introduces a new
weight - leaf_weight - which decides the weight of a blkg against the
child blkgs.

It's named leaf_weight because another way to look at it is that each
internal blkg nodes have a hidden child leaf node which contains all
its tasks and leaf_weight is the weight of the leaf node and handled
the same as the weight of the child blkgs.

This patch only adds leaf_weight fields and exposes it to userland.
The new weight isn't actually used anywhere yet.  Note that
cfq-iosched currently offcially supports only single level hierarchy
and root blkgs compete with the first level blkgs - ie. root weight is
basically being used as leaf_weight.  For root blkgs, the two weights
are kept in sync for backward compatibility.

v2: cfqd->root_group->leaf_weight initialization was missing from
    cfq_init_queue() causing divide by zero when
    !CONFIG_CFQ_GROUP_SCHED.  Fix it.  Reported by Fengguang.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
---
 block/cfq-iosched.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 126 insertions(+), 8 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index bc076f45ca4..175218d66d6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -223,10 +223,21 @@ struct cfq_group {
 
 	/* group service_tree key */
 	u64 vdisktime;
+
+	/*
+	 * There are two weights - (internal) weight is the weight of this
+	 * cfqg against the sibling cfqgs.  leaf_weight is the wight of
+	 * this cfqg against the child cfqgs.  For the root cfqg, both
+	 * weights are kept in sync for backward compatibility.
+	 */
 	unsigned int weight;
 	unsigned int new_weight;
 	unsigned int dev_weight;
 
+	unsigned int leaf_weight;
+	unsigned int new_leaf_weight;
+	unsigned int dev_leaf_weight;
+
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 
@@ -1182,10 +1193,16 @@ static void
 cfq_update_group_weight(struct cfq_group *cfqg)
 {
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+
 	if (cfqg->new_weight) {
 		cfqg->weight = cfqg->new_weight;
 		cfqg->new_weight = 0;
 	}
+
+	if (cfqg->new_leaf_weight) {
+		cfqg->leaf_weight = cfqg->new_leaf_weight;
+		cfqg->new_leaf_weight = 0;
+	}
 }
 
 static void
@@ -1348,6 +1365,7 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
 
 	cfq_init_cfqg_base(cfqg);
 	cfqg->weight = blkg->blkcg->cfq_weight;
+	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
 }
 
 /*
@@ -1404,6 +1422,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
+					  struct blkg_policy_data *pd, int off)
+{
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
+
+	if (!cfqg->dev_leaf_weight)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
+}
+
+static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
+					 struct cftype *cft,
+					 struct seq_file *sf)
+{
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
+			  false);
+	return 0;
+}
+
 static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
 			    struct seq_file *sf)
 {
@@ -1411,8 +1449,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
-static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				  const char *buf)
+static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
+				 struct seq_file *sf)
+{
+	seq_printf(sf, "%u\n",
+		   cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
+	return 0;
+}
+
+static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				    const char *buf, bool is_leaf_weight)
 {
 	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 	struct blkg_conf_ctx ctx;
@@ -1426,8 +1472,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	ret = -EINVAL;
 	cfqg = blkg_to_cfqg(ctx.blkg);
 	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
-		cfqg->dev_weight = ctx.v;
-		cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
+		if (!is_leaf_weight) {
+			cfqg->dev_weight = ctx.v;
+			cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+		} else {
+			cfqg->dev_leaf_weight = ctx.v;
+			cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+		}
 		ret = 0;
 	}
 
@@ -1435,7 +1486,20 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
-static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				  const char *buf)
+{
+	return __cfqg_set_weight_device(cgrp, cft, buf, false);
+}
+
+static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				       const char *buf)
+{
+	return __cfqg_set_weight_device(cgrp, cft, buf, true);
+}
+
+static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
+			    bool is_leaf_weight)
 {
 	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 	struct blkcg_gq *blkg;
@@ -1445,19 +1509,41 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		return -EINVAL;
 
 	spin_lock_irq(&blkcg->lock);
-	blkcg->cfq_weight = (unsigned int)val;
+
+	if (!is_leaf_weight)
+		blkcg->cfq_weight = val;
+	else
+		blkcg->cfq_leaf_weight = val;
 
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
-		if (cfqg && !cfqg->dev_weight)
-			cfqg->new_weight = blkcg->cfq_weight;
+		if (!cfqg)
+			continue;
+
+		if (!is_leaf_weight) {
+			if (!cfqg->dev_weight)
+				cfqg->new_weight = blkcg->cfq_weight;
+		} else {
+			if (!cfqg->dev_leaf_weight)
+				cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+		}
 	}
 
 	spin_unlock_irq(&blkcg->lock);
 	return 0;
 }
 
+static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	return __cfq_set_weight(cgrp, cft, val, false);
+}
+
+static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	return __cfq_set_weight(cgrp, cft, val, true);
+}
+
 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 			   struct seq_file *sf)
 {
@@ -1518,6 +1604,37 @@ static struct cftype cfq_blkcg_files[] = {
 		.read_seq_string = cfq_print_weight,
 		.write_u64 = cfq_set_weight,
 	},
+
+	/* on root, leaf_weight is mapped to weight */
+	{
+		.name = "leaf_weight_device",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cfqg_print_weight_device,
+		.write_string = cfqg_set_weight_device,
+		.max_write_len = 256,
+	},
+	{
+		.name = "leaf_weight",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cfq_print_weight,
+		.write_u64 = cfq_set_weight,
+	},
+
+	/* no such mapping necessary for !roots */
+	{
+		.name = "leaf_weight_device",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_seq_string = cfqg_print_leaf_weight_device,
+		.write_string = cfqg_set_leaf_weight_device,
+		.max_write_len = 256,
+	},
+	{
+		.name = "leaf_weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_seq_string = cfq_print_leaf_weight,
+		.write_u64 = cfq_set_leaf_weight,
+	},
+
 	{
 		.name = "time",
 		.private = offsetof(struct cfq_group, stats.time),
@@ -3992,6 +4109,7 @@ static int cfq_init_queue(struct request_queue *q)
 	cfq_init_cfqg_base(cfqd->root_group);
 #endif
 	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
+	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
-- 
cgit v1.2.3-18-g5258


From 7918ffb5b83e3373206ada84873c674fbddf61cc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:11 -0800
Subject: cfq-iosched: implement cfq_group->nr_active and ->children_weight

To prepare for blkcg hierarchy support, add cfqg->nr_active and
->children_weight.  cfqg->nr_active counts the number of active cfqgs
at the cfqg's level and ->children_weight is sum of weights of those
cfqgs.  The level covers itself (cfqg->leaf_weight) and immediate
children.

The two values are updated when a cfqg enters and leaves the group
service tree.  Unless the hierarchy is very deep, the added overhead
should be negligible.

Currently, the parent is determined using cfqg_flat_parent() which
makes the root cfqg the parent of all other cfqgs.  This is to make
the transition to hierarchy-aware scheduling gradual.  Scheduling
logic will be converted to use cfqg->children_weight without actually
changing the behavior.  When everything is ready,
blkcg_weight_parent() will be replaced with proper parent function.

This patch doesn't introduce any behavior chagne.

v2: s/cfqg->level_weight/cfqg->children_weight/ as per Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 175218d66d6..7701c3fe49b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -224,6 +224,18 @@ struct cfq_group {
 	/* group service_tree key */
 	u64 vdisktime;
 
+	/*
+	 * The number of active cfqgs and sum of their weights under this
+	 * cfqg.  This covers this cfqg's leaf_weight and all children's
+	 * weights, but does not cover weights of further descendants.
+	 *
+	 * If a cfqg is on the service tree, it's active.  An active cfqg
+	 * also activates its parent and contributes to the children_weight
+	 * of the parent.
+	 */
+	int nr_active;
+	unsigned int children_weight;
+
 	/*
 	 * There are two weights - (internal) weight is the weight of this
 	 * cfqg against the sibling cfqgs.  leaf_weight is the wight of
@@ -583,6 +595,22 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 }
 
+/*
+ * Determine the parent cfqg for weight calculation.  Currently, cfqg
+ * scheduling is flat and the root is the parent of everyone else.
+ */
+static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg)
+{
+	struct blkcg_gq *blkg = cfqg_to_blkg(cfqg);
+	struct cfq_group *root;
+
+	while (blkg->parent)
+		blkg = blkg->parent;
+	root = blkg_to_cfqg(blkg);
+
+	return root != cfqg ? root : NULL;
+}
+
 static inline void cfqg_get(struct cfq_group *cfqg)
 {
 	return blkg_get(cfqg_to_blkg(cfqg));
@@ -683,6 +711,7 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
+static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; }
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
@@ -1208,11 +1237,33 @@ cfq_update_group_weight(struct cfq_group *cfqg)
 static void
 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
+	struct cfq_group *pos = cfqg;
+	bool propagate;
+
+	/* add to the service tree */
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
 
 	cfq_update_group_weight(cfqg);
 	__cfq_group_service_tree_add(st, cfqg);
 	st->total_weight += cfqg->weight;
+
+	/*
+	 * Activate @cfqg and propagate activation upwards until we meet an
+	 * already activated node or reach root.
+	 */
+	propagate = !pos->nr_active++;
+	pos->children_weight += pos->leaf_weight;
+
+	while (propagate) {
+		struct cfq_group *parent = cfqg_flat_parent(pos);
+
+		if (!parent)
+			break;
+
+		propagate = !parent->nr_active++;
+		parent->children_weight += pos->weight;
+		pos = parent;
+	}
 }
 
 static void
@@ -1243,6 +1294,31 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 static void
 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
+	struct cfq_group *pos = cfqg;
+	bool propagate;
+
+	/*
+	 * Undo activation from cfq_group_service_tree_add().  Deactivate
+	 * @cfqg and propagate deactivation upwards.
+	 */
+	propagate = !--pos->nr_active;
+	pos->children_weight -= pos->leaf_weight;
+
+	while (propagate) {
+		struct cfq_group *parent = cfqg_flat_parent(pos);
+
+		/* @pos has 0 nr_active at this point */
+		WARN_ON_ONCE(pos->children_weight);
+
+		if (!parent)
+			break;
+
+		propagate = !--parent->nr_active;
+		parent->children_weight -= pos->weight;
+		pos = parent;
+	}
+
+	/* remove from the service tree */
 	st->total_weight -= cfqg->weight;
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		cfq_rb_erase(&cfqg->rb_node, st);
-- 
cgit v1.2.3-18-g5258


From 1d3650f713e7f6392b02fde450c5bae40291e65b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:11 -0800
Subject: cfq-iosched: implement hierarchy-ready cfq_group charge scaling

Currently, cfqg charges are scaled directly according to cfqg->weight.
Regardless of the number of active cfqgs or the amount of active
weights, a given weight value always scales charge the same way.  This
works fine as long as all cfqgs are treated equally regardless of
their positions in the hierarchy, which is what cfq currently
implements.  It can't work in hierarchical settings because the
interpretation of a given weight value depends on where the weight is
located in the hierarchy.

This patch reimplements cfqg charge scaling so that it can be used to
support hierarchy properly.  The scheme is fairly simple and
light-weight.

* When a cfqg is added to the service tree, v(disktime)weight is
  calculated.  It walks up the tree to root calculating the fraction
  it has in the hierarchy.  At each level, the fraction can be
  calculated as

    cfqg->weight / parent->level_weight

  By compounding these, the global fraction of vdisktime the cfqg has
  claim to - vfraction - can be determined.

* When the cfqg needs to be charged, the charge is scaled inversely
  proportionally to the vfraction.

The new scaling scheme uses the same CFQ_SERVICE_SHIFT for fixed point
representation as before; however, the smallest scaling factor is now
1 (ie. 1 << CFQ_SERVICE_SHIFT).  This is different from before where 1
was for CFQ_WEIGHT_DEFAULT and higher weight would result in smaller
scaling factor.

While this shifts the global scale of vdisktime a bit, it doesn't
change the relative relationships among cfqgs and the scheduling
result isn't different.

cfq_group_notify_queue_add uses fixed CFQ_IDLE_DELAY when appending
new cfqg to the service tree.  The specific value of CFQ_IDLE_DELAY
didn't have any relevance to vdisktime before and is unlikely to cause
any visible behavior difference now especially as the scale shift
isn't that large.

As the new scheme now makes proper distinction between cfqg->weight
and ->leaf_weight, reverse the weight aliasing for root cfqgs.  For
root, both weights are now mapped to ->leaf_weight instead of the
other way around.

Because we're still using cfqg_flat_parent(), this patch shouldn't
change the scheduling behavior in any noticeable way.

v2: Beefed up comments on vfraction as requested by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 107 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 77 insertions(+), 30 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7701c3fe49b..b24acf66d5b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -236,6 +236,18 @@ struct cfq_group {
 	int nr_active;
 	unsigned int children_weight;
 
+	/*
+	 * vfraction is the fraction of vdisktime that the tasks in this
+	 * cfqg are entitled to.  This is determined by compounding the
+	 * ratios walking up from this cfqg to the root.
+	 *
+	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
+	 * vfractions on a service tree is approximately 1.  The sum may
+	 * deviate a bit due to rounding errors and fluctuations caused by
+	 * cfqgs entering and leaving the service tree.
+	 */
+	unsigned int vfraction;
+
 	/*
 	 * There are two weights - (internal) weight is the weight of this
 	 * cfqg against the sibling cfqgs.  leaf_weight is the wight of
@@ -891,13 +903,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+/**
+ * cfqg_scale_charge - scale disk time charge according to cfqg weight
+ * @charge: disk time being charged
+ * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
+ *
+ * Scale @charge according to @vfraction, which is in range (0, 1].  The
+ * scaling is inversely proportional.
+ *
+ * scaled = charge / vfraction
+ *
+ * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
+ */
+static inline u64 cfqg_scale_charge(unsigned long charge,
+				    unsigned int vfraction)
 {
-	u64 d = delta << CFQ_SERVICE_SHIFT;
+	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */
 
-	d = d * CFQ_WEIGHT_DEFAULT;
-	do_div(d, cfqg->weight);
-	return d;
+	/* charge / vfraction */
+	c <<= CFQ_SERVICE_SHIFT;
+	do_div(c, vfraction);
+	return c;
 }
 
 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
@@ -1237,7 +1263,9 @@ cfq_update_group_weight(struct cfq_group *cfqg)
 static void
 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
+	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;	/* start with 1 */
 	struct cfq_group *pos = cfqg;
+	struct cfq_group *parent;
 	bool propagate;
 
 	/* add to the service tree */
@@ -1248,22 +1276,34 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	st->total_weight += cfqg->weight;
 
 	/*
-	 * Activate @cfqg and propagate activation upwards until we meet an
-	 * already activated node or reach root.
+	 * Activate @cfqg and calculate the portion of vfraction @cfqg is
+	 * entitled to.  vfraction is calculated by walking the tree
+	 * towards the root calculating the fraction it has at each level.
+	 * The compounded ratio is how much vfraction @cfqg owns.
+	 *
+	 * Start with the proportion tasks in this cfqg has against active
+	 * children cfqgs - its leaf_weight against children_weight.
 	 */
 	propagate = !pos->nr_active++;
 	pos->children_weight += pos->leaf_weight;
+	vfr = vfr * pos->leaf_weight / pos->children_weight;
 
-	while (propagate) {
-		struct cfq_group *parent = cfqg_flat_parent(pos);
-
-		if (!parent)
-			break;
-
-		propagate = !parent->nr_active++;
-		parent->children_weight += pos->weight;
+	/*
+	 * Compound ->weight walking up the tree.  Both activation and
+	 * vfraction calculation are done in the same loop.  Propagation
+	 * stops once an already activated node is met.  vfraction
+	 * calculation should always continue to the root.
+	 */
+	while ((parent = cfqg_flat_parent(pos))) {
+		if (propagate) {
+			propagate = !parent->nr_active++;
+			parent->children_weight += pos->weight;
+		}
+		vfr = vfr * pos->weight / parent->children_weight;
 		pos = parent;
 	}
+
+	cfqg->vfraction = max_t(unsigned, vfr, 1);
 }
 
 static void
@@ -1309,6 +1349,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
 
 		/* @pos has 0 nr_active at this point */
 		WARN_ON_ONCE(pos->children_weight);
+		pos->vfraction = 0;
 
 		if (!parent)
 			break;
@@ -1381,6 +1422,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	unsigned int used_sl, charge, unaccounted_sl = 0;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
+	unsigned int vfr;
 
 	BUG_ON(nr_sync < 0);
 	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
@@ -1390,10 +1432,15 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 
-	/* Can't update vdisktime while group is on service tree */
+	/*
+	 * Can't update vdisktime while on service tree and cfqg->vfraction
+	 * is valid only while on it.  Cache vfr, leave the service tree,
+	 * update vdisktime and go back on.  The re-addition to the tree
+	 * will also update the weights as necessary.
+	 */
+	vfr = cfqg->vfraction;
 	cfq_group_service_tree_del(st, cfqg);
-	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
-	/* If a new weight was requested, update now, off tree */
+	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
 	cfq_group_service_tree_add(st, cfqg);
 
 	/* This group is being expired. Save the context */
@@ -1669,44 +1716,44 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 
 static struct cftype cfq_blkcg_files[] = {
+	/* on root, weight is mapped to leaf_weight */
 	{
 		.name = "weight_device",
-		.read_seq_string = cfqg_print_weight_device,
-		.write_string = cfqg_set_weight_device,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cfqg_print_leaf_weight_device,
+		.write_string = cfqg_set_leaf_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
-		.read_seq_string = cfq_print_weight,
-		.write_u64 = cfq_set_weight,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cfq_print_leaf_weight,
+		.write_u64 = cfq_set_leaf_weight,
 	},
 
-	/* on root, leaf_weight is mapped to weight */
+	/* no such mapping necessary for !roots */
 	{
-		.name = "leaf_weight_device",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.name = "weight_device",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = cfqg_print_weight_device,
 		.write_string = cfqg_set_weight_device,
 		.max_write_len = 256,
 	},
 	{
-		.name = "leaf_weight",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = cfq_print_weight,
 		.write_u64 = cfq_set_weight,
 	},
 
-	/* no such mapping necessary for !roots */
 	{
 		.name = "leaf_weight_device",
-		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "leaf_weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = cfq_print_leaf_weight,
 		.write_u64 = cfq_set_leaf_weight,
 	},
-- 
cgit v1.2.3-18-g5258


From 41cad6ab2cb9ccb3b11546ad56b8b285e47c6279 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:11 -0800
Subject: cfq-iosched: convert cfq_group_slice() to use cfqg->vfraction

cfq_group_slice() calculates slice by taking a fraction of
cfq_target_latency according to the ratio of cfqg->weight against
service_tree->total_weight.  This currently works only because all
cfqgs are treated to be at the same level.

To prepare for proper hierarchy support, convert cfq_group_slice() to
base the calculation on cfqg->vfraction.  As cfqg->vfraction is always
a fraction of 1 and represents the fraction allocated to the cfqg with
hierarchy considered, the slice can be simply calculated by
multiplying cfqg->vfraction to cfq_target_latency (with fixed point
shift factored in).

As vfraction calculation currently treats all non-root cfqgs as
children of the root cfqg, this patch doesn't introduce noticeable
behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b24acf66d5b..ee342826fd9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -85,7 +85,6 @@ struct cfq_rb_root {
 	struct rb_root rb;
 	struct rb_node *left;
 	unsigned count;
-	unsigned total_weight;
 	u64 min_vdisktime;
 	struct cfq_ttime ttime;
 };
@@ -979,9 +978,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-
-	return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
+	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
 }
 
 static inline unsigned
@@ -1273,7 +1270,6 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 
 	cfq_update_group_weight(cfqg);
 	__cfq_group_service_tree_add(st, cfqg);
-	st->total_weight += cfqg->weight;
 
 	/*
 	 * Activate @cfqg and calculate the portion of vfraction @cfqg is
@@ -1360,7 +1356,6 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	}
 
 	/* remove from the service tree */
-	st->total_weight -= cfqg->weight;
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		cfq_rb_erase(&cfqg->rb_node, st);
 }
-- 
cgit v1.2.3-18-g5258


From d02f7aa8dce8166dbbc515ce393912aa45e6b8a6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:11 -0800
Subject: cfq-iosched: enable full blkcg hierarchy support

With the previous two patches, all cfqg scheduling decisions are based
on vfraction and ready for hierarchy support.  The only thing which
keeps the behavior flat is cfqg_flat_parent() which makes vfraction
calculation consider all non-root cfqgs children of the root cfqg.

Replace it with cfqg_parent() which returns the real parent.  This
enables full blkcg hierarchy support for cfq-iosched.  For example,
consider the following hierarchy.

        root
      /      \
   A:500      B:250
  /     \
 AA:500  AB:1000

For simplicity, let's say all the leaf nodes have active tasks and are
on service tree.  For each leaf node, vfraction would be

 AA: (500  / 1500) * (500 / 750) =~ 0.2222
 AB: (1000 / 1500) * (500 / 750) =~ 0.4444
  B:                 (250 / 750) =~ 0.3333

and vdisktime will be distributed accordingly.  For more detail,
please refer to Documentation/block/cfq-iosched.txt.

v2: cfq-iosched.txt updated to describe group scheduling as suggested
    by Vivek.

v3: blkio-controller.txt updated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee342826fd9..e8f31069b37 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -606,20 +606,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 }
 
-/*
- * Determine the parent cfqg for weight calculation.  Currently, cfqg
- * scheduling is flat and the root is the parent of everyone else.
- */
-static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg)
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 {
-	struct blkcg_gq *blkg = cfqg_to_blkg(cfqg);
-	struct cfq_group *root;
-
-	while (blkg->parent)
-		blkg = blkg->parent;
-	root = blkg_to_cfqg(blkg);
+	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
 
-	return root != cfqg ? root : NULL;
+	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 }
 
 static inline void cfqg_get(struct cfq_group *cfqg)
@@ -722,7 +713,7 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
 
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
-static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; }
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
@@ -1290,7 +1281,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	 * stops once an already activated node is met.  vfraction
 	 * calculation should always continue to the root.
 	 */
-	while ((parent = cfqg_flat_parent(pos))) {
+	while ((parent = cfqg_parent(pos))) {
 		if (propagate) {
 			propagate = !parent->nr_active++;
 			parent->children_weight += pos->weight;
@@ -1341,7 +1332,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	pos->children_weight -= pos->leaf_weight;
 
 	while (propagate) {
-		struct cfq_group *parent = cfqg_flat_parent(pos);
+		struct cfq_group *parent = cfqg_parent(pos);
 
 		/* @pos has 0 nr_active at this point */
 		WARN_ON_ONCE(pos->children_weight);
-- 
cgit v1.2.3-18-g5258


From 4d5e80a76074786a49879ff482a83e72ad634606 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:12 -0800
Subject: blkcg: s/blkg_rwstat_sum()/blkg_rwstat_total()/

Rename blkg_rwstat_sum() to blkg_rwstat_total().  sum will be used for
summing up stats from multiple blkgs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e8f31069b37..d43145cc008 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -536,7 +536,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
 
-	if (blkg_rwstat_sum(&stats->queued))
+	if (blkg_rwstat_total(&stats->queued))
 		return;
 
 	/*
@@ -580,7 +580,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 	struct cfqg_stats *stats = &cfqg->stats;
 
 	blkg_stat_add(&stats->avg_queue_size_sum,
-		      blkg_rwstat_sum(&stats->queued));
+		      blkg_rwstat_total(&stats->queued));
 	blkg_stat_add(&stats->avg_queue_size_samples, 1);
 	cfqg_stats_update_group_wait_time(stats);
 }
-- 
cgit v1.2.3-18-g5258


From 689665af4489f779bc82e7869509c9ac11b5a903 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:13 -0800
Subject: cfq-iosched: separate out cfqg_stats_reset() from
 cfq_pd_reset_stats()

Separate out cfqg_stats_reset() which takes struct cfqg_stats * from
cfq_pd_reset_stats() and move the latter to where other pd methods are
defined.  cfqg_stats_reset() will be used to implement hierarchical
stats.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d43145cc008..f8b34bbbd37 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -688,11 +688,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 				io_start_time - start_time);
 }
 
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+/* @stats = 0 */
+static void cfqg_stats_reset(struct cfqg_stats *stats)
 {
-	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-	struct cfqg_stats *stats = &cfqg->stats;
-
 	/* queued stats shouldn't be cleared */
 	blkg_rwstat_reset(&stats->service_bytes);
 	blkg_rwstat_reset(&stats->serviced);
@@ -1477,6 +1475,13 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
 	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
 }
 
+static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+{
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
+	cfqg_stats_reset(&cfqg->stats);
+}
+
 /*
  * Search for the cfq group current task belongs to. request_queue lock must
  * be held.
-- 
cgit v1.2.3-18-g5258


From 0b39920b5f9f3ad37dd259bfa2e9cbca33475b28 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:13 -0800
Subject: cfq-iosched: collect stats from dead cfqgs

To support hierarchical stats, it's necessary to remember stats from
dead children.  Add cfqg->dead_stats and make a dying cfqg transfer
its stats to the parent's dead-stats.

The transfer happens form ->pd_offline_fn() and it is possible that
there are some residual IOs completing afterwards.  Currently, we lose
these stats.  Given that cgroup removal isn't a very high frequency
operation and the amount of residual IOs on offline are likely to be
nil or small, this shouldn't be a big deal and the complexity needed
to handle residual IOs - another callback and rather elaborate
synchronization to reach and lock the matching q - doesn't seem
justified.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f8b34bbbd37..4d75b794457 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -289,7 +289,8 @@ struct cfq_group {
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	struct cfq_ttime ttime;
-	struct cfqg_stats stats;
+	struct cfqg_stats stats;	/* stats for this cfqg */
+	struct cfqg_stats dead_stats;	/* stats pushed from dead children */
 };
 
 struct cfq_io_cq {
@@ -709,6 +710,47 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
 #endif
 }
 
+/* @to += @from */
+static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+{
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
+	blkg_rwstat_merge(&to->serviced, &from->serviced);
+	blkg_rwstat_merge(&to->merged, &from->merged);
+	blkg_rwstat_merge(&to->service_time, &from->service_time);
+	blkg_rwstat_merge(&to->wait_time, &from->wait_time);
+	blkg_stat_merge(&from->time, &from->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+	blkg_stat_merge(&to->dequeue, &from->dequeue);
+	blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_merge(&to->idle_time, &from->idle_time);
+	blkg_stat_merge(&to->empty_time, &from->empty_time);
+#endif
+}
+
+/*
+ * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this cfqg after
+ * it's gone.
+ */
+static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
+{
+	struct cfq_group *parent = cfqg_parent(cfqg);
+
+	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
+
+	if (unlikely(!parent))
+		return;
+
+	cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
+	cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+	cfqg_stats_reset(&cfqg->stats);
+	cfqg_stats_reset(&cfqg->dead_stats);
+}
+
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 
 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
@@ -1475,11 +1517,23 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
 	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
 }
 
+static void cfq_pd_offline(struct blkcg_gq *blkg)
+{
+	/*
+	 * @blkg is going offline and will be ignored by
+	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+	 * that they don't get lost.  If IOs complete after this point, the
+	 * stats for them will be lost.  Oh well...
+	 */
+	cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+}
+
 static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
 	cfqg_stats_reset(&cfqg->stats);
+	cfqg_stats_reset(&cfqg->dead_stats);
 }
 
 /*
@@ -4408,6 +4462,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
 	.cftypes		= cfq_blkcg_files,
 
 	.pd_init_fn		= cfq_pd_init,
+	.pd_offline_fn		= cfq_pd_offline,
 	.pd_reset_stats_fn	= cfq_pd_reset_stats,
 };
 #endif
-- 
cgit v1.2.3-18-g5258


From 43114018cb0b253fd03c4ff4d42bcdc43389ac1c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Jan 2013 08:05:13 -0800
Subject: cfq-iosched: add hierarchical cfq_group statistics

Unfortunately, at this point, there's no way to make the existing
statistics hierarchical without creating nasty surprises for the
existing users.  Just create recursive counterpart of the existing
stats.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/cfq-iosched.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4d75b794457..b66365b6ba7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1528,6 +1528,32 @@ static void cfq_pd_offline(struct blkcg_gq *blkg)
 	cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
 }
 
+/* offset delta from cfqg->stats to cfqg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
+					offsetof(struct cfq_group, stats);
+
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	u64 sum = 0;
+
+	sum += blkg_stat_recursive_sum(pd, off);
+	sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
+	return sum;
+}
+
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+						       int off)
+{
+	struct blkg_rwstat a, b;
+
+	a = blkg_rwstat_recursive_sum(pd, off);
+	b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
+	blkg_rwstat_merge(&a, &b);
+	return a;
+}
+
 static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1732,6 +1758,42 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
+
+	return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
+
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
+			  &blkcg_policy_cfq, cft->private, false);
+	return 0;
+}
+
+static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
+				       struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
+			  &blkcg_policy_cfq, cft->private, true);
+	return 0;
+}
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
@@ -1803,6 +1865,7 @@ static struct cftype cfq_blkcg_files[] = {
 		.write_u64 = cfq_set_leaf_weight,
 	},
 
+	/* statistics, covers only the tasks in the cfqg */
 	{
 		.name = "time",
 		.private = offsetof(struct cfq_group, stats.time),
@@ -1843,6 +1906,48 @@ static struct cftype cfq_blkcg_files[] = {
 		.private = offsetof(struct cfq_group, stats.queued),
 		.read_seq_string = cfqg_print_rwstat,
 	},
+
+	/* the same statictics which cover the cfqg and its descendants */
+	{
+		.name = "time_recursive",
+		.private = offsetof(struct cfq_group, stats.time),
+		.read_seq_string = cfqg_print_stat_recursive,
+	},
+	{
+		.name = "sectors_recursive",
+		.private = offsetof(struct cfq_group, stats.sectors),
+		.read_seq_string = cfqg_print_stat_recursive,
+	},
+	{
+		.name = "io_service_bytes_recursive",
+		.private = offsetof(struct cfq_group, stats.service_bytes),
+		.read_seq_string = cfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "io_serviced_recursive",
+		.private = offsetof(struct cfq_group, stats.serviced),
+		.read_seq_string = cfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "io_service_time_recursive",
+		.private = offsetof(struct cfq_group, stats.service_time),
+		.read_seq_string = cfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "io_wait_time_recursive",
+		.private = offsetof(struct cfq_group, stats.wait_time),
+		.read_seq_string = cfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "io_merged_recursive",
+		.private = offsetof(struct cfq_group, stats.merged),
+		.read_seq_string = cfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "io_queued_recursive",
+		.private = offsetof(struct cfq_group, stats.queued),
+		.read_seq_string = cfqg_print_rwstat_recursive,
+	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
 		.name = "avg_queue_size",
-- 
cgit v1.2.3-18-g5258


From a3cc86c2f00839453d2dbeb46bfc44e885b073db Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Thu, 21 Feb 2013 15:16:41 -0800
Subject: cfq: fix lock imbalance with failed allocations

While stress-running very-small container scenarios with the Kernel Memory
Controller, I've run into a lockdep-detected lock imbalance in
cfq-iosched.c.

I'll apologize beforehand for not posting a backlog: I didn't anticipate
it would be so hard to reproduce, so I didn't save my serial output and
went directly on debugging.  Turns out that it did not happen again in
more than 20 runs, making it a quite rare pattern.

But here is my analysis:

When we are in very low-memory situations, we will arrive at
cfq_find_alloc_queue and may not find a queue, having to resort to the oom
queue, in an rcu-locked condition:

  if (!cfqq || cfqq == &cfqd->oom_cfqq)
      [ ... ]

Next, we will release the rcu lock, and try to allocate a queue, retrying
if we succeed:

  rcu_read_unlock();
  spin_unlock_irq(cfqd->queue->queue_lock);
  new_cfqq = kmem_cache_alloc_node(cfq_pool,
                  gfp_mask | __GFP_ZERO,
                  cfqd->queue->node);
   spin_lock_irq(cfqd->queue->queue_lock);
   if (new_cfqq)
       goto retry;

We are unlocked at this point, but it should be fine, since we will
reacquire the rcu_read_lock when we retry.

Except of course, that we may not retry: the allocation may very well fail
and we'll keep on going through the flow:

The next branch is:

    if (cfqq) {
	[ ... ]
    } else
        cfqq = &cfqd->oom_cfqq;

And right before exiting, we'll issue rcu_read_unlock().

Being already unlocked, this is the likely source of our imbalance.  Since
cfqq is either already NULL or made NULL in the first statement of the
outter branch, the only viable alternative here seems to be to return the
oom queue right away in case of allocation failure.

Please review the following patch and apply if you agree with my analysis.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/cfq-iosched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block/cfq-iosched.c')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b66365b6ba7..1bf9307e8f5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3594,6 +3594,8 @@ retry:
 			spin_lock_irq(cfqd->queue->queue_lock);
 			if (new_cfqq)
 				goto retry;
+			else
+				return &cfqd->oom_cfqq;
 		} else {
 			cfqq = kmem_cache_alloc_node(cfq_pool,
 					gfp_mask | __GFP_ZERO,
-- 
cgit v1.2.3-18-g5258