aboutsummaryrefslogtreecommitdiff
path: root/block/cfq-iosched.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/cfq-iosched.c')
-rw-r--r--block/cfq-iosched.c1696
1 files changed, 1207 insertions, 489 deletions
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d0ba5053366..cadc3784174 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -15,7 +15,7 @@
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
-#include "cfq.h"
+#include "blk-cgroup.h"
/*
* tunables
@@ -85,7 +85,6 @@ struct cfq_rb_root {
struct rb_root rb;
struct rb_node *left;
unsigned count;
- unsigned total_weight;
u64 min_vdisktime;
struct cfq_ttime ttime;
};
@@ -155,7 +154,7 @@ struct cfq_queue {
* First index in the service_trees.
* IDLE is handled separately, so it has negative index
*/
-enum wl_prio_t {
+enum wl_class_t {
BE_WORKLOAD = 0,
RT_WORKLOAD = 1,
IDLE_WORKLOAD = 2,
@@ -171,16 +170,96 @@ enum wl_type_t {
SYNC_WORKLOAD = 2
};
+struct cfqg_stats {
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ /* total bytes transferred */
+ struct blkg_rwstat service_bytes;
+ /* total IOs serviced, post merge */
+ struct blkg_rwstat serviced;
+ /* number of ios merged */
+ struct blkg_rwstat merged;
+ /* total time spent on device in ns, may not be accurate w/ queueing */
+ struct blkg_rwstat service_time;
+ /* total time spent waiting in scheduler queue in ns */
+ struct blkg_rwstat wait_time;
+ /* number of IOs queued up */
+ struct blkg_rwstat queued;
+ /* total sectors transferred */
+ struct blkg_stat sectors;
+ /* total disk time and nr sectors dispatched by this group */
+ struct blkg_stat time;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* time not charged to this cgroup */
+ struct blkg_stat unaccounted_time;
+ /* sum of number of ios queued across all samples */
+ struct blkg_stat avg_queue_size_sum;
+ /* count of samples taken for average */
+ struct blkg_stat avg_queue_size_samples;
+ /* how many times this group has been removed from service tree */
+ struct blkg_stat dequeue;
+ /* total time spent waiting for it to be assigned a timeslice. */
+ struct blkg_stat group_wait_time;
+ /* time spent idling for this blkcg_gq */
+ struct blkg_stat idle_time;
+ /* total time with empty current active q with other requests queued */
+ struct blkg_stat empty_time;
+ /* fields after this shouldn't be cleared on stat reset */
+ uint64_t start_group_wait_time;
+ uint64_t start_idle_time;
+ uint64_t start_empty_time;
+ uint16_t flags;
+#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_CFQ_GROUP_IOSCHED */
+};
+
/* This is per cgroup per device grouping structure */
struct cfq_group {
+ /* must be the first member */
+ struct blkg_policy_data pd;
+
/* group service_tree member */
struct rb_node rb_node;
/* group service_tree key */
u64 vdisktime;
+
+ /*
+ * The number of active cfqgs and sum of their weights under this
+ * cfqg. This covers this cfqg's leaf_weight and all children's
+ * weights, but does not cover weights of further descendants.
+ *
+ * If a cfqg is on the service tree, it's active. An active cfqg
+ * also activates its parent and contributes to the children_weight
+ * of the parent.
+ */
+ int nr_active;
+ unsigned int children_weight;
+
+ /*
+ * vfraction is the fraction of vdisktime that the tasks in this
+ * cfqg are entitled to. This is determined by compounding the
+ * ratios walking up from this cfqg to the root.
+ *
+ * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
+ * vfractions on a service tree is approximately 1. The sum may
+ * deviate a bit due to rounding errors and fluctuations caused by
+ * cfqgs entering and leaving the service tree.
+ */
+ unsigned int vfraction;
+
+ /*
+ * There are two weights - (internal) weight is the weight of this
+ * cfqg against the sibling cfqgs. leaf_weight is the wight of
+ * this cfqg against the child cfqgs. For the root cfqg, both
+ * weights are kept in sync for backward compatibility.
+ */
unsigned int weight;
unsigned int new_weight;
- bool needs_update;
+ unsigned int dev_weight;
+
+ unsigned int leaf_weight;
+ unsigned int new_leaf_weight;
+ unsigned int dev_leaf_weight;
/* number of cfqq currently on this group */
int nr_cfqq;
@@ -203,23 +282,25 @@ struct cfq_group {
struct cfq_rb_root service_trees[2][3];
struct cfq_rb_root service_tree_idle;
- unsigned long saved_workload_slice;
- enum wl_type_t saved_workload;
- enum wl_prio_t saved_serving_prio;
- struct blkio_group blkg;
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
- struct hlist_node cfqd_node;
- int ref;
-#endif
+ unsigned long saved_wl_slice;
+ enum wl_type_t saved_wl_type;
+ enum wl_class_t saved_wl_class;
+
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
struct cfq_ttime ttime;
+ struct cfqg_stats stats; /* stats for this cfqg */
+ struct cfqg_stats dead_stats; /* stats pushed from dead children */
};
struct cfq_io_cq {
struct io_cq icq; /* must be the first member */
struct cfq_queue *cfqq[2];
struct cfq_ttime ttime;
+ int ioprio; /* the current ioprio */
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ uint64_t blkcg_id; /* the current blkcg ID */
+#endif
};
/*
@@ -229,13 +310,13 @@ struct cfq_data {
struct request_queue *queue;
/* Root service tree for cfq_groups */
struct cfq_rb_root grp_service_tree;
- struct cfq_group root_group;
+ struct cfq_group *root_group;
/*
* The priority currently being served
*/
- enum wl_prio_t serving_prio;
- enum wl_type_t serving_type;
+ enum wl_class_t serving_wl_class;
+ enum wl_type_t serving_wl_type;
unsigned long workload_expires;
struct cfq_group *serving_group;
@@ -295,6 +376,7 @@ struct cfq_data {
unsigned int cfq_slice_idle;
unsigned int cfq_group_idle;
unsigned int cfq_latency;
+ unsigned int cfq_target_latency;
/*
* Fallback dummy cfqq for extreme OOM conditions
@@ -302,27 +384,21 @@ struct cfq_data {
struct cfq_queue oom_cfqq;
unsigned long last_delayed_sync;
-
- /* List of cfq groups being managed on this device*/
- struct hlist_head cfqg_list;
-
- /* Number of groups which are on blkcg->blkg_list */
- unsigned int nr_blkcg_linked_grps;
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
- enum wl_prio_t prio,
+static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
+ enum wl_class_t class,
enum wl_type_t type)
{
if (!cfqg)
return NULL;
- if (prio == IDLE_WORKLOAD)
+ if (class == IDLE_WORKLOAD)
return &cfqg->service_tree_idle;
- return &cfqg->service_trees[prio][type];
+ return &cfqg->service_trees[class][type];
}
enum cfqq_state_flags {
@@ -370,21 +446,337 @@ CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
+static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct cfq_group, pd) : NULL;
+}
+
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+ return pd_to_blkg(&cfqg->pd);
+}
+
+#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+
+/* cfqg stats flags */
+enum cfqg_stats_flags {
+ CFQG_stats_waiting = 0,
+ CFQG_stats_idling,
+ CFQG_stats_empty,
+};
+
+#define CFQG_FLAG_FNS(name) \
+static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
+{ \
+ stats->flags |= (1 << CFQG_stats_##name); \
+} \
+static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
+{ \
+ stats->flags &= ~(1 << CFQG_stats_##name); \
+} \
+static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
+{ \
+ return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
+} \
+
+CFQG_FLAG_FNS(waiting)
+CFQG_FLAG_FNS(idling)
+CFQG_FLAG_FNS(empty)
+#undef CFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
+{
+ unsigned long long now;
+
+ if (!cfqg_stats_waiting(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_group_wait_time))
+ blkg_stat_add(&stats->group_wait_time,
+ now - stats->start_group_wait_time);
+ cfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
+ struct cfq_group *curr_cfqg)
+{
+ struct cfqg_stats *stats = &cfqg->stats;
+
+ if (cfqg_stats_waiting(stats))
+ return;
+ if (cfqg == curr_cfqg)
+ return;
+ stats->start_group_wait_time = sched_clock();
+ cfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
+{
+ unsigned long long now;
+
+ if (!cfqg_stats_empty(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_empty_time))
+ blkg_stat_add(&stats->empty_time,
+ now - stats->start_empty_time);
+ cfqg_stats_clear_empty(stats);
+}
+
+static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
+{
+ blkg_stat_add(&cfqg->stats.dequeue, 1);
+}
+
+static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
+{
+ struct cfqg_stats *stats = &cfqg->stats;
+
+ if (blkg_rwstat_total(&stats->queued))
+ return;
+
+ /*
+ * group is already marked empty. This can happen if cfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if (cfqg_stats_empty(stats))
+ return;
+
+ stats->start_empty_time = sched_clock();
+ cfqg_stats_mark_empty(stats);
+}
+
+static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
+{
+ struct cfqg_stats *stats = &cfqg->stats;
+
+ if (cfqg_stats_idling(stats)) {
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, stats->start_idle_time))
+ blkg_stat_add(&stats->idle_time,
+ now - stats->start_idle_time);
+ cfqg_stats_clear_idling(stats);
+ }
+}
+
+static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
+{
+ struct cfqg_stats *stats = &cfqg->stats;
+
+ BUG_ON(cfqg_stats_idling(stats));
+
+ stats->start_idle_time = sched_clock();
+ cfqg_stats_mark_idling(stats);
+}
+
+static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
+{
+ struct cfqg_stats *stats = &cfqg->stats;
+
+ blkg_stat_add(&stats->avg_queue_size_sum,
+ blkg_rwstat_total(&stats->queued));
+ blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ cfqg_stats_update_group_wait_time(stats);
+}
+
+#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
+static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
+static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
+static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
+
+#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
- blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
- cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
- blkg_path(&(cfqq)->cfqg->blkg), ##args)
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
- blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
- blkg_path(&(cfqg)->blkg), ##args) \
+static struct blkcg_policy blkcg_policy_cfq;
+
+static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
+{
+ return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
+}
+
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
+{
+ struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
+
+ return pblkg ? blkg_to_cfqg(pblkg) : NULL;
+}
+
+static inline void cfqg_get(struct cfq_group *cfqg)
+{
+ return blkg_get(cfqg_to_blkg(cfqg));
+}
+
+static inline void cfqg_put(struct cfq_group *cfqg)
+{
+ return blkg_put(cfqg_to_blkg(cfqg));
+}
+
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
+ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
+ cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
+ __pbuf, ##args); \
+} while (0)
+
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \
+} while (0)
+
+static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
+ struct cfq_group *curr_cfqg, int rw)
+{
+ blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
+ cfqg_stats_end_empty_time(&cfqg->stats);
+ cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
+}
+
+static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
+ unsigned long time, unsigned long unaccounted_time)
+{
+ blkg_stat_add(&cfqg->stats.time, time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
+#endif
+}
+
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
+{
+ blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
+}
+
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
+{
+ blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
+}
+
+static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
+ uint64_t bytes, int rw)
+{
+ blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
+ blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
+ blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
+}
+
+static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
+ uint64_t start_time, uint64_t io_start_time, int rw)
+{
+ struct cfqg_stats *stats = &cfqg->stats;
+ unsigned long long now = sched_clock();
+
+ if (time_after64(now, io_start_time))
+ blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+ if (time_after64(io_start_time, start_time))
+ blkg_rwstat_add(&stats->wait_time, rw,
+ io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void cfqg_stats_reset(struct cfqg_stats *stats)
+{
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_reset(&stats->service_bytes);
+ blkg_rwstat_reset(&stats->serviced);
+ blkg_rwstat_reset(&stats->merged);
+ blkg_rwstat_reset(&stats->service_time);
+ blkg_rwstat_reset(&stats->wait_time);
+ blkg_stat_reset(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_reset(&stats->unaccounted_time);
+ blkg_stat_reset(&stats->avg_queue_size_sum);
+ blkg_stat_reset(&stats->avg_queue_size_samples);
+ blkg_stat_reset(&stats->dequeue);
+ blkg_stat_reset(&stats->group_wait_time);
+ blkg_stat_reset(&stats->idle_time);
+ blkg_stat_reset(&stats->empty_time);
+#endif
+}
+
+/* @to += @from */
+static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+{
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
+ blkg_rwstat_merge(&to->serviced, &from->serviced);
+ blkg_rwstat_merge(&to->merged, &from->merged);
+ blkg_rwstat_merge(&to->service_time, &from->service_time);
+ blkg_rwstat_merge(&to->wait_time, &from->wait_time);
+ blkg_stat_merge(&from->time, &from->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
+ blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+ blkg_stat_merge(&to->dequeue, &from->dequeue);
+ blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
+ blkg_stat_merge(&to->idle_time, &from->idle_time);
+ blkg_stat_merge(&to->empty_time, &from->empty_time);
+#endif
+}
+
+/*
+ * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this cfqg after
+ * it's gone.
+ */
+static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
+{
+ struct cfq_group *parent = cfqg_parent(cfqg);
+
+ lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
+
+ if (unlikely(!parent))
+ return;
+
+ cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
+ cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+ cfqg_stats_reset(&cfqg->stats);
+ cfqg_stats_reset(&cfqg->dead_stats);
+}
+
+#else /* CONFIG_CFQ_GROUP_IOSCHED */
+
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
+static inline void cfqg_get(struct cfq_group *cfqg) { }
+static inline void cfqg_put(struct cfq_group *cfqg) { }
-#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
- blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
+ blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
+ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
+ cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
+ ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
-#endif
+
+static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
+ struct cfq_group *curr_cfqg, int rw) { }
+static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
+ unsigned long time, unsigned long unaccounted_time) { }
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
+static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
+ uint64_t bytes, int rw) { }
+static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
+ uint64_t start_time, uint64_t io_start_time, int rw) { }
+
+#endif /* CONFIG_CFQ_GROUP_IOSCHED */
+
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
@@ -426,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd)
return false;
}
-static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
+static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
{
if (cfq_class_idle(cfqq))
return IDLE_WORKLOAD;
@@ -445,28 +837,29 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
return SYNC_WORKLOAD;
}
-static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
+static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
struct cfq_data *cfqd,
struct cfq_group *cfqg)
{
- if (wl == IDLE_WORKLOAD)
+ if (wl_class == IDLE_WORKLOAD)
return cfqg->service_tree_idle.count;
- return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
- + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
- + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
+ return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
+ cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
+ cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
}
static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
struct cfq_group *cfqg)
{
- return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
- + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
+ return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}
static void cfq_dispatch_insert(struct request_queue *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
- struct io_context *, gfp_t);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
+ struct cfq_io_cq *cic, struct bio *bio,
+ gfp_t gfp_mask);
static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
@@ -515,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
if (cfqd->busy_queues) {
cfq_log(cfqd, "schedule dispatch");
- kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+ kblockd_schedule_work(&cfqd->unplug_work);
}
}
@@ -540,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+/**
+ * cfqg_scale_charge - scale disk time charge according to cfqg weight
+ * @charge: disk time being charged
+ * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
+ *
+ * Scale @charge according to @vfraction, which is in range (0, 1]. The
+ * scaling is inversely proportional.
+ *
+ * scaled = charge / vfraction
+ *
+ * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
+ */
+static inline u64 cfqg_scale_charge(unsigned long charge,
+ unsigned int vfraction)
{
- u64 d = delta << CFQ_SERVICE_SHIFT;
+ u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
- d = d * BLKIO_WEIGHT_DEFAULT;
- do_div(d, cfqg->weight);
- return d;
+ /* charge / vfraction */
+ c <<= CFQ_SERVICE_SHIFT;
+ do_div(c, vfraction);
+ return c;
}
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
@@ -602,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
-
- return cfq_target_latency * cfqg->weight / st->total_weight;
+ return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
}
static inline unsigned
@@ -871,20 +1276,61 @@ static void
cfq_update_group_weight(struct cfq_group *cfqg)
{
BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
- if (cfqg->needs_update) {
+
+ if (cfqg->new_weight) {
cfqg->weight = cfqg->new_weight;
- cfqg->needs_update = false;
+ cfqg->new_weight = 0;
+ }
+
+ if (cfqg->new_leaf_weight) {
+ cfqg->leaf_weight = cfqg->new_leaf_weight;
+ cfqg->new_leaf_weight = 0;
}
}
static void
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
+ unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
+ struct cfq_group *pos = cfqg;
+ struct cfq_group *parent;
+ bool propagate;
+
+ /* add to the service tree */
BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
cfq_update_group_weight(cfqg);
__cfq_group_service_tree_add(st, cfqg);
- st->total_weight += cfqg->weight;
+
+ /*
+ * Activate @cfqg and calculate the portion of vfraction @cfqg is
+ * entitled to. vfraction is calculated by walking the tree
+ * towards the root calculating the fraction it has at each level.
+ * The compounded ratio is how much vfraction @cfqg owns.
+ *
+ * Start with the proportion tasks in this cfqg has against active
+ * children cfqgs - its leaf_weight against children_weight.
+ */
+ propagate = !pos->nr_active++;
+ pos->children_weight += pos->leaf_weight;
+ vfr = vfr * pos->leaf_weight / pos->children_weight;
+
+ /*
+ * Compound ->weight walking up the tree. Both activation and
+ * vfraction calculation are done in the same loop. Propagation
+ * stops once an already activated node is met. vfraction
+ * calculation should always continue to the root.
+ */
+ while ((parent = cfqg_parent(pos))) {
+ if (propagate) {
+ propagate = !parent->nr_active++;
+ parent->children_weight += pos->weight;
+ }
+ vfr = vfr * pos->weight / parent->children_weight;
+ pos = parent;
+ }
+
+ cfqg->vfraction = max_t(unsigned, vfr, 1);
}
static void
@@ -915,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
static void
cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
- st->total_weight -= cfqg->weight;
+ struct cfq_group *pos = cfqg;
+ bool propagate;
+
+ /*
+ * Undo activation from cfq_group_service_tree_add(). Deactivate
+ * @cfqg and propagate deactivation upwards.
+ */
+ propagate = !--pos->nr_active;
+ pos->children_weight -= pos->leaf_weight;
+
+ while (propagate) {
+ struct cfq_group *parent = cfqg_parent(pos);
+
+ /* @pos has 0 nr_active at this point */
+ WARN_ON_ONCE(pos->children_weight);
+ pos->vfraction = 0;
+
+ if (!parent)
+ break;
+
+ propagate = !--parent->nr_active;
+ parent->children_weight -= pos->weight;
+ pos = parent;
+ }
+
+ /* remove from the service tree */
if (!RB_EMPTY_NODE(&cfqg->rb_node))
cfq_rb_erase(&cfqg->rb_node, st);
}
@@ -934,8 +1405,8 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
cfq_group_service_tree_del(st, cfqg);
- cfqg->saved_workload_slice = 0;
- cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+ cfqg->saved_wl_slice = 0;
+ cfqg_stats_update_dequeue(cfqg);
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -977,6 +1448,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
unsigned int used_sl, charge, unaccounted_sl = 0;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
+ unsigned int vfr;
BUG_ON(nr_sync < 0);
used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
@@ -986,20 +1458,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
- /* Can't update vdisktime while group is on service tree */
+ /*
+ * Can't update vdisktime while on service tree and cfqg->vfraction
+ * is valid only while on it. Cache vfr, leave the service tree,
+ * update vdisktime and go back on. The re-addition to the tree
+ * will also update the weights as necessary.
+ */
+ vfr = cfqg->vfraction;
cfq_group_service_tree_del(st, cfqg);
- cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
- /* If a new weight was requested, update now, off tree */
+ cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
- cfqg->saved_workload_slice = cfqd->workload_expires
+ cfqg->saved_wl_slice = cfqd->workload_expires
- jiffies;
- cfqg->saved_workload = cfqd->serving_type;
- cfqg->saved_serving_prio = cfqd->serving_prio;
+ cfqg->saved_wl_type = cfqd->serving_wl_type;
+ cfqg->saved_wl_class = cfqd->serving_wl_class;
} else
- cfqg->saved_workload_slice = 0;
+ cfqg->saved_wl_slice = 0;
cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
st->min_vdisktime);
@@ -1007,273 +1484,514 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
"sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
- cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
- unaccounted_sl);
- cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
+ cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
+ cfqg_stats_set_start_empty_time(cfqg);
}
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
+/**
+ * cfq_init_cfqg_base - initialize base part of a cfq_group
+ * @cfqg: cfq_group to initialize
+ *
+ * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
+ * is enabled or not.
+ */
+static void cfq_init_cfqg_base(struct cfq_group *cfqg)
{
- if (blkg)
- return container_of(blkg, struct cfq_group, blkg);
- return NULL;
+ struct cfq_rb_root *st;
+ int i, j;
+
+ for_each_cfqg_st(cfqg, i, j, st)
+ *st = CFQ_RB_ROOT;
+ RB_CLEAR_NODE(&cfqg->rb_node);
+
+ cfqg->ttime.last_end_request = jiffies;
}
-static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
- unsigned int weight)
-{
- struct cfq_group *cfqg = cfqg_of_blkg(blkg);
- cfqg->new_weight = weight;
- cfqg->needs_update = true;
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void cfqg_stats_init(struct cfqg_stats *stats)
+{
+ blkg_rwstat_init(&stats->service_bytes);
+ blkg_rwstat_init(&stats->serviced);
+ blkg_rwstat_init(&stats->merged);
+ blkg_rwstat_init(&stats->service_time);
+ blkg_rwstat_init(&stats->wait_time);
+ blkg_rwstat_init(&stats->queued);
+
+ blkg_stat_init(&stats->sectors);
+ blkg_stat_init(&stats->time);
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_stat_init(&stats->unaccounted_time);
+ blkg_stat_init(&stats->avg_queue_size_sum);
+ blkg_stat_init(&stats->avg_queue_size_samples);
+ blkg_stat_init(&stats->dequeue);
+ blkg_stat_init(&stats->group_wait_time);
+ blkg_stat_init(&stats->idle_time);
+ blkg_stat_init(&stats->empty_time);
+#endif
}
-static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
- struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
+static void cfq_pd_init(struct blkcg_gq *blkg)
{
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
- unsigned int major, minor;
+ struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
+ cfq_init_cfqg_base(cfqg);
+ cfqg->weight = blkg->blkcg->cfq_weight;
+ cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+ cfqg_stats_init(&cfqg->stats);
+ cfqg_stats_init(&cfqg->dead_stats);
+}
+static void cfq_pd_offline(struct blkcg_gq *blkg)
+{
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initialized yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO.
+ * @blkg is going offline and will be ignored by
+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
+ * that they don't get lost. If IOs complete after this point, the
+ * stats for them will be lost. Oh well...
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
- (void *)cfqd, MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
- (void *)cfqd, 0);
+ cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+}
- cfqd->nr_blkcg_linked_grps++;
- cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+/* offset delta from cfqg->stats to cfqg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
+ offsetof(struct cfq_group, stats);
- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+ u64 sum = 0;
+
+ sum += blkg_stat_recursive_sum(pd, off);
+ sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
+ return sum;
}
-/*
- * Should be called from sleepable context. No request queue lock as per
- * cpu stats are allocated dynamically and alloc_percpu needs to be called
- * from sleepable context.
- */
-static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+ int off)
{
- struct cfq_group *cfqg = NULL;
- int i, j, ret;
- struct cfq_rb_root *st;
+ struct blkg_rwstat a, b;
- cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
- if (!cfqg)
- return NULL;
+ a = blkg_rwstat_recursive_sum(pd, off);
+ b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
+ blkg_rwstat_merge(&a, &b);
+ return a;
+}
- for_each_cfqg_st(cfqg, i, j, st)
- *st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->rb_node);
+static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+{
+ struct cfq_group *cfqg = blkg_to_cfqg(blkg);
- cfqg->ttime.last_end_request = jiffies;
+ cfqg_stats_reset(&cfqg->stats);
+ cfqg_stats_reset(&cfqg->dead_stats);
+}
- /*
- * Take the initial reference that will be released on destroy
- * This can be thought of a joint reference by cgroup and
- * elevator which will be dropped by either elevator exit
- * or cgroup deletion path depending on who is exiting first.
- */
- cfqg->ref = 1;
+/*
+ * Search for the cfq group current task belongs to. request_queue lock must
+ * be held.
+ */
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+ struct blkcg *blkcg)
+{
+ struct request_queue *q = cfqd->queue;
+ struct cfq_group *cfqg = NULL;
- ret = blkio_alloc_blkg_stats(&cfqg->blkg);
- if (ret) {
- kfree(cfqg);
- return NULL;
+ /* avoid lookup for the common case where there's no blkcg */
+ if (blkcg == &blkcg_root) {
+ cfqg = cfqd->root_group;
+ } else {
+ struct blkcg_gq *blkg;
+
+ blkg = blkg_lookup_create(blkcg, q);
+ if (!IS_ERR(blkg))
+ cfqg = blkg_to_cfqg(blkg);
}
return cfqg;
}
-static struct cfq_group *
-cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
- struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
- unsigned int major, minor;
+ /* Currently, all async queues are mapped to root group */
+ if (!cfq_cfqq_sync(cfqq))
+ cfqg = cfqq->cfqd->root_group;
- /*
- * This is the common case when there are no blkio cgroups.
- * Avoid lookup in this case
- */
- if (blkcg == &blkio_root_cgroup)
- cfqg = &cfqd->root_group;
- else
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ cfqq->cfqg = cfqg;
+ /* cfqq reference on cfqg */
+ cfqg_get(cfqg);
+}
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- }
+static u64 cfqg_prfill_weight_device(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
- return cfqg;
+ if (!cfqg->dev_weight)
+ return 0;
+ return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
}
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
+static int cfqg_print_weight_device(struct seq_file *sf, void *v)
{
- struct blkio_cgroup *blkcg;
- struct cfq_group *cfqg = NULL, *__cfqg = NULL;
- struct request_queue *q = cfqd->queue;
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_weight_device, &blkcg_policy_cfq,
+ 0, false);
+ return 0;
+}
- rcu_read_lock();
- blkcg = task_blkio_cgroup(current);
- cfqg = cfq_find_cfqg(cfqd, blkcg);
- if (cfqg) {
- rcu_read_unlock();
- return cfqg;
- }
+static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
- /*
- * Need to allocate a group. Allocation of group also needs allocation
- * of per cpu stats which in-turn takes a mutex() and can block. Hence
- * we need to drop rcu lock and queue_lock before we call alloc.
- *
- * Not taking any queue reference here and assuming that queue is
- * around by the time we return. CFQ queue allocation code does
- * the same. It might be racy though.
- */
+ if (!cfqg->dev_leaf_weight)
+ return 0;
+ return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
+}
- rcu_read_unlock();
- spin_unlock_irq(q->queue_lock);
+static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
+ 0, false);
+ return 0;
+}
- cfqg = cfq_alloc_cfqg(cfqd);
+static int cfq_print_weight(struct seq_file *sf, void *v)
+{
+ seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
+ return 0;
+}
- spin_lock_irq(q->queue_lock);
+static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
+{
+ seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
+ return 0;
+}
- rcu_read_lock();
- blkcg = task_blkio_cgroup(current);
+static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool is_leaf_weight)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkg_conf_ctx ctx;
+ struct cfq_group *cfqg;
+ int ret;
- /*
- * If some other thread already allocated the group while we were
- * not holding queue lock, free up the group
- */
- __cfqg = cfq_find_cfqg(cfqd, blkcg);
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
+ if (ret)
+ return ret;
- if (__cfqg) {
- kfree(cfqg);
- rcu_read_unlock();
- return __cfqg;
+ ret = -EINVAL;
+ cfqg = blkg_to_cfqg(ctx.blkg);
+ if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+ if (!is_leaf_weight) {
+ cfqg->dev_weight = ctx.v;
+ cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+ } else {
+ cfqg->dev_leaf_weight = ctx.v;
+ cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+ }
+ ret = 0;
}
- if (!cfqg)
- cfqg = &cfqd->root_group;
+ blkg_conf_finish(&ctx);
+ return ret ?: nbytes;
+}
- cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
- rcu_read_unlock();
- return cfqg;
+static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false);
}
-static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- cfqg->ref++;
- return cfqg;
+ return __cfqg_set_weight_device(of, buf, nbytes, off, true);
}
-static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val, bool is_leaf_weight)
{
- /* Currently, all async queues are mapped to root group */
- if (!cfq_cfqq_sync(cfqq))
- cfqg = &cfqq->cfqd->root_group;
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg_gq *blkg;
- cfqq->cfqg = cfqg;
- /* cfqq reference on cfqg */
- cfqq->cfqg->ref++;
+ if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
+ return -EINVAL;
+
+ spin_lock_irq(&blkcg->lock);
+
+ if (!is_leaf_weight)
+ blkcg->cfq_weight = val;
+ else
+ blkcg->cfq_leaf_weight = val;
+
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
+ if (!cfqg)
+ continue;
+
+ if (!is_leaf_weight) {
+ if (!cfqg->dev_weight)
+ cfqg->new_weight = blkcg->cfq_weight;
+ } else {
+ if (!cfqg->dev_leaf_weight)
+ cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+ }
+ }
+
+ spin_unlock_irq(&blkcg->lock);
+ return 0;
}
-static void cfq_put_cfqg(struct cfq_group *cfqg)
+static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
{
- struct cfq_rb_root *st;
- int i, j;
+ return __cfq_set_weight(css, cft, val, false);
+}
- BUG_ON(cfqg->ref <= 0);
- cfqg->ref--;
- if (cfqg->ref)
- return;
- for_each_cfqg_st(cfqg, i, j, st)
- BUG_ON(!RB_EMPTY_ROOT(&st->rb));
- free_percpu(cfqg->blkg.stats_cpu);
- kfree(cfqg);
+static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ return __cfq_set_weight(css, cft, val, true);
}
-static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static int cfqg_print_stat(struct seq_file *sf, void *v)
{
- /* Something wrong if we are trying to remove same group twice */
- BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_cfq, seq_cft(sf)->private, false);
+ return 0;
+}
- hlist_del_init(&cfqg->cfqd_node);
+static int cfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_cfq, seq_cft(sf)->private, true);
+ return 0;
+}
- BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
- cfqd->nr_blkcg_linked_grps--;
+static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
- /*
- * Put the reference taken at the time of creation so that when all
- * queues are gone, group can be destroyed.
- */
- cfq_put_cfqg(cfqg);
+ return __blkg_prfill_u64(sf, pd, sum);
}
-static void cfq_release_cfq_groups(struct cfq_data *cfqd)
+static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
{
- struct hlist_node *pos, *n;
- struct cfq_group *cfqg;
+ struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
- hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
- /*
- * If cgroup removal path got to blk_group first and removed
- * it from cgroup list, then it will take care of destroying
- * cfqg also.
- */
- if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
- cfq_destroy_cfqg(cfqd, cfqg);
- }
+ return __blkg_prfill_rwstat(sf, pd, &sum);
}
-/*
- * Blk cgroup controller notification saying that blkio_group object is being
- * delinked as associated cgroup object is going away. That also means that
- * no new IO will come in this group. So get rid of this group as soon as
- * any pending IO in the group is finished.
- *
- * This function is called under rcu_read_lock(). key is the rcu protected
- * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
- * read lock.
- *
- * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
- * it should not be NULL as even if elevator was exiting, cgroup deltion
- * path got to it first.
- */
-static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
- unsigned long flags;
- struct cfq_data *cfqd = key;
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
+ seq_cft(sf)->private, false);
+ return 0;
+}
- spin_lock_irqsave(cfqd->queue->queue_lock, flags);
- cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
- spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
+ seq_cft(sf)->private, true);
+ return 0;
}
-#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
{
- return &cfqd->root_group;
+ struct cfq_group *cfqg = pd_to_cfqg(pd);
+ u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
+ u64 v = 0;
+
+ if (samples) {
+ v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
+ v = div64_u64(v, samples);
+ }
+ __blkg_prfill_u64(sf, pd, v);
+ return 0;
}
-static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+/* print avg_queue_size */
+static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
- return cfqg;
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
+ 0, false);
+ return 0;
+}
+#endif /* CONFIG_DEBUG_BLK_CGROUP */
+
+static struct cftype cfq_blkcg_files[] = {
+ /* on root, weight is mapped to leaf_weight */
+ {
+ .name = "weight_device",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cfqg_print_leaf_weight_device,
+ .write = cfqg_set_leaf_weight_device,
+ },
+ {
+ .name = "weight",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cfq_print_leaf_weight,
+ .write_u64 = cfq_set_leaf_weight,
+ },
+
+ /* no such mapping necessary for !roots */
+ {
+ .name = "weight_device",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cfqg_print_weight_device,
+ .write = cfqg_set_weight_device,
+ },
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cfq_print_weight,
+ .write_u64 = cfq_set_weight,
+ },
+
+ {
+ .name = "leaf_weight_device",
+ .seq_show = cfqg_print_leaf_weight_device,
+ .write = cfqg_set_leaf_weight_device,
+ },
+ {
+ .name = "leaf_weight",
+ .seq_show = cfq_print_leaf_weight,
+ .write_u64 = cfq_set_leaf_weight,
+ },
+
+ /* statistics, covers only the tasks in the cfqg */
+ {
+ .name = "time",
+ .private = offsetof(struct cfq_group, stats.time),
+ .seq_show = cfqg_print_stat,
+ },
+ {
+ .name = "sectors",
+ .private = offsetof(struct cfq_group, stats.sectors),
+ .seq_show = cfqg_print_stat,
+ },
+ {
+ .name = "io_service_bytes",
+ .private = offsetof(struct cfq_group, stats.service_bytes),
+ .seq_show = cfqg_print_rwstat,
+ },
+ {
+ .name = "io_serviced",
+ .private = offsetof(struct cfq_group, stats.serviced),
+ .seq_show = cfqg_print_rwstat,
+ },
+ {
+ .name = "io_service_time",
+ .private = offsetof(struct cfq_group, stats.service_time),
+ .seq_show = cfqg_print_rwstat,
+ },
+ {
+ .name = "io_wait_time",
+ .private = offsetof(struct cfq_group, stats.wait_time),
+ .seq_show = cfqg_print_rwstat,
+ },
+ {
+ .name = "io_merged",
+ .private = offsetof(struct cfq_group, stats.merged),
+ .seq_show = cfqg_print_rwstat,
+ },
+ {
+ .name = "io_queued",
+ .private = offsetof(struct cfq_group, stats.queued),
+ .seq_show = cfqg_print_rwstat,
+ },
+
+ /* the same statictics which cover the cfqg and its descendants */
+ {
+ .name = "time_recursive",
+ .private = offsetof(struct cfq_group, stats.time),
+ .seq_show = cfqg_print_stat_recursive,
+ },
+ {
+ .name = "sectors_recursive",
+ .private = offsetof(struct cfq_group, stats.sectors),
+ .seq_show = cfqg_print_stat_recursive,
+ },
+ {
+ .name = "io_service_bytes_recursive",
+ .private = offsetof(struct cfq_group, stats.service_bytes),
+ .seq_show = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_serviced_recursive",
+ .private = offsetof(struct cfq_group, stats.serviced),
+ .seq_show = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_service_time_recursive",
+ .private = offsetof(struct cfq_group, stats.service_time),
+ .seq_show = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_wait_time_recursive",
+ .private = offsetof(struct cfq_group, stats.wait_time),
+ .seq_show = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_merged_recursive",
+ .private = offsetof(struct cfq_group, stats.merged),
+ .seq_show = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_queued_recursive",
+ .private = offsetof(struct cfq_group, stats.queued),
+ .seq_show = cfqg_print_rwstat_recursive,
+ },
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ {
+ .name = "avg_queue_size",
+ .seq_show = cfqg_print_avg_queue_size,
+ },
+ {
+ .name = "group_wait_time",
+ .private = offsetof(struct cfq_group, stats.group_wait_time),
+ .seq_show = cfqg_print_stat,
+ },
+ {
+ .name = "idle_time",
+ .private = offsetof(struct cfq_group, stats.idle_time),
+ .seq_show = cfqg_print_stat,
+ },
+ {
+ .name = "empty_time",
+ .private = offsetof(struct cfq_group, stats.empty_time),
+ .seq_show = cfqg_print_stat,
+ },
+ {
+ .name = "dequeue",
+ .private = offsetof(struct cfq_group, stats.dequeue),
+ .seq_show = cfqg_print_stat,
+ },
+ {
+ .name = "unaccounted_time",
+ .private = offsetof(struct cfq_group, stats.unaccounted_time),
+ .seq_show = cfqg_print_stat,
+ },
+#endif /* CONFIG_DEBUG_BLK_CGROUP */
+ { } /* terminate */
+};
+#else /* GROUP_IOSCHED */
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+ struct blkcg *blkcg)
+{
+ return cfqd->root_group;
}
static inline void
@@ -1281,9 +1999,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
cfqq->cfqg = cfqg;
}
-static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
-static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
-
#endif /* GROUP_IOSCHED */
/*
@@ -1297,15 +2012,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct rb_node **p, *parent;
struct cfq_queue *__cfqq;
unsigned long rb_key;
- struct cfq_rb_root *service_tree;
+ struct cfq_rb_root *st;
int left;
int new_cfqq = 1;
- service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
- cfqq_type(cfqq));
+ st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
if (cfq_class_idle(cfqq)) {
rb_key = CFQ_IDLE_DELAY;
- parent = rb_last(&service_tree->rb);
+ parent = rb_last(&st->rb);
if (parent && parent != &cfqq->rb_node) {
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
rb_key += __cfqq->rb_key;
@@ -1323,7 +2037,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->slice_resid = 0;
} else {
rb_key = -HZ;
- __cfqq = cfq_rb_first(service_tree);
+ __cfqq = cfq_rb_first(st);
rb_key += __cfqq ? __cfqq->rb_key : jiffies;
}
@@ -1332,8 +2046,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
/*
* same position, nothing more to do
*/
- if (rb_key == cfqq->rb_key &&
- cfqq->service_tree == service_tree)
+ if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
return;
cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
@@ -1342,11 +2055,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
left = 1;
parent = NULL;
- cfqq->service_tree = service_tree;
- p = &service_tree->rb.rb_node;
+ cfqq->service_tree = st;
+ p = &st->rb.rb_node;
while (*p) {
- struct rb_node **n;
-
parent = *p;
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
@@ -1354,22 +2065,20 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* sort by key, that represents service time.
*/
if (time_before(rb_key, __cfqq->rb_key))
- n = &(*p)->rb_left;
+ p = &parent->rb_left;
else {
- n = &(*p)->rb_right;
+ p = &parent->rb_right;
left = 0;
}
-
- p = n;
}
if (left)
- service_tree->left = &cfqq->rb_node;
+ st->left = &cfqq->rb_node;
cfqq->rb_key = rb_key;
rb_link_node(&cfqq->rb_node, parent, p);
- rb_insert_color(&cfqq->rb_node, &service_tree->rb);
- service_tree->count++;
+ rb_insert_color(&cfqq->rb_node, &st->rb);
+ st->count++;
if (add_front || !new_cfqq)
return;
cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
@@ -1550,12 +2259,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
{
elv_rb_del(&cfqq->sort_list, rq);
cfqq->queued[rq_is_sync(rq)]--;
- cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
- rq_data_dir(rq), rq_is_sync(rq));
+ cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
cfq_add_rq_rb(rq);
- cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
- &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
- rq_is_sync(rq));
+ cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
+ rq->cmd_flags);
}
static struct request *
@@ -1570,11 +2277,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
return NULL;
cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
- if (cfqq) {
- sector_t sector = bio->bi_sector + bio_sectors(bio);
-
- return elv_rb_find(&cfqq->sort_list, sector);
- }
+ if (cfqq)
+ return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
return NULL;
}
@@ -1611,8 +2315,7 @@ static void cfq_remove_request(struct request *rq)
cfq_del_rq_rb(rq);
cfqq->cfqd->rq_queued--;
- cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
- rq_data_dir(rq), rq_is_sync(rq));
+ cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
if (rq->cmd_flags & REQ_PRIO) {
WARN_ON(!cfqq->prio_pending);
cfqq->prio_pending--;
@@ -1647,8 +2350,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
static void cfq_bio_merged(struct request_queue *q, struct request *req,
struct bio *bio)
{
- cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
- bio_data_dir(bio), cfq_bio_sync(bio));
+ cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
}
static void
@@ -1662,16 +2364,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
* reposition in fifo if next is older than rq
*/
if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
- time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
+ time_before(next->fifo_time, rq->fifo_time) &&
+ cfqq == RQ_CFQQ(next)) {
list_move(&rq->queuelist, &next->queuelist);
- rq_set_fifo_time(rq, rq_fifo_time(next));
+ rq->fifo_time = next->fifo_time;
}
if (cfqq->next_rq == next)
cfqq->next_rq = rq;
cfq_remove_request(next);
- cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
- rq_data_dir(next), rq_is_sync(next));
+ cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
cfqq = RQ_CFQQ(next);
/*
@@ -1712,16 +2414,16 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
del_timer(&cfqd->idle_slice_timer);
- cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+ cfqg_stats_update_idle_time(cfqq->cfqg);
}
static void __cfq_set_active_queue(struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
if (cfqq) {
- cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
- cfqd->serving_prio, cfqd->serving_type);
- cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
+ cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
+ cfqd->serving_wl_class, cfqd->serving_wl_type);
+ cfqg_stats_update_avg_queue_size(cfqq->cfqg);
cfqq->slice_start = 0;
cfqq->dispatch_start = jiffies;
cfqq->allocated_slice = 0;
@@ -1806,19 +2508,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
*/
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
- struct cfq_rb_root *service_tree =
- service_tree_for(cfqd->serving_group, cfqd->serving_prio,
- cfqd->serving_type);
+ struct cfq_rb_root *st = st_for(cfqd->serving_group,
+ cfqd->serving_wl_class, cfqd->serving_wl_type);
if (!cfqd->rq_queued)
return NULL;
/* There is nothing to dispatch */
- if (!service_tree)
+ if (!st)
return NULL;
- if (RB_EMPTY_ROOT(&service_tree->rb))
+ if (RB_EMPTY_ROOT(&st->rb))
return NULL;
- return cfq_rb_first(service_tree);
+ return cfq_rb_first(st);
}
static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
@@ -1974,17 +2675,17 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- enum wl_prio_t prio = cfqq_prio(cfqq);
- struct cfq_rb_root *service_tree = cfqq->service_tree;
+ enum wl_class_t wl_class = cfqq_class(cfqq);
+ struct cfq_rb_root *st = cfqq->service_tree;
- BUG_ON(!service_tree);
- BUG_ON(!service_tree->count);
+ BUG_ON(!st);
+ BUG_ON(!st->count);
if (!cfqd->cfq_slice_idle)
return false;
/* We never do for idle class queues. */
- if (prio == IDLE_WORKLOAD)
+ if (wl_class == IDLE_WORKLOAD)
return false;
/* We do for queues that were marked with idle window flag. */
@@ -1996,11 +2697,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* Otherwise, we do only if they are the last ones
* in their service tree.
*/
- if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
- !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
+ if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
+ !cfq_io_thinktime_big(cfqd, &st->ttime, false))
return true;
- cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
- service_tree->count);
+ cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
return false;
}
@@ -2042,7 +2742,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
* task has exited, don't wait
*/
cic = cfqd->active_cic;
- if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
+ if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
return;
/*
@@ -2069,7 +2769,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
sl = cfqd->cfq_slice_idle;
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
- cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
+ cfqg_stats_set_start_idle_time(cfqq->cfqg);
cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
group_idle ? 1 : 0);
}
@@ -2092,8 +2792,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
cfqq->nr_sectors += blk_rq_sectors(rq);
- cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
- rq_data_dir(rq), rq_is_sync(rq));
+ cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
}
/*
@@ -2112,7 +2811,7 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
return NULL;
rq = rq_entry_fifo(cfqq->fifo.next);
- if (time_before(jiffies, rq_fifo_time(rq)))
+ if (time_before(jiffies, rq->fifo_time))
rq = NULL;
cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
@@ -2184,8 +2883,8 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
}
}
-static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
- struct cfq_group *cfqg, enum wl_prio_t prio)
+static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
+ struct cfq_group *cfqg, enum wl_class_t wl_class)
{
struct cfq_queue *queue;
int i;
@@ -2195,7 +2894,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
for (i = 0; i <= SYNC_WORKLOAD; ++i) {
/* select the one with lowest rb_key */
- queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
+ queue = cfq_rb_first(st_for(cfqg, wl_class, i));
if (queue &&
(!key_valid || time_before(queue->rb_key, lowest_key))) {
lowest_key = queue->rb_key;
@@ -2207,26 +2906,27 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
return cur_best;
}
-static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static void
+choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
unsigned slice;
unsigned count;
struct cfq_rb_root *st;
unsigned group_slice;
- enum wl_prio_t original_prio = cfqd->serving_prio;
+ enum wl_class_t original_class = cfqd->serving_wl_class;
/* Choose next priority. RT > BE > IDLE */
if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = RT_WORKLOAD;
+ cfqd->serving_wl_class = RT_WORKLOAD;
else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
- cfqd->serving_prio = BE_WORKLOAD;
+ cfqd->serving_wl_class = BE_WORKLOAD;
else {
- cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->serving_wl_class = IDLE_WORKLOAD;
cfqd->workload_expires = jiffies + 1;
return;
}
- if (original_prio != cfqd->serving_prio)
+ if (original_class != cfqd->serving_wl_class)
goto new_workload;
/*
@@ -2234,7 +2934,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
* (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
* expiration time
*/
- st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+ st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
count = st->count;
/*
@@ -2245,9 +2945,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
new_workload:
/* otherwise select new workload type */
- cfqd->serving_type =
- cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
- st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+ cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
+ cfqd->serving_wl_class);
+ st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
count = st->count;
/*
@@ -2258,10 +2958,11 @@ new_workload:
group_slice = cfq_group_slice(cfqd, cfqg);
slice = group_slice * count /
- max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
- cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
+ max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
+ cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
+ cfqg));
- if (cfqd->serving_type == ASYNC_WORKLOAD) {
+ if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
unsigned int tmp;
/*
@@ -2271,7 +2972,8 @@ new_workload:
* to have higher weight. A more accurate thing would be to
* calculate system wide asnc/sync ratio.
*/
- tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
+ tmp = cfqd->cfq_target_latency *
+ cfqg_busy_async_queues(cfqd, cfqg);
tmp = tmp/cfqd->busy_queues;
slice = min_t(unsigned, slice, tmp);
@@ -2306,14 +3008,14 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
cfqd->serving_group = cfqg;
/* Restore the workload type data */
- if (cfqg->saved_workload_slice) {
- cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
- cfqd->serving_type = cfqg->saved_workload;
- cfqd->serving_prio = cfqg->saved_serving_prio;
+ if (cfqg->saved_wl_slice) {
+ cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
+ cfqd->serving_wl_type = cfqg->saved_wl_type;
+ cfqd->serving_wl_class = cfqg->saved_wl_class;
} else
cfqd->workload_expires = jiffies - 1;
- choose_service_tree(cfqd, cfqg);
+ choose_wl_class_and_type(cfqd, cfqg);
}
/*
@@ -2675,7 +3377,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
BUG_ON(cfq_cfqq_on_rr(cfqq));
kmem_cache_free(cfq_pool, cfqq);
- cfq_put_cfqg(cfqg);
+ cfqg_put(cfqg);
}
static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2734,7 +3436,7 @@ static void cfq_exit_icq(struct io_cq *icq)
}
}
-static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
+static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
{
struct task_struct *tsk = current;
int ioprio_class;
@@ -2742,7 +3444,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
if (!cfq_cfqq_prio_changed(cfqq))
return;
- ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
+ ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
switch (ioprio_class) {
default:
printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -2754,11 +3456,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
cfqq->ioprio_class = task_nice_ioclass(tsk);
break;
case IOPRIO_CLASS_RT:
- cfqq->ioprio = task_ioprio(ioc);
+ cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
cfqq->ioprio_class = IOPRIO_CLASS_RT;
break;
case IOPRIO_CLASS_BE:
- cfqq->ioprio = task_ioprio(ioc);
+ cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
cfqq->ioprio_class = IOPRIO_CLASS_BE;
break;
case IOPRIO_CLASS_IDLE:
@@ -2776,19 +3478,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
cfq_clear_cfqq_prio_changed(cfqq);
}
-static void changed_ioprio(struct cfq_io_cq *cic)
+static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
{
+ int ioprio = cic->icq.ioc->ioprio;
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
- if (unlikely(!cfqd))
+ /*
+ * Check whether ioprio has changed. The condition may trigger
+ * spuriously on a newly created cic but there's no harm.
+ */
+ if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
return;
cfqq = cic->cfqq[BLK_RW_ASYNC];
if (cfqq) {
struct cfq_queue *new_cfqq;
- new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
- GFP_ATOMIC);
+ new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
+ GFP_ATOMIC);
if (new_cfqq) {
cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
cfq_put_queue(cfqq);
@@ -2798,6 +3505,8 @@ static void changed_ioprio(struct cfq_io_cq *cic)
cfqq = cic->cfqq[BLK_RW_SYNC];
if (cfqq)
cfq_mark_cfqq_prio_changed(cfqq);
+
+ cic->ioprio = ioprio;
}
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2821,17 +3530,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct cfq_io_cq *cic)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
- struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
struct cfq_data *cfqd = cic_to_cfqd(cic);
- struct request_queue *q;
+ struct cfq_queue *sync_cfqq;
+ uint64_t id;
- if (unlikely(!cfqd))
- return;
+ rcu_read_lock();
+ id = bio_blkcg(bio)->id;
+ rcu_read_unlock();
- q = cfqd->queue;
+ /*
+ * Check whether blkcg has changed. The condition may trigger
+ * spuriously on a newly created cic but there's no harm.
+ */
+ if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
+ return;
+ sync_cfqq = cic_to_cfqq(cic, 1);
if (sync_cfqq) {
/*
* Drop reference to sync queue. A new sync queue will be
@@ -2841,21 +3557,26 @@ static void changed_cgroup(struct cfq_io_cq *cic)
cic_set_cfqq(cic, NULL, 1);
cfq_put_queue(sync_cfqq);
}
+
+ cic->blkcg_id = id;
}
+#else
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
- struct io_context *ioc, gfp_t gfp_mask)
+cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
+ struct bio *bio, gfp_t gfp_mask)
{
+ struct blkcg *blkcg;
struct cfq_queue *cfqq, *new_cfqq = NULL;
- struct cfq_io_cq *cic;
struct cfq_group *cfqg;
retry:
- cfqg = cfq_get_cfqg(cfqd);
- cic = cfq_cic_lookup(cfqd, ioc);
- /* cic always exists here */
+ rcu_read_lock();
+
+ blkcg = bio_blkcg(bio);
+ cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
cfqq = cic_to_cfqq(cic, is_sync);
/*
@@ -2868,6 +3589,7 @@ retry:
cfqq = new_cfqq;
new_cfqq = NULL;
} else if (gfp_mask & __GFP_WAIT) {
+ rcu_read_unlock();
spin_unlock_irq(cfqd->queue->queue_lock);
new_cfqq = kmem_cache_alloc_node(cfq_pool,
gfp_mask | __GFP_ZERO,
@@ -2875,6 +3597,8 @@ retry:
spin_lock_irq(cfqd->queue->queue_lock);
if (new_cfqq)
goto retry;
+ else
+ return &cfqd->oom_cfqq;
} else {
cfqq = kmem_cache_alloc_node(cfq_pool,
gfp_mask | __GFP_ZERO,
@@ -2883,7 +3607,7 @@ retry:
if (cfqq) {
cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
- cfq_init_prio_data(cfqq, ioc);
+ cfq_init_prio_data(cfqq, cic);
cfq_link_cfqq_cfqg(cfqq, cfqg);
cfq_log_cfqq(cfqd, cfqq, "alloced");
} else
@@ -2893,6 +3617,7 @@ retry:
if (new_cfqq)
kmem_cache_free(cfq_pool, new_cfqq);
+ rcu_read_unlock();
return cfqq;
}
@@ -2902,6 +3627,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
switch (ioprio_class) {
case IOPRIO_CLASS_RT:
return &cfqd->async_cfqq[0][ioprio];
+ case IOPRIO_CLASS_NONE:
+ ioprio = IOPRIO_NORM;
+ /* fall through */
case IOPRIO_CLASS_BE:
return &cfqd->async_cfqq[1][ioprio];
case IOPRIO_CLASS_IDLE:
@@ -2912,11 +3640,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
}
static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
- gfp_t gfp_mask)
+cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
+ struct bio *bio, gfp_t gfp_mask)
{
- const int ioprio = task_ioprio(ioc);
- const int ioprio_class = task_ioprio_class(ioc);
+ const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
+ const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
struct cfq_queue **async_cfqq = NULL;
struct cfq_queue *cfqq = NULL;
@@ -2926,7 +3654,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
}
if (!cfqq)
- cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
+ cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
/*
* pin the queue now that it's allocated, scheduler exit will prune it
@@ -3008,7 +3736,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
enable_idle = 0;
- else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
+ else if (!atomic_read(&cic->icq.ioc->active_ref) ||
!cfqd->cfq_slice_idle ||
(!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
enable_idle = 0;
@@ -3068,7 +3796,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
return true;
/* Allow preemption only if we are idling on sync-noidle tree */
- if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+ if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
new_cfqq->service_tree->count == 2 &&
RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3120,7 +3848,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* doesn't happen
*/
if (old_type != cfqq_type(cfqq))
- cfqq->cfqg->saved_workload_slice = 0;
+ cfqq->cfqg->saved_wl_slice = 0;
/*
* Put the new queue at the front of the of the current list,
@@ -3172,8 +3900,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfq_clear_cfqq_wait_request(cfqq);
__blk_run_queue(cfqd->queue);
} else {
- cfq_blkiocg_update_idle_time_stats(
- &cfqq->cfqg->blkg);
+ cfqg_stats_update_idle_time(cfqq->cfqg);
cfq_mark_cfqq_must_dispatch(cfqq);
}
}
@@ -3195,14 +3922,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
struct cfq_queue *cfqq = RQ_CFQQ(rq);
cfq_log_cfqq(cfqd, cfqq, "insert_request");
- cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
+ cfq_init_prio_data(cfqq, RQ_CIC(rq));
- rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
+ rq->fifo_time = jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &cfqq->fifo);
cfq_add_rq_rb(rq);
- cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
- &cfqd->serving_group->blkg, rq_data_dir(rq),
- rq_is_sync(rq));
+ cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
+ rq->cmd_flags);
cfq_rq_enqueued(cfqd, cfqq, rq);
}
@@ -3298,23 +4024,23 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfqd->rq_in_driver--;
cfqq->dispatched--;
(RQ_CFQG(rq))->dispatched--;
- cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
- rq_start_time_ns(rq), rq_io_start_time_ns(rq),
- rq_data_dir(rq), rq_is_sync(rq));
+ cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
+ rq_io_start_time_ns(rq), rq->cmd_flags);
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
if (sync) {
- struct cfq_rb_root *service_tree;
+ struct cfq_rb_root *st;
RQ_CIC(rq)->ttime.last_end_request = now;
if (cfq_cfqq_on_rr(cfqq))
- service_tree = cfqq->service_tree;
+ st = cfqq->service_tree;
else
- service_tree = service_tree_for(cfqq->cfqg,
- cfqq_prio(cfqq), cfqq_type(cfqq));
- service_tree->ttime.last_end_request = now;
+ st = st_for(cfqq->cfqg, cfqq_class(cfqq),
+ cfqq_type(cfqq));
+
+ st->ttime.last_end_request = now;
if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
cfqd->last_delayed_sync = now;
}
@@ -3397,7 +4123,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
if (cfqq) {
- cfq_init_prio_data(cfqq, cic->icq.ioc);
+ cfq_init_prio_data(cfqq, cic);
return __cfq_may_queue(cfqq);
}
@@ -3419,7 +4145,7 @@ static void cfq_put_request(struct request *rq)
cfqq->allocated[rw]--;
/* Put down rq reference on cfqg */
- cfq_put_cfqg(RQ_CFQG(rq));
+ cfqg_put(RQ_CFQG(rq));
rq->elv.priv[0] = NULL;
rq->elv.priv[1] = NULL;
@@ -3463,7 +4189,8 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
* Allocate cfq data structures associated with this request.
*/
static int
-cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
@@ -3475,20 +4202,12 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
spin_lock_irq(q->queue_lock);
- /* handle changed notifications */
- if (unlikely(cic->icq.changed)) {
- if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
- changed_ioprio(cic);
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
- if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
- changed_cgroup(cic);
-#endif
- }
-
+ check_ioprio_changed(cic, bio);
+ check_blkcg_changed(cic, bio);
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
+ cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
cic_set_cfqq(cic, cfqq, is_sync);
} else {
/*
@@ -3514,8 +4233,9 @@ new_queue:
cfqq->allocated[rw]++;
cfqq->ref++;
+ cfqg_get(cfqq->cfqg);
rq->elv.priv[0] = cfqq;
- rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
+ rq->elv.priv[1] = cfqq->cfqg;
spin_unlock_irq(q->queue_lock);
return 0;
}
@@ -3612,7 +4332,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
{
struct cfq_data *cfqd = e->elevator_data;
struct request_queue *q = cfqd->queue;
- bool wait = false;
cfq_shutdown_timer_wq(cfqd);
@@ -3622,89 +4341,64 @@ static void cfq_exit_queue(struct elevator_queue *e)
__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
cfq_put_async_queues(cfqd);
- cfq_release_cfq_groups(cfqd);
-
- /*
- * If there are groups which we could not unlink from blkcg list,
- * wait for a rcu period for them to be freed.
- */
- if (cfqd->nr_blkcg_linked_grps)
- wait = true;
spin_unlock_irq(q->queue_lock);
cfq_shutdown_timer_wq(cfqd);
- /*
- * Wait for cfqg->blkg->key accessors to exit their grace periods.
- * Do this wait only if there are other unlinked groups out
- * there. This can happen if cgroup deletion path claimed the
- * responsibility of cleaning up a group before queue cleanup code
- * get to the group.
- *
- * Do not call synchronize_rcu() unconditionally as there are drivers
- * which create/delete request queue hundreds of times during scan/boot
- * and synchronize_rcu() can take significant time and slow down boot.
- */
- if (wait)
- synchronize_rcu();
-
#ifdef CONFIG_CFQ_GROUP_IOSCHED
- /* Free up per cpu stats for root group */
- free_percpu(cfqd->root_group.blkg.stats_cpu);
+ blkcg_deactivate_policy(q, &blkcg_policy_cfq);
+#else
+ kfree(cfqd->root_group);
#endif
kfree(cfqd);
}
-static void *cfq_init_queue(struct request_queue *q)
+static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct cfq_data *cfqd;
- int i, j;
- struct cfq_group *cfqg;
- struct cfq_rb_root *st;
+ struct blkcg_gq *blkg __maybe_unused;
+ int i, ret;
+ struct elevator_queue *eq;
- cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!cfqd)
- return NULL;
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
- /* Init root service tree */
- cfqd->grp_service_tree = CFQ_RB_ROOT;
+ cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
+ if (!cfqd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = cfqd;
- /* Init root group */
- cfqg = &cfqd->root_group;
- for_each_cfqg_st(cfqg, i, j, st)
- *st = CFQ_RB_ROOT;
- RB_CLEAR_NODE(&cfqg->rb_node);
+ cfqd->queue = q;
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
- /* Give preference to root group over other groups */
- cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+ /* Init root service tree */
+ cfqd->grp_service_tree = CFQ_RB_ROOT;
+ /* Init root group and prefer root group over other groups by default */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
- /*
- * Set root group reference to 2. One reference will be dropped when
- * all groups on cfqd->cfqg_list are being deleted during queue exit.
- * Other reference will remain there as we don't want to delete this
- * group as it is statically allocated and gets destroyed when
- * throtl_data goes away.
- */
- cfqg->ref = 2;
-
- if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
- kfree(cfqg);
- kfree(cfqd);
- return NULL;
- }
-
- rcu_read_lock();
+ ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
+ if (ret)
+ goto out_free;
- cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
- (void *)cfqd, 0);
- rcu_read_unlock();
- cfqd->nr_blkcg_linked_grps++;
+ cfqd->root_group = blkg_to_cfqg(q->root_blkg);
+#else
+ ret = -ENOMEM;
+ cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
+ GFP_KERNEL, cfqd->queue->node);
+ if (!cfqd->root_group)
+ goto out_free;
- /* Add group on cfqd->cfqg_list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ cfq_init_cfqg_base(cfqd->root_group);
#endif
+ cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
+ cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
+
/*
* Not strictly needed (since RB_ROOT just clears the node and we
* zeroed cfqd on alloc), but better be safe in case someone decides
@@ -3716,13 +4410,17 @@ static void *cfq_init_queue(struct request_queue *q)
/*
* Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
* Grab a permanent reference to it, so that the normal code flow
- * will not attempt to free it.
+ * will not attempt to free it. oom_cfqq is linked to root_group
+ * but shouldn't hold a reference as it'll never be unlinked. Lose
+ * the reference from linking right away.
*/
cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
cfqd->oom_cfqq.ref++;
- cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
- cfqd->queue = q;
+ spin_lock_irq(q->queue_lock);
+ cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
+ cfqg_put(cfqd->root_group);
+ spin_unlock_irq(q->queue_lock);
init_timer(&cfqd->idle_slice_timer);
cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -3737,6 +4435,7 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_back_penalty = cfq_back_penalty;
cfqd->cfq_slice[0] = cfq_slice_async;
cfqd->cfq_slice[1] = cfq_slice_sync;
+ cfqd->cfq_target_latency = cfq_target_latency;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
cfqd->cfq_group_idle = cfq_group_idle;
@@ -3747,7 +4446,12 @@ static void *cfq_init_queue(struct request_queue *q)
* second, in order to have larger depth for async operations.
*/
cfqd->last_delayed_sync = jiffies - HZ;
- return cfqd;
+ return 0;
+
+out_free:
+ kfree(cfqd);
+ kobject_put(&eq->kobj);
+ return ret;
}
/*
@@ -3756,7 +4460,7 @@ static void *cfq_init_queue(struct request_queue *q)
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
- return sprintf(page, "%d\n", var);
+ return sprintf(page, "%u\n", var);
}
static ssize_t
@@ -3788,6 +4492,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
+SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -3821,6 +4526,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
+STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
#undef STORE_FUNCTION
#define CFQ_ATTR(name) \
@@ -3838,6 +4544,7 @@ static struct elv_fs_entry cfq_attrs[] = {
CFQ_ATTR(slice_idle),
CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
+ CFQ_ATTR(target_latency),
__ATTR_NULL
};
@@ -3871,15 +4578,14 @@ static struct elevator_type iosched_cfq = {
};
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static struct blkio_policy_type blkio_policy_cfq = {
- .ops = {
- .blkio_unlink_group_fn = cfq_unlink_blkio_group,
- .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
- },
- .plid = BLKIO_POLICY_PROP,
+static struct blkcg_policy blkcg_policy_cfq = {
+ .pd_size = sizeof(struct cfq_group),
+ .cftypes = cfq_blkcg_files,
+
+ .pd_init_fn = cfq_pd_init,
+ .pd_offline_fn = cfq_pd_offline,
+ .pd_reset_stats_fn = cfq_pd_reset_stats,
};
-#else
-static struct blkio_policy_type blkio_policy_cfq;
#endif
static int __init cfq_init(void)
@@ -3897,27 +4603,39 @@ static int __init cfq_init(void)
#ifdef CONFIG_CFQ_GROUP_IOSCHED
if (!cfq_group_idle)
cfq_group_idle = 1;
+
+ ret = blkcg_policy_register(&blkcg_policy_cfq);
+ if (ret)
+ return ret;
#else
- cfq_group_idle = 0;
+ cfq_group_idle = 0;
#endif
+
+ ret = -ENOMEM;
cfq_pool = KMEM_CACHE(cfq_queue, 0);
if (!cfq_pool)
- return -ENOMEM;
+ goto err_pol_unreg;
ret = elv_register(&iosched_cfq);
- if (ret) {
- kmem_cache_destroy(cfq_pool);
- return ret;
- }
-
- blkio_policy_register(&blkio_policy_cfq);
+ if (ret)
+ goto err_free_pool;
return 0;
+
+err_free_pool:
+ kmem_cache_destroy(cfq_pool);
+err_pol_unreg:
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_cfq);
+#endif
+ return ret;
}
static void __exit cfq_exit(void)
{
- blkio_policy_unregister(&blkio_policy_cfq);
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_cfq);
+#endif
elv_unregister(&iosched_cfq);
kmem_cache_destroy(cfq_pool);
}