diff options
Diffstat (limited to 'block/cfq-iosched.c')
-rw-r--r-- | block/cfq-iosched.c | 1493 |
1 files changed, 1338 insertions, 155 deletions
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index aa1e9535e35..cfb0b2f5f63 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -9,9 +9,11 @@ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/elevator.h> +#include <linux/jiffies.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> +#include "blk-cgroup.h" /* * tunables @@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ +static const int cfq_hist_divisor = 4; /* * offset from end of service tree @@ -38,8 +42,15 @@ static int cfq_slice_idle = HZ / 125; */ #define CFQ_MIN_TT (2) +/* + * Allow merged cfqqs to perform this amount of seeky I/O before + * deciding to break the queues up again. + */ +#define CFQQ_COOP_TOUT (HZ) + #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) +#define CFQ_SERVICE_SHIFT 12 #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -57,6 +68,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock); #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) +#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* * Most of our rbtree usage is for sorting with min extraction, so @@ -67,8 +79,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock); struct cfq_rb_root { struct rb_root rb; struct rb_node *left; + unsigned count; + u64 min_vdisktime; + struct rb_node *active; + unsigned total_weight; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } /* * Per process-grouping structure @@ -99,6 +115,11 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when queue got scheduled in to dispatch first request. */ + unsigned long dispatch_start; + unsigned int allocated_slice; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; unsigned long slice_end; long slice_resid; unsigned int slice_dispatch; @@ -112,7 +133,71 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + unsigned long seeky_start; + pid_t pid; + + struct cfq_rb_root *service_tree; + struct cfq_queue *new_cfqq; + struct cfq_group *cfqg; + struct cfq_group *orig_cfqg; + /* Sectors dispatched in current dispatch round */ + unsigned long nr_sectors; +}; + +/* + * First index in the service_trees. + * IDLE is handled separately, so it has negative index + */ +enum wl_prio_t { + BE_WORKLOAD = 0, + RT_WORKLOAD = 1, + IDLE_WORKLOAD = 2, +}; + +/* + * Second index in the service_trees. + */ +enum wl_type_t { + ASYNC_WORKLOAD = 0, + SYNC_NOIDLE_WORKLOAD = 1, + SYNC_WORKLOAD = 2 +}; + +/* This is per cgroup per device grouping structure */ +struct cfq_group { + /* group service_tree member */ + struct rb_node rb_node; + + /* group service_tree key */ + u64 vdisktime; + unsigned int weight; + bool on_st; + + /* number of cfqq currently on this group */ + int nr_cfqq; + + /* Per group busy queus average. Useful for workload slice calc. */ + unsigned int busy_queues_avg[2]; + /* + * rr lists of queues with requests, onle rr for each priority class. + * Counts are embedded in the cfq_rb_root + */ + struct cfq_rb_root service_trees[2][3]; + struct cfq_rb_root service_tree_idle; + + unsigned long saved_workload_slice; + enum wl_type_t saved_workload; + enum wl_prio_t saved_serving_prio; + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; + atomic_t ref; +#endif }; /* @@ -120,11 +205,20 @@ struct cfq_queue { */ struct cfq_data { struct request_queue *queue; + /* Root service tree for cfq_groups */ + struct cfq_rb_root grp_service_tree; + struct cfq_group root_group; + /* Number of active cfq groups on group service tree */ + int nr_groups; /* - * rr list of queues with requests and the count of them + * The priority currently being served */ - struct cfq_rb_root service_tree; + enum wl_prio_t serving_prio; + enum wl_type_t serving_type; + unsigned long workload_expires; + struct cfq_group *serving_group; + bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -143,8 +237,14 @@ struct cfq_data { */ int rq_queued; int hw_tag; - int hw_tag_samples; - int rq_in_driver_peak; + /* + * hw_tag can be + * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) + * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) + * 0 => no NCQ + */ + int hw_tag_est_depth; + unsigned int hw_tag_samples; /* * idle window management @@ -174,6 +274,7 @@ struct cfq_data { unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; unsigned int cfq_latency; + unsigned int cfq_group_isolation; struct list_head cic_list; @@ -183,8 +284,28 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_end_sync_rq; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; + struct rcu_head rcu; }; +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); + +static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, + enum wl_prio_t prio, + enum wl_type_t type, + struct cfq_data *cfqd) +{ + if (!cfqg) + return NULL; + + if (prio == IDLE_WORKLOAD) + return &cfqg->service_tree_idle; + + return &cfqg->service_trees[prio][type]; +} + enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -195,8 +316,10 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ - CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ - CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ + CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ + CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ + CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */ }; #define CFQ_CFQQ_FNS(name) \ @@ -223,14 +346,78 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); -CFQ_CFQQ_FNS(coop_preempt); +CFQ_CFQQ_FNS(deep); +CFQ_CFQQ_FNS(wait_busy); +CFQ_CFQQ_FNS(wait_busy_done); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args); + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args); \ + +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +/* Traverses through cfq group service trees */ +#define for_each_cfqg_st(cfqg, i, j, st) \ + for (i = 0; i <= IDLE_WORKLOAD; i++) \ + for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ + : &cfqg->service_tree_idle; \ + (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ + (i == IDLE_WORKLOAD && j == 0); \ + j++, st = i < IDLE_WORKLOAD ? \ + &cfqg->service_trees[i][j]: NULL) \ + + +static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +{ + if (cfq_class_idle(cfqq)) + return IDLE_WORKLOAD; + if (cfq_class_rt(cfqq)) + return RT_WORKLOAD; + return BE_WORKLOAD; +} + + +static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) +{ + if (!cfq_cfqq_sync(cfqq)) + return ASYNC_WORKLOAD; + if (!cfq_cfqq_idle_window(cfqq)) + return SYNC_NOIDLE_WORKLOAD; + return SYNC_WORKLOAD; +} + +static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + if (wl == IDLE_WORKLOAD) + return cfqg->service_tree_idle.count; + + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; +} + +static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; +} + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); @@ -279,7 +466,7 @@ static int cfq_queue_empty(struct request_queue *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; + return !cfqd->rq_queued; } /* @@ -303,10 +490,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + d = d * BLKIO_WEIGHT_DEFAULT; + do_div(d, cfqg->weight); + return d; +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_rb_root *st) +{ + u64 vdisktime = st->min_vdisktime; + struct cfq_group *cfqg; + + if (st->active) { + cfqg = rb_entry_cfqg(st->active); + vdisktime = cfqg->vdisktime; + } + + if (st->left) { + cfqg = rb_entry_cfqg(st->left); + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + +/* + * get averaged number of queues of RT/BE priority. + * average is updated, with a formula that gives more weight to higher numbers, + * to quickly follows sudden increases and decrease slowly + */ + +static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg, bool rt) +{ + unsigned min_q, max_q; + unsigned mult = cfq_hist_divisor - 1; + unsigned round = cfq_hist_divisor / 2; + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); + + min_q = min(cfqg->busy_queues_avg[rt], busy); + max_q = max(cfqg->busy_queues_avg[rt], busy); + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + cfq_hist_divisor; + return cfqg->busy_queues_avg[rt]; +} + +static inline unsigned +cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + return cfq_target_latency * cfqg->weight / st->total_weight; +} + static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + unsigned slice = cfq_prio_to_slice(cfqd, cfqq); + if (cfqd->cfq_latency) { + /* + * interested queues (we consider only the ones with the same + * priority class in the cfq group) + */ + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, + cfq_class_rt(cfqq)); + unsigned sync_slice = cfqd->cfq_slice[1]; + unsigned expect_latency = sync_slice * iq; + unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); + + if (expect_latency > group_slice) { + unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; + /* scale low_slice according to IO priority + * and sync vs async */ + unsigned low_slice = + min(slice, base_low_slice * slice / sync_slice); + /* the adapted slice value is scaled to fit all iqs + * into the target latency */ + slice = max(slice * group_slice / expect_latency, + low_slice); + } + } + cfqq->slice_start = jiffies; + cfqq->slice_end = jiffies + slice; + cfqq->allocated_slice = slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -331,9 +618,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq) * behind the head is penalized and only allowed to a certain extent. */ static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) +cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) { - sector_t last, s1, s2, d1 = 0, d2 = 0; + sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ @@ -356,8 +643,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); - last = cfqd->last_position; - /* * by definition, 1KiB is 2 sectors */ @@ -425,6 +710,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) */ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) { + /* Service tree is empty */ + if (!root->count) + return NULL; + if (!root->left) root->left = rb_first(&root->rb); @@ -434,6 +723,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) return NULL; } +static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_cfqg(root->left); + + return NULL; +} + static void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); @@ -445,6 +745,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) if (root->left == n) root->left = NULL; rb_erase_init(n, &root->rb); + --root->count; } /* @@ -471,7 +772,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, next = rb_entry_rq(rbnext); } - return cfq_choose_req(cfqd, next, prev); + return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); } static unsigned long cfq_slice_offset(struct cfq_data *cfqd, @@ -480,12 +781,336 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, /* * just an approximation, should be ok. */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } +static inline s64 +cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + return cfqg->vdisktime - st->min_vdisktime; +} + +static void +__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_group *__cfqg; + s64 key = cfqg_key(st, cfqg); + int left = 1; + + while (*node != NULL) { + parent = *node; + __cfqg = rb_entry_cfqg(parent); + + if (key < cfqg_key(st, __cfqg)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &cfqg->rb_node; + + rb_link_node(&cfqg->rb_node, parent, node); + rb_insert_color(&cfqg->rb_node, &st->rb); +} + +static void +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *__cfqg; + struct rb_node *n; + + cfqg->nr_cfqq++; + if (cfqg->on_st) + return; + + /* + * Currently put the group at the end. Later implement something + * so that groups get lesser vtime based on their weights, so that + * if group does not loose all if it was not continously backlogged. + */ + n = rb_last(&st->rb); + if (n) { + __cfqg = rb_entry_cfqg(n); + cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + } else + cfqg->vdisktime = st->min_vdisktime; + + __cfq_group_service_tree_add(st, cfqg); + cfqg->on_st = true; + cfqd->nr_groups++; + st->total_weight += cfqg->weight; +} + +static void +cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + if (st->active == &cfqg->rb_node) + st->active = NULL; + + BUG_ON(cfqg->nr_cfqq < 1); + cfqg->nr_cfqq--; + + /* If there are other cfq queues under this group, don't delete it */ + if (cfqg->nr_cfqq) + return; + + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); + cfqg->on_st = false; + cfqd->nr_groups--; + st->total_weight -= cfqg->weight; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->saved_workload_slice = 0; + blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); +} + +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +{ + unsigned int slice_used; + + /* + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. + */ + if (!cfqq->slice_start || cfqq->slice_start == jiffies) { + /* + * Also charge the seek time incurred to the group, otherwise + * if there are mutiple queues in the group, each can dispatch + * a single request on seeky media and cause lots of seek time + * and group will never know it. + */ + slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), + 1); + } else { + slice_used = jiffies - cfqq->slice_start; + if (slice_used > cfqq->allocated_slice) + slice_used = cfqq->allocated_slice; + } + + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, + cfqq->nr_sectors); + return slice_used; +} + +static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, + struct cfq_queue *cfqq) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + unsigned int used_sl, charge_sl; + int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) + - cfqg->service_tree_idle.count; + + BUG_ON(nr_sync < 0); + used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + + if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge_sl = cfqq->allocated_slice; + + /* Can't update vdisktime while group is on service tree */ + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); + __cfq_group_service_tree_add(st, cfqg); + + /* This group is being expired. Save the context */ + if (time_after(cfqd->workload_expires, jiffies)) { + cfqg->saved_workload_slice = cfqd->workload_expires + - jiffies; + cfqg->saved_workload = cfqd->serving_type; + cfqg->saved_serving_prio = cfqd->serving_prio; + } else + cfqg->saved_workload_slice = 0; + + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, + st->min_vdisktime); + blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, + cfqq->nr_sectors); +} + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +void +cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + cfqg_of_blkg(blkg)->weight = weight; +} + +static struct cfq_group * +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + int i, j; + struct cfq_rb_root *st; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + + /* Do we need to take this reference */ + if (!blkiocg_css_tryget(blkcg)) + return NULL;; + + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg || !create) + goto done; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + goto done; + + cfqg->weight = blkcg->weight; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&cfqg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + MKDEV(major, minor)); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + +done: + blkiocg_css_put(blkcg); + return cfqg; +} + /* - * The cfqd->service_tree holds all pending cfq_queue's that have + * Search for the cfq group current task belongs to. If create = 1, then also + * create the cfq group if it does not exist. request_queue lock must be held. + */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + struct cgroup *cgroup; + struct cfq_group *cfqg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + if (!cfqg && create) + cfqg = &cfqd->root_group; + rcu_read_unlock(); + return cfqg; +} + +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfqq->cfqg = cfqg; + /* cfqq reference on cfqg */ + atomic_inc(&cfqq->cfqg->ref); +} + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_rb_root *st; + int i, j; + + BUG_ON(atomic_read(&cfqg->ref) <= 0); + if (!atomic_dec_and_test(&cfqg->ref)) + return; + for_each_cfqg_st(cfqg, i, j, st) + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + kfree(cfqg); +} + +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + cfq_put_cfqg(cfqg); +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) +{ + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu + * read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deltion + * path got to it first. + */ +void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct cfq_data *cfqd = key; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} +static inline void +cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { + cfqq->cfqg = cfqg; +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + +#endif /* GROUP_IOSCHED */ + +/* + * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that * we will service the queues. */ @@ -495,11 +1120,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; + struct cfq_rb_root *service_tree; int left; + int new_cfqq = 1; + int group_changed = 0; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD + && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { + /* Move this cfq to root group */ + cfq_log_cfqq(cfqd, cfqq, "moving to root group"); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfqq->orig_cfqg = cfqq->cfqg; + cfqq->cfqg = &cfqd->root_group; + atomic_inc(&cfqd->root_group.ref); + group_changed = 1; + } else if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { + /* cfqq is sequential now needs to go to its original group */ + BUG_ON(cfqq->cfqg != &cfqd->root_group); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_put_cfqg(cfqq->cfqg); + cfqq->cfqg = cfqq->orig_cfqg; + cfqq->orig_cfqg = NULL; + group_changed = 1; + cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); + } +#endif + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), + cfqq_type(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); + parent = rb_last(&service_tree->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -517,23 +1173,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(&cfqd->service_tree); + __cfqq = cfq_rb_first(service_tree); rb_key += __cfqq ? __cfqq->rb_key : jiffies; } if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + new_cfqq = 0; /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key) + if (rb_key == cfqq->rb_key && + cfqq->service_tree == service_tree) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; } left = 1; parent = NULL; - p = &cfqd->service_tree.rb.rb_node; + cfqq->service_tree = service_tree; + p = &service_tree->rb.rb_node; while (*p) { struct rb_node **n; @@ -541,35 +1201,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, __cfqq = rb_entry(parent, struct cfq_queue, rb_node); /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. + * sort by key, that represents service time. */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (time_before(rb_key, __cfqq->rb_key)) + if (time_before(rb_key, __cfqq->rb_key)) n = &(*p)->rb_left; - else + else { n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) left = 0; + } p = n; } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + service_tree->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &service_tree->rb); + service_tree->count++; + if ((add_front || !new_cfqq) && !group_changed) + return; + cfq_group_service_tree_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -671,13 +1324,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; + } if (cfqq->p_root) { rb_erase(&cfqq->p_node, cfqq->p_root); cfqq->p_root = NULL; } + cfq_group_service_tree_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -688,7 +1344,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -696,8 +1351,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -722,7 +1386,7 @@ static void cfq_add_rq_rb(struct request *rq) * check if this request is a better next-serve candidate */ prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); /* * adjust priority tree position, if ->next_rq changes @@ -829,6 +1493,7 @@ static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { + struct cfq_queue *cfqq = RQ_CFQQ(rq); /* * reposition in fifo if next is older than rq */ @@ -838,6 +1503,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, rq_set_fifo_time(rq, rq_fifo_time(next)); } + if (cfqq->next_rq == next) + cfqq->next_rq = rq; cfq_remove_request(next); } @@ -848,6 +1515,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; + /* Deny merge if bio and rq don't belong to same cfq group */ + if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) + return false; /* * Disallow merge of a sync bio into an async request. */ @@ -871,8 +1541,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -899,6 +1573,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, del_timer(&cfqd->idle_slice_timer); cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_wait_busy(cfqq); + cfq_clear_cfqq_wait_busy_done(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -908,11 +1584,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + cfq_group_served(cfqd, cfqq->cfqg, cfqq); + + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) + cfqd->grp_service_tree.active = NULL; + if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -933,10 +1617,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + struct cfq_rb_root *service_tree = + service_tree_for(cfqd->serving_group, cfqd->serving_prio, + cfqd->serving_type, cfqd); + + if (!cfqd->rq_queued) return NULL; - return cfq_rb_first(&cfqd->service_tree); + /* There is nothing to dispatch */ + if (!service_tree) + return NULL; + if (RB_EMPTY_ROOT(&service_tree->rb)) + return NULL; + return cfq_rb_first(service_tree); +} + +static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg; + struct cfq_queue *cfqq; + int i, j; + struct cfq_rb_root *st; + + if (!cfqd->rq_queued) + return NULL; + + cfqg = cfq_get_next_cfqg(cfqd); + if (!cfqg) + return NULL; + + for_each_cfqg_st(cfqg, i, j, st) + if ((cfqq = cfq_rb_first(st)) != NULL) + return cfqq; + return NULL; } /* @@ -945,14 +1658,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (!cfqq) { + if (!cfqq) cfqq = cfq_get_next_queue(cfqd); - if (cfqq && !cfq_cfqq_coop_preempt(cfqq)) - cfq_clear_cfqq_coop(cfqq); - } - - if (cfqq) - cfq_clear_cfqq_coop_preempt(cfqq); __cfq_set_active_queue(cfqd, cfqq); return cfqq; |