diff options
author | Jens Axboe <axboe@kernel.dk> | 2013-01-11 19:53:53 +0100 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2013-01-11 19:53:53 +0100 |
commit | ac9a19745196388ae5d828c0be7a1d6e472101f3 (patch) | |
tree | 49c47e1a07241653deb4a4b4e7a91626f586ad05 /block | |
parent | 422765c2638924da10ff363b5eed77924911bdc7 (diff) | |
parent | 43114018cb0b253fd03c4ff4d42bcdc43389ac1c (diff) |
Merge branch 'blkcg-cfq-hierarchy' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-3.9/core
Tejun writes:
Hello, Jens.
Please consider pulling from the following branch to receive cfq blkcg
hierarchy support. The branch is based on top of v3.8-rc2.
git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git blkcg-cfq-hierarchy
The patchset was reviewd in the following thread.
http://thread.gmane.org/gmane.linux.kernel.cgroups/5571
Diffstat (limited to 'block')
-rw-r--r-- | block/blk-cgroup.c | 277 | ||||
-rw-r--r-- | block/blk-cgroup.h | 68 | ||||
-rw-r--r-- | block/blk-sysfs.c | 9 | ||||
-rw-r--r-- | block/cfq-iosched.c | 627 |
4 files changed, 818 insertions, 163 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b8858fb0caf..87ea95d1f53 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -26,11 +26,32 @@ static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; +struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, + .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip + * subtree. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ + cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ + if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ + (p_blkg)->q, false))) + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->pd[i] = pd; pd->blkg = blkg; + pd->plid = i; /* invoke per-policy init */ - if (blkcg_policy_enabled(blkg->q, pol)) + if (pol->pd_init_fn) pol->pd_init_fn(blkg); } @@ -125,8 +147,19 @@ err_free: return NULL; } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; @@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, return blkg; /* - * Hint didn't match. Look up from the radix tree. Note that we - * may not be holding queue_lock and thus are not sure whether - * @blkg from blkg_tree has already been removed or not, so we - * can't update hint to the lookup result. Leave it to the caller. + * Hint didn't match. Look up from the radix tree. Note that the + * hint can only be updated under queue_lock as otherwise @blkg + * could have already been removed from blkg_tree. The caller is + * responsible for grabbing queue_lock if @update_hint. */ blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) + if (blkg && blkg->q == q) { + if (update_hint) { + lockdep_assert_held(q->queue_lock); + rcu_assign_pointer(blkcg->blkg_hint, blkg); + } return blkg; + } return NULL; } @@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) if (unlikely(blk_queue_bypass(q))) return NULL; - return __blkg_lookup(blkcg, q); + return __blkg_lookup(blkcg, q, false); } EXPORT_SYMBOL_GPL(blkg_lookup); @@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup); * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. */ -static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, - struct blkcg_gq *new_blkg) +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, + struct request_queue *q, + struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - int ret; + int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); - /* lookup and update hint on success, see __blkg_lookup() for details */ - blkg = __blkg_lookup(blkcg, q); - if (blkg) { - rcu_assign_pointer(blkcg->blkg_hint, blkg); - goto out_free; - } - /* blkg holds a reference to blkcg */ if (!css_tryget(&blkcg->css)) { - blkg = ERR_PTR(-EINVAL); - goto out_free; + ret = -EINVAL; + goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { - blkg = ERR_PTR(-ENOMEM); - goto out_put; + ret = -ENOMEM; + goto err_put_css; } } blkg = new_blkg; - /* insert */ + /* link parent and insert */ + if (blkcg_parent(blkcg)) { + blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + if (WARN_ON_ONCE(!blkg->parent)) { + blkg = ERR_PTR(-EINVAL); + goto err_put_css; + } + blkg_get(blkg->parent); + } + spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &q->blkg_list); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_online_fn) + pol->pd_online_fn(blkg); + } } + blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; - blkg = ERR_PTR(ret); -out_put: + /* @blkg failed fully initialized, use the usual release path */ + blkg_put(blkg); + return ERR_PTR(ret); + +err_put_css: css_put(&blkcg->css); -out_free: +err_free_blkg: blkg_free(new_blkg); - return blkg; + return ERR_PTR(ret); } +/** + * blkg_lookup_create - lookup blkg, try to create one if not there + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * create one. blkg creation is performed recursively from blkcg_root such + * that all non-root blkg's have access to the parent blkg. This function + * should be called under RCU read lock and @q->queue_lock. + * + * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). + */ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { + struct blkcg_gq *blkg; + + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + /* * This could be the first entry point of blkcg implementation and * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); - return __blkg_lookup_create(blkcg, q, NULL); + + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + return blkg; + + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. + */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent = blkcg_parent(blkcg); + + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } + + blkg = blkg_create(pos, q, NULL); + if (pos == blkcg || IS_ERR(blkg)) + return blkg; + } } EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + int i; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); @@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_offline_fn) + pol->pd_offline_fn(blkg); + } + blkg->online = false; + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); @@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head) void __blkg_release(struct blkcg_gq *blkg) { - /* release the extra blkcg reference this blkg has been holding */ + /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); /* * A group is freed in rcu manner. But having an rcu lock does not @@ -402,8 +504,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg) * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the - * policy data and @data. If @show_total is %true, the sum of the return - * values from @prfill is printed with "Total" label at the end. + * policy data and @data and the matching queue lock held. If @show_total + * is %true, the sum of the return values from @prfill is printed with + * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. @@ -418,11 +521,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, struct hlist_node *n; u64 total = 0; - spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(&blkcg->lock); + spin_unlock_irq(blkg->q->queue_lock); + } + rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); @@ -481,6 +587,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); /** * blkg_prfill_stat - prfill callback for blkg_stat @@ -514,6 +621,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); /** + * blkg_stat_recursive_sum - collect hierarchical blkg_stat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_stat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup *pos_cgrp; + u64 sum; + + lockdep_assert_held(pd->blkg->q->queue_lock); + + sum = blkg_stat_read((void *)pd + off); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_stat *stat = (void *)pos_pd + off; + + if (pos_blkg->online) + sum += blkg_stat_read(stat); + } + rcu_read_unlock(); + + return sum; +} +EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * Collect the blkg_rwstat specified by @off from @pd and all its online + * descendants and return the sum. The caller must be holding the queue + * lock for online tests. + */ +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkcg_policy *pol = blkcg_policy[pd->plid]; + struct blkcg_gq *pos_blkg; + struct cgroup *pos_cgrp; + struct blkg_rwstat sum; + int i; + + lockdep_assert_held(pd->blkg->q->queue_lock); + + sum = blkg_rwstat_read((void *)pd + off); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); + struct blkg_rwstat *rwstat = (void *)pos_pd + off; + struct blkg_rwstat tmp; + + if (!pos_blkg->online) + continue; + + tmp = blkg_rwstat_read(rwstat); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum.cnt[i] += tmp.cnt[i]; + } + rcu_read_unlock(); + + return sum; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); + +/** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy @@ -658,6 +841,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; + blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ done: spin_lock_init(&blkcg->lock); @@ -777,7 +961,7 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - struct blkcg_gq *blkg; + struct blkcg_gq *blkg, *new_blkg; struct blkg_policy_data *pd, *n; int cnt = 0, ret; bool preloaded; @@ -786,19 +970,27 @@ int blkcg_activate_policy(struct request_queue *q, return 0; /* preallocations for root blkg */ - blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!blkg) + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); blk_queue_bypass_start(q); - /* make sure the root blkg exists and count the existing blkgs */ + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code it. + */ spin_lock_irq(q->queue_lock); rcu_read_lock(); - blkg = __blkg_lookup_create(&blkcg_root, q, blkg); + blkg = __blkg_lookup(&blkcg_root, q, false); + if (blkg) + blkg_free(new_blkg); + else + blkg = blkg_create(&blkcg_root, q, new_blkg); rcu_read_unlock(); if (preloaded) @@ -846,6 +1038,7 @@ int blkcg_activate_policy(struct request_queue *q, blkg->pd[pol->plid] = pd; pd->blkg = blkg; + pd->plid = pol->plid; pol->pd_init_fn(blkg); spin_unlock(&blkg->blkcg->lock); @@ -892,6 +1085,8 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg); if (pol->pd_exit_fn) pol->pd_exit_fn(blkg); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 24597309e23..f2b292925cc 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -54,6 +54,7 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ + unsigned int cfq_leaf_weight; }; struct blkg_stat { @@ -80,8 +81,9 @@ struct blkg_rwstat { * beginning and pd_size can't be smaller than pd. */ struct blkg_policy_data { - /* the blkg this per-policy data belongs to */ + /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; + int plid; /* used during policy activation */ struct list_head alloc_node; @@ -94,17 +96,27 @@ struct blkcg_gq { struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + /* request allocation list for this blkcg-q pair */ struct request_list rl; + /* reference count */ int refcnt; + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; }; typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -117,6 +129,8 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; @@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, + int off); + struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; @@ -181,6 +199,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) } /** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + struct cgroup *pcg = blkcg->css.cgroup->parent; + + return pcg ? cgroup_to_blkcg(pcg) : NULL; +} + +/** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest @@ -387,6 +418,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat) } /** + * blkg_stat_merge - merge a blkg_stat into another + * @to: the destination blkg_stat + * @from: the source + * + * Add @from's count to @to. + */ +static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +{ + blkg_stat_add(to, blkg_stat_read(from)); +} + +/** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat * @rw: mask of REQ_{WRITE|SYNC} @@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) } /** - * blkg_rwstat_sum - read the total count of a blkg_rwstat + * blkg_rwstat_total - read the total count of a blkg_rwstat * @rwstat: blkg_rwstat to read * * Return the total count of @rwstat regardless of the IO direction. This * function can be called without synchronization and takes care of u64 * atomicity. */ -static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); @@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); } +/** + * blkg_rwstat_merge - merge a blkg_rwstat into another + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's counts to @to. + */ +static inline void blkg_rwstat_merge(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + struct blkg_rwstat v = blkg_rwstat_read(from); + int i; + + u64_stats_update_begin(&to->syncp); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + to->cnt[i] += v.cnt[i]; + u64_stats_update_end(&to->syncp); +} + #else /* CONFIG_BLK_CGROUP */ struct cgroup; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 788147797a7..6206a934eb8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, return res; } +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + struct request_queue *q = container_of(rcu_head, struct request_queue, + rcu_head); + kmem_cache_free(blk_requestq_cachep, q); +} + /** * blk_release_queue: - release a &struct request_queue when it is no longer needed * @kobj: the kobj belonging to the request queue to be released @@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj) bdi_destroy(&q->backing_dev_info); ida_simple_remove(&blk_queue_ida, q->id); - kmem_cache_free(blk_requestq_cachep, q); + call_rcu(&q->rcu_head, blk_free_queue_rcu); } static const struct sysfs_ops queue_sysfs_ops = { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e62e9205b80..b66365b6ba7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -85,7 +85,6 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; - unsigned total_weight; u64 min_vdisktime; struct cfq_ttime ttime; }; @@ -155,7 +154,7 @@ struct cfq_queue { * First index in the service_trees. * IDLE is handled separately, so it has negative index */ -enum wl_prio_t { +enum wl_class_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, @@ -223,10 +222,45 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; + + /* + * The number of active cfqgs and sum of their weights under this + * cfqg. This covers this cfqg's leaf_weight and all children's + * weights, but does not cover weights of further descendants. + * + * If a cfqg is on the service tree, it's active. An active cfqg + * also activates its parent and contributes to the children_weight + * of the parent. + */ + int nr_active; + unsigned int children_weight; + + /* + * vfraction is the fraction of vdisktime that the tasks in this + * cfqg are entitled to. This is determined by compounding the + * ratios walking up from this cfqg to the root. + * + * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all + * vfractions on a service tree is approximately 1. The sum may + * deviate a bit due to rounding errors and fluctuations caused by + * cfqgs entering and leaving the service tree. + */ + unsigned int vfraction; + + /* + * There are two weights - (internal) weight is the weight of this + * cfqg against the sibling cfqgs. leaf_weight is the wight of + * this cfqg against the child cfqgs. For the root cfqg, both + * weights are kept in sync for backward compatibility. + */ unsigned int weight; unsigned int new_weight; unsigned int dev_weight; + unsigned int leaf_weight; + unsigned int new_leaf_weight; + unsigned int dev_leaf_weight; + /* number of cfqq currently on this group */ int nr_cfqq; @@ -248,14 +282,15 @@ struct cfq_group { struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; - unsigned long saved_workload_slice; - enum wl_type_t saved_workload; - enum wl_prio_t saved_serving_prio; + unsigned long saved_wl_slice; + enum wl_type_t saved_wl_type; + enum wl_class_t saved_wl_class; /* number of requests that are on the dispatch list or inside driver */ int dispatched; struct cfq_ttime ttime; - struct cfqg_stats stats; + struct cfqg_stats stats; /* stats for this cfqg */ + struct cfqg_stats dead_stats; /* stats pushed from dead children */ }; struct cfq_io_cq { @@ -280,8 +315,8 @@ struct cfq_data { /* * The priority currently being served */ - enum wl_prio_t serving_prio; - enum wl_type_t serving_type; + enum wl_class_t serving_wl_class; + enum wl_type_t serving_wl_type; unsigned long workload_expires; struct cfq_group *serving_group; @@ -353,17 +388,17 @@ struct cfq_data { static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); -static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, - enum wl_prio_t prio, +static struct cfq_rb_root *st_for(struct cfq_group *cfqg, + enum wl_class_t class, enum wl_type_t type) { if (!cfqg) return NULL; - if (prio == IDLE_WORKLOAD) + if (class == IDLE_WORKLOAD) return &cfqg->service_tree_idle; - return &cfqg->service_trees[prio][type]; + return &cfqg->service_trees[class][type]; } enum cfqq_state_flags { @@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { struct cfqg_stats *stats = &cfqg->stats; - if (blkg_rwstat_sum(&stats->queued)) + if (blkg_rwstat_total(&stats->queued)) return; /* @@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) struct cfqg_stats *stats = &cfqg->stats; blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_sum(&stats->queued)); + blkg_rwstat_total(&stats->queued)); blkg_stat_add(&stats->avg_queue_size_samples, 1); cfqg_stats_update_group_wait_time(stats); } @@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) +{ + struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; + + return pblkg ? blkg_to_cfqg(pblkg) : NULL; +} + static inline void cfqg_get(struct cfq_group *cfqg) { return blkg_get(cfqg_to_blkg(cfqg)); @@ -586,8 +628,9 @@ static inline void cfqg_put(struct cfq_group *cfqg) char __pbuf[128]; \ \ blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ __pbuf, ##args); \ } while (0) @@ -646,11 +689,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, io_start_time - start_time); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +/* @stats = 0 */ +static void cfqg_stats_reset(struct cfqg_stats *stats) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfqg_stats *stats = &cfqg->stats; - /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->service_bytes); blkg_rwstat_reset(&stats->serviced); @@ -669,13 +710,58 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg) #endif } +/* @to += @from */ +static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); + blkg_rwstat_merge(&to->serviced, &from->serviced); + blkg_rwstat_merge(&to->merged, &from->merged); + blkg_rwstat_merge(&to->service_time, &from->service_time); + blkg_rwstat_merge(&to->wait_time, &from->wait_time); + blkg_stat_merge(&from->time, &from->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_merge(&to->dequeue, &from->dequeue); + blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); + blkg_stat_merge(&to->idle_time, &from->idle_time); + blkg_stat_merge(&to->empty_time, &from->empty_time); +#endif +} + +/* + * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * recursive stats can still account for the amount used by this cfqg after + * it's gone. + */ +static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) +{ + struct cfq_group *parent = cfqg_parent(cfqg); + + lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); + cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_reset(&cfqg->stats); + cfqg_stats_reset(&cfqg->dead_stats); +} + #else /* CONFIG_CFQ_GROUP_IOSCHED */ +static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } static inline void cfqg_get(struct cfq_group *cfqg) { } static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) + blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ + ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, @@ -732,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd) return false; } -static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) return IDLE_WORKLOAD; @@ -751,23 +837,23 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) return SYNC_WORKLOAD; } -static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, +static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, struct cfq_data *cfqd, struct cfq_group *cfqg) { - if (wl == IDLE_WORKLOAD) + if (wl_class == IDLE_WORKLOAD) return cfqg->service_tree_idle.count; - return cfqg->service_trees[wl][ASYNC_WORKLOAD].count - + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count - + cfqg->service_trees[wl][SYNC_WORKLOAD].count; + return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; } static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, struct cfq_group *cfqg) { - return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count - + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -847,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +/** + * cfqg_scale_charge - scale disk time charge according to cfqg weight + * @charge: disk time being charged + * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT + * + * Scale @charge according to @vfraction, which is in range (0, 1]. The + * scaling is inversely proportional. + * + * scaled = charge / vfraction + * + * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. + */ +static inline u64 cfqg_scale_charge(unsigned long charge, + unsigned int vfraction) { - u64 d = delta << CFQ_SERVICE_SHIFT; + u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ - d = d * CFQ_WEIGHT_DEFAULT; - do_div(d, cfqg->weight); - return d; + /* charge / vfraction */ + c <<= CFQ_SERVICE_SHIFT; + do_div(c, vfraction); + return c; } static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) @@ -909,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, static inline unsigned cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) { - struct cfq_rb_root *st = &cfqd->grp_service_tree; - - return cfqd->cfq_target_latency * cfqg->weight / st->total_weight; + return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; } static inline unsigned @@ -1178,20 +1276,61 @@ static void cfq_update_group_weight(struct cfq_group *cfqg) { BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } + + if (cfqg->new_leaf_weight) { + cfqg->leaf_weight = cfqg->new_leaf_weight; + cfqg->new_leaf_weight = 0; + } } static void cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) { + unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ + struct cfq_group *pos = cfqg; + struct cfq_group *parent; + bool propagate; + + /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); cfq_update_group_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); - st->total_weight += cfqg->weight; + + /* + * Activate @cfqg and calculate the portion of vfraction @cfqg is + * entitled to. vfraction is calculated by walking the tree + * towards the root calculating the fraction it has at each level. + * The compounded ratio is how much vfraction @cfqg owns. + * + * Start with the proportion tasks in this cfqg has against active + * children cfqgs - its leaf_weight against children_weight. + */ + propagate = !pos->nr_active++; + pos->children_weight += pos->leaf_weight; + vfr = vfr * pos->leaf_weight / pos->children_weight; + + /* + * Compound ->weight walking up the tree. Both activation and + * vfraction calculation are done in the same loop. Propagation + * stops once an already activated node is met. vfraction + * calculation should always continue to the root. + */ + while ((parent = cfqg_parent(pos))) { + if (propagate) { + propagate = !parent->nr_active++; + parent->children_weight += pos->weight; + } + vfr = vfr * pos->weight / parent->children_weight; + pos = parent; + } + + cfqg->vfraction = max_t(unsigned, vfr, 1); } static void @@ -1222,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) static void cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) { - st->total_weight -= cfqg->w |