author		Linus Torvalds <torvalds@linux-foundation.org>	2013-02-28 12:52:24 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-28 12:52:24 -0800
commit		ee89f81252179dcbf6cd65bd48299f5e52292d88
tree		805846cd12821f84cfe619d44c9e3e36e0b0f9e6 /block
parent		21f3b24da9328415792efc780f50b9f434c12465
parent		de33127d8d3f1d570aad8c2223cd81b206636bc1
Merge branch 'for-3.9/core' of git://git.kernel.dk/linux-block
Pull block IO core bits from Jens Axboe:
"Below are the core block IO bits for 3.9. It was delayed a few days
since my workstation kept crashing every 2-8h after pulling it into
current -git, but turns out it is a bug in the new pstate code (divide
by zero, will report separately). In any case, it contains:
- The big cfq/blkcg update from Tejun and Vivek.
- Additional block and writeback tracepoints from Tejun.
- Improvement of the "should sort" (based on queues) logic in the plug
flushing.
- _io() variants of the wait_for_completion() interface, using
io_schedule() instead of schedule() to contribute to io wait
properly.
- Various little fixes.
You'll get two trivial merge conflicts, which should be easy enough to
fix up"
Fix up the trivial conflicts due to hlist traversal cleanups (commit
b67bfe0d42ca: "hlist: drop the node parameter from iterators").
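The iterator change behind those conflicts (commit b67bfe0d42ca) dropped the separate struct hlist_node cursor, so every traversal loses one argument. A before/after sketch with an illustrative struct:

    #include <linux/list.h>

    struct item {
    	int key;
    	struct hlist_node node;
    };

    HLIST_HEAD(head);
    struct item *pos;
    struct hlist_node *n;

    /* before b67bfe0d42ca: a separate node cursor was required */
    hlist_for_each_entry(pos, n, head.first, node)
    	pos->key++;

    /* after: the entry pointer itself is the cursor */
    hlist_for_each_entry(pos, &head, node)
    	pos->key++;

The merge resolution simply drops the now-unused cursor argument at the conflicting call sites.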
* 'for-3.9/core' of git://git.kernel.dk/linux-block: (39 commits)
block: remove redundant check to bd_openers()
block: use i_size_write() in bd_set_size()
cfq: fix lock imbalance with failed allocations
drivers/block/swim3.c: fix null pointer dereference
block: don't select PERCPU_RWSEM
block: account iowait time when waiting for completion of IO request
sched: add wait_for_completion_io[_timeout]
writeback: add more tracepoints
block: add block_{touch|dirty}_buffer tracepoint
buffer: make touch_buffer() an exported function
block: add @req to bio_{front|back}_merge tracepoints
block: add missing block_bio_complete() tracepoint
block: Remove should_sort judgement when flush blk_plug
block,elevator: use new hashtable implementation
cfq-iosched: add hierarchical cfq_group statistics
cfq-iosched: collect stats from dead cfqgs
cfq-iosched: separate out cfqg_stats_reset() from cfq_pd_reset_stats()
blkcg: make blkcg_print_blkgs() grab q locks instead of blkcg lock
block: RCU free request_queue
blkcg: implement blkg_[rw]stat_recursive_sum() and blkg_[rw]stat_merge()
...
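To make the semantics of the new recursive stat helpers concrete before the diff: blkg_stat_recursive_sum() returns a group's own counter plus the counters of all online descendants, visiting the subtree in pre-order. A standalone model in plain C (types and tree layout are illustrative; the kernel walks blkgs with blkg_for_each_descendant_pre() under RCU and requires the queue lock for the online checks):

    #include <stdbool.h>
    #include <stdint.h>

    struct grp {
    	uint64_t stat;		/* this group's own counter */
    	bool online;
    	struct grp *child;	/* first child */
    	struct grp *sibling;	/* next sibling */
    };

    /* descendants contribute their counters only while online */
    static uint64_t walk(const struct grp *g)
    {
    	uint64_t sum = g->online ? g->stat : 0;

    	for (const struct grp *c = g->child; c; c = c->sibling)
    		sum += walk(c);
    	return sum;
    }

    static uint64_t recursive_sum(const struct grp *g)
    {
    	uint64_t sum = g->stat;	/* own counter, unconditionally */

    	for (const struct grp *c = g->child; c; c = c->sibling)
    		sum += walk(c);
    	return sum;
    }

blkg_rwstat_recursive_sum() does the same per IO direction; blkg_[rw]stat_merge() is the primitive that folds one counter into another.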
Diffstat (limited to 'block')
-rw-r--r--	block/Kconfig		|   1
-rw-r--r--	block/blk-cgroup.c	| 277
-rw-r--r--	block/blk-cgroup.h	|  68
-rw-r--r--	block/blk-core.c	|  18
-rw-r--r--	block/blk-exec.c	|   4
-rw-r--r--	block/blk-flush.c	|   2
-rw-r--r--	block/blk-lib.c		|   6
-rw-r--r--	block/blk-sysfs.c	|   9
-rw-r--r--	block/blk.h		|   2
-rw-r--r--	block/cfq-iosched.c	| 629
-rw-r--r--	block/elevator.c	|  23
11 files changed, 834 insertions(+), 205 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 4a85ccf8d4c..a7e40a7c821 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,7 +4,6 @@ menuconfig BLOCK
 	bool "Enable the block layer" if EXPERT
 	default y
-	select PERCPU_RWSEM
 	help
 	  Provide block layer support for the kernel.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8bdebb6781e..b2b9837f9dd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -26,11 +26,32 @@ static DEFINE_MUTEX(blkcg_pol_mutex);
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
+struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
+			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+				      struct request_queue *q, bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
+ * subtree.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
+	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp),	\
+					      (p_blkg)->q, false)))
+
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
+		pd->plid = i;
 
 		/* invoke per-policy init */
-		if (blkcg_policy_enabled(blkg->q, pol))
+		if (pol->pd_init_fn)
 			pol->pd_init_fn(blkg);
 	}
@@ -125,8 +147,19 @@ err_free:
 	return NULL;
 }
 
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state.  If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
-				      struct request_queue *q)
+				      struct request_queue *q, bool update_hint)
 {
 	struct blkcg_gq *blkg;
 
@@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 		return blkg;
 
 	/*
-	 * Hint didn't match.  Look up from the radix tree.  Note that we
-	 * may not be holding queue_lock and thus are not sure whether
-	 * @blkg from blkg_tree has already been removed or not, so we
-	 * can't update hint to the lookup result.  Leave it to the caller.
+	 * Hint didn't match.  Look up from the radix tree.  Note that the
+	 * hint can only be updated under queue_lock as otherwise @blkg
+	 * could have already been removed from blkg_tree.  The caller is
+	 * responsible for grabbing queue_lock if @update_hint.
	 */
 	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
-	if (blkg && blkg->q == q)
+	if (blkg && blkg->q == q) {
+		if (update_hint) {
+			lockdep_assert_held(q->queue_lock);
+			rcu_assign_pointer(blkcg->blkg_hint, blkg);
+		}
 		return blkg;
+	}
 	return NULL;
 }
@@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 	if (unlikely(blk_queue_bypass(q)))
 		return NULL;
-	return __blkg_lookup(blkcg, q);
+	return __blkg_lookup(blkcg, q, false);
 }
 EXPORT_SYMBOL_GPL(blkg_lookup);
 
@@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
  * If @new_blkg is %NULL, this function tries to allocate a new one as
  * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
  */
-static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
-					     struct request_queue *q,
-					     struct blkcg_gq *new_blkg)
+static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
+				    struct request_queue *q,
+				    struct blkcg_gq *new_blkg)
 {
 	struct blkcg_gq *blkg;
-	int ret;
+	int i, ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
-	/* lookup and update hint on success, see __blkg_lookup() for details */
-	blkg = __blkg_lookup(blkcg, q);
-	if (blkg) {
-		rcu_assign_pointer(blkcg->blkg_hint, blkg);
-		goto out_free;
-	}
-
 	/* blkg holds a reference to blkcg */
 	if (!css_tryget(&blkcg->css)) {
-		blkg = ERR_PTR(-EINVAL);
-		goto out_free;
+		ret = -EINVAL;
+		goto err_free_blkg;
 	}
 
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
 		if (unlikely(!new_blkg)) {
-			blkg = ERR_PTR(-ENOMEM);
-			goto out_put;
+			ret = -ENOMEM;
+			goto err_put_css;
 		}
 	}
 	blkg = new_blkg;
 
-	/* insert */
+	/* link parent and insert */
+	if (blkcg_parent(blkcg)) {
+		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
+		if (WARN_ON_ONCE(!blkg->parent)) {
+			blkg = ERR_PTR(-EINVAL);
+			goto err_put_css;
+		}
+		blkg_get(blkg->parent);
+	}
+
 	spin_lock(&blkcg->lock);
 	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 	if (likely(!ret)) {
 		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 		list_add(&blkg->q_node, &q->blkg_list);
+
+		for (i = 0; i < BLKCG_MAX_POLS; i++) {
+			struct blkcg_policy *pol = blkcg_policy[i];
+
+			if (blkg->pd[i] && pol->pd_online_fn)
+				pol->pd_online_fn(blkg);
+		}
 	}
+	blkg->online = true;
 	spin_unlock(&blkcg->lock);
 
 	if (!ret)
 		return blkg;
 
-	blkg = ERR_PTR(ret);
-out_put:
+	/* @blkg failed fully initialized, use the usual release path */
+	blkg_put(blkg);
+	return ERR_PTR(ret);
+
+err_put_css:
 	css_put(&blkcg->css);
-out_free:
+err_free_blkg:
 	blkg_free(new_blkg);
-	return blkg;
+	return ERR_PTR(ret);
 }
 
+/**
+ * blkg_lookup_create - lookup blkg, try to create one if not there
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
+ * create one.  blkg creation is performed recursively from blkcg_root such
+ * that all non-root blkg's have access to the parent blkg.  This function
+ * should be called under RCU read lock and @q->queue_lock.
+ *
+ * Returns pointer to the looked up or created blkg on success, ERR_PTR()
+ * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
+ * dead and bypassing, returns ERR_PTR(-EBUSY).
+ */
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 				    struct request_queue *q)
 {
+	struct blkcg_gq *blkg;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
+
 	/*
 	 * This could be the first entry point of blkcg implementation and
 	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
 	if (unlikely(blk_queue_bypass(q)))
 		return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
-	return __blkg_lookup_create(blkcg, q, NULL);
+
+	blkg = __blkg_lookup(blkcg, q, true);
+	if (blkg)
+		return blkg;
+
+	/*
+	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
+	 * non-root blkgs have access to their parents.
+	 */
+	while (true) {
+		struct blkcg *pos = blkcg;
+		struct blkcg *parent = blkcg_parent(blkcg);
+
+		while (parent && !__blkg_lookup(parent, q, false)) {
+			pos = parent;
+			parent = blkcg_parent(parent);
+		}
+
+		blkg = blkg_create(pos, q, NULL);
+		if (pos == blkcg || IS_ERR(blkg))
+			return blkg;
+	}
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
+	int i;
 
 	lockdep_assert_held(blkg->q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
@@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && pol->pd_offline_fn)
+			pol->pd_offline_fn(blkg);
+	}
+	blkg->online = false;
+
 	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
@@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head)
 
 void __blkg_release(struct blkcg_gq *blkg)
 {
-	/* release the extra blkcg reference this blkg has been holding */
+	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
+	if (blkg->parent)
+		blkg_put(blkg->parent);
 
 	/*
 	 * A group is freed in rcu manner.  But having an rcu lock does not
@@ -401,8 +503,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg)
  *
  * This function invokes @prfill on each blkg of @blkcg if pd for the
  * policy specified by @pol exists.  @prfill is invoked with @sf, the
- * policy data and @data.  If @show_total is %true, the sum of the return
- * values from @prfill is printed with "Total" label at the end.
+ * policy data and @data and the matching queue lock held.  If @show_total
+ * is %true, the sum of the return values from @prfill is printed with
+ * "Total" label at the end.
  *
  * This is to be used to construct print functions for
  * cftype->read_seq_string method.
@@ -416,11 +519,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 	struct blkcg_gq *blkg;
 	u64 total = 0;
 
-	spin_lock_irq(&blkcg->lock);
-	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node)
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+		spin_lock_irq(blkg->q->queue_lock);
 		if (blkcg_policy_enabled(blkg->q, pol))
 			total += prfill(sf, blkg->pd[pol->plid], data);
-	spin_unlock_irq(&blkcg->lock);
+		spin_unlock_irq(blkg->q->queue_lock);
+	}
+	rcu_read_unlock();
 
 	if (show_total)
 		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
@@ -479,6 +585,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 	return v;
 }
+EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
 
 /**
  * blkg_prfill_stat - prfill callback for blkg_stat
@@ -512,6 +619,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
 /**
+ * blkg_stat_recursive_sum - collect hierarchical blkg_stat
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_stat in @pd
+ *
+ * Collect the blkg_stat specified by @off from @pd and all its online
+ * descendants and return the sum.  The caller must be holding the queue
+ * lock for online tests.
+ */
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	struct blkcg_policy *pol = blkcg_policy[pd->plid];
+	struct blkcg_gq *pos_blkg;
+	struct cgroup *pos_cgrp;
+	u64 sum;
+
+	lockdep_assert_held(pd->blkg->q->queue_lock);
+
+	sum = blkg_stat_read((void *)pd + off);
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
+		struct blkg_stat *stat = (void *)pos_pd + off;
+
+		if (pos_blkg->online)
+			sum += blkg_stat_read(stat);
+	}
+	rcu_read_unlock();
+
+	return sum;
+}
+EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
+
+/**
+ * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_stat in @pd
+ *
+ * Collect the blkg_rwstat specified by @off from @pd and all its online
+ * descendants and return the sum.  The caller must be holding the queue
+ * lock for online tests.
+ */
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off)
+{
+	struct blkcg_policy *pol = blkcg_policy[pd->plid];
+	struct blkcg_gq *pos_blkg;
+	struct cgroup *pos_cgrp;
+	struct blkg_rwstat sum;
+	int i;
+
+	lockdep_assert_held(pd->blkg->q->queue_lock);
+
+	sum = blkg_rwstat_read((void *)pd + off);
+
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
+		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
+		struct blkg_rwstat tmp;
+
+		if (!pos_blkg->online)
+			continue;
+
+		tmp = blkg_rwstat_read(rwstat);
+
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			sum.cnt[i] += tmp.cnt[i];
+	}
+	rcu_read_unlock();
+
+	return sum;
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
+
+/**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
  * @pol: target policy
@@ -656,6 +839,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
 		return ERR_PTR(-ENOMEM);
 
 	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
+	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
 	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
@@ -775,7 +959,7 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *blkg, *new_blkg;
 	struct blkg_policy_data *pd, *n;
 	int cnt = 0, ret;
 	bool preloaded;
@@ -784,19 +968,27 @@ int blkcg_activate_policy(struct request_queue *q,
 		return 0;
 
 	/* preallocations for root blkg */
-	blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-	if (!blkg)
+	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+	if (!new_blkg)
 		return -ENOMEM;
 
 	preloaded = !radix_tree_preload(GFP_KERNEL);
 
 	blk_queue_bypass_start(q);
 
-	/* make sure the root blkg exists and count the existing blkgs */
+	/*
+	 * Make sure the root blkg exists and count the existing blkgs.  As
+	 * @q is bypassing at this point, blkg_lookup_create() can't be
+	 * used.  Open code it.
+	 */
 	spin_lock_irq(q->queue_lock);
 
 	rcu_read_lock();
-	blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
+	blkg = __blkg_lookup(&blkcg_root, q, false);
+	if (blkg)
+		blkg_free(new_blkg);
+	else
+		blkg = blkg_create(&blkcg_root, q, new_blkg);
 	rcu_read_unlock();
 
 	if (preloaded)
@@ -844,6 +1036,7 @@ int blkcg_activate_policy(struct request_queue *q,
 
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
+		pd->plid = pol->plid;
 		pol->pd_init_fn(blkg);
 		spin_unlock(&blkg->blkcg->lock);
@@ -890,6 +1083,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		/* grab blkcg lock too while removing @pd from @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
+		if (pol->pd_offline_fn)
+			pol->pd_offline_fn(blkg);
+
 		if (pol->pd_exit_fn)
 			pol->pd_exit_fn(blkg);
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 24597309e23..f2b292925cc 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -54,6 +54,7 @@ struct blkcg {
 
 	/* TODO: per-policy storage in blkcg */
 	unsigned int		cfq_weight;	/* belongs to cfq */
+	unsigned int		cfq_leaf_weight;
 };
 
 struct blkg_stat {
@@ -80,8 +81,9 @@ struct blkg_rwstat {
  * beginning and pd_size can't be smaller than pd.
 */
 struct blkg_policy_data {
-	/* the blkg this per-policy data belongs to */
+	/* the blkg and policy id this per-policy data belongs to */
 	struct blkcg_gq		*blkg;
+	int			plid;
 
 	/* used during policy activation */
 	struct list_head	alloc_node;
@@ -94,17 +96,27 @@ struct blkcg_gq {
 	struct list_head	q_node;
 	struct hlist_node	blkcg_node;
 	struct blkcg		*blkcg;
+
+	/* all non-root blkcg_gq's are guaranteed to have access to parent */
+	struct blkcg_gq		*parent;
+
 	/* request allocation list for this blkcg-q pair */
 	struct request_list	rl;
+
 	/* reference count */
 	int			refcnt;
 
+	/* is this blkg online? protected by both blkcg and q locks */
+	bool			online;
+
 	struct blkg_policy_data	*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head		rcu_head;
 };
 
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
@@ -117,6 +129,8 @@ struct blkcg_policy {
 	/* operations */
 	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_online_pd_fn		*pd_online_fn;
+	blkcg_pol_offline_pd_fn		*pd_offline_fn;
 	blkcg_pol_exit_pd_fn		*pd_exit_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
@@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off);
+
 struct blkg_conf_ctx {
 	struct gendisk		*disk;
 	struct blkcg_gq		*blkg;
@@ -181,6 +199,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 }
 
 /**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg.  Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+	struct cgroup *pcg = blkcg->css.cgroup->parent;
+
+	return pcg ? cgroup_to_blkcg(pcg) : NULL;
+}
+
+/**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
  * @pol: policy of interest
@@ -387,6 +418,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat)
 }
 
 /**
+ * blkg_stat_merge - merge a blkg_stat into another
+ * @to: the destination blkg_stat
+ * @from: the source
+ *
+ * Add @from's count to @to.
+ */
+static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+{
+	blkg_stat_add(to, blkg_stat_read(from));
+}
+
+/**
  * blkg_rwstat_add - add a value to a blkg_rwstat
  * @rwstat: target blkg_rwstat
  * @rw: mask of REQ_{WRITE|SYNC}
@@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 }
 
 /**
- * blkg_rwstat_sum - read the total count of a blkg_rwstat
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
  * Return the total count of @rwstat regardless of the IO direction.  This
 * function can be called without synchronization and takes care of u64
 * atomicity.
 */
-static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
 	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
@@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
 }
 
+/**
+ * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's counts to @to.
+ */
+static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+				     struct blkg_rwstat *from)
+{
+	struct blkg_rwstat v = blkg_rwstat_read(from);
+	int i;
+
+	u64_stats_update_begin(&to->syncp);
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		to->cnt[i] += v.cnt[i];
+	u64_stats_update_end(&to->syncp);
+}
+
 #else	/* CONFIG_BLK_CGROUP */
 
 struct cgroup;
diff --git a/block/blk-core.c b/block/blk-core.c
index 277134cb5d3..074b758efc4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,7 +39,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
 DEFINE_IDA(blk_queue_ida);
@@ -1348,7 +1347,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	if (!ll_back_merge_fn(q, req, bio))
 		return false;
 
-	trace_block_bio_backmerge(q, bio);
+	trace_block_bio_backmerge(q, req, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -1370,7 +1369,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
 	if (!ll_front_merge_fn(q, req, bio))
 		return false;
 
-	trace_block_bio_frontmerge(q, bio);
+	trace_block_bio_frontmerge(q, req, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -1553,13 +1552,6 @@ get_rq:
 	if (list_empty(&plug->list))
 		trace_block_plug(q);
 	else {
-		if (!plug->should_sort) {
-			struct request *__rq;
-
-			__rq = list_entry_rq(plug->list.prev);
-			if (__rq->q != q)
-				plug->should_sort = 1;
-		}
 		if (request_count >= BLK_MAX_REQUEST_COUNT) {
 			blk_flush_plug_list(plug, false);
 			trace_block_plug(q);
@@ -2890,7 +2882,6 @@ void blk_start_plug(struct blk_plug *plug)
 	plug->magic = PLUG_MAGIC;
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->cb_list);
-	plug->should_sort = 0;
 
 	/*
 	 * If this is a nested plug, don't actually assign it.  It will be
@@ -2992,10 +2983,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 	list_splice_init(&plug->list, &list);
 
-	if (plug->should_sort) {
-		list_sort(NULL, &list, plug_rq_cmp);
-		plug->should_sort = 0;
-	}
+	list_sort(NULL, &list, plug_rq_cmp);
 
 	q = NULL;
 	depth = 0;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index c88202f973d..e7062139612 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -121,9 +121,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 	/* Prevent hang_check timer from firing at us during very long I/O */
 	hang_check = sysctl_hung_task_timeout_secs;
 	if (hang_check)
-		while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
+		while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
 	else
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (rq->errors)
 		err = -EIO;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 720ad607ff9..db8f1b50785 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -436,7 +436,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 
 	bio_get(bio);
 	submit_bio(WRITE_FLUSH, bio);
-	wait_for_completion(&wait);
+	wait_for_completion_io(&wait);
 
 	/*
 	 * The driver must store the error location in ->bi_sector, if
diff --git a/block/blk-lib.c b/block/blk-lib.c
index b3a1f2b70b3..d6f50d57256 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -126,7 +126,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		ret = -EIO;
@@ -200,7 +200,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		ret = -ENOTSUPP;
@@ -262,7 +262,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		/* One of bios in the batch was completed with error.*/
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 788147797a7..6206a934eb8 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	return res;
 }
 
+static void blk_free_queue_rcu(struct rcu_head *rcu_head)
+{
+	struct request_queue *q = container_of(rcu_head, struct request_queue,
+					       rcu_head);
+	kmem_cache_free(blk_requestq_cachep, q);
+}
+
 /**
  * blk_release_queue: - release a &struct request_queue when it is no longer needed
  * @kobj:    the kobj belonging to the request queue to be released
@@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj)
 	bdi_destroy(&q->backing_dev_info);
 
 	ida_simple_remove(&blk_queue_ida, q->id);
-	kmem_cache_free(blk_requestq_cachep, q);
+	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
 static const struct sysfs_ops queue_sysfs_ops = {
diff --git a/block/blk.h b/block/blk.h
index 47fdfdd4152..e837b8f619b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -61,7 +61,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
 /*
  * Internal elevator interface
  */
-#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
+#define ELV_ON_HASH(rq)		hash_hashed(&(rq)->hash)
 
 void blk_insert_flush(struct request *rq);
 void blk_abort_flushes(struct request_queue *q);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ec52807cdd0..4f0ade74cfd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -85,7 +85,6 @@ struct cfq_rb_root {
 	struct rb_root rb;
 	struct rb_node *left;
 	unsigned count;
-	unsigned total_weight;
 	u64 min_vdisktime;
 	struct cfq_ttime ttime;
 };
@@ -155,7 +154,7 @@ struct cfq_queue {
  * First index in the service_trees.
  * IDLE is handled separately, so it has negative index
  */
-enum wl_prio_t {
+enum wl_class_t {
 	BE_WORKLOAD = 0,
 	RT_WORKLOAD = 1,
 	IDLE_WORKLOAD = 2,
@@ -223,10 +222,45 @@ struct cfq_group {
 	/* group service_tree key */
 	u64 vdisktime;
+
+	/*
+	 * The number of active cfqgs and sum of their weights under this
+	 * cfqg.  This covers this cfqg's leaf_weight and all children's
+	 * weights, but does not cover weights of further descendants.
+	 *
+	 * If a cfqg is on the service tree, it's active.  An active cfqg
+	 * also activates its parent and contributes to the children_weight
+	 * of the parent.
+	 */
+	int nr_active;
+	unsigned int children_weight;
+
+	/*
+	 * vfraction is the fraction of vdisktime that the tasks in this
+	 * cfqg are entitled to.  This is determined by compounding the
+	 * ratios walking up from this cfqg to the root.
+	 *
+	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
+	 * vfractions on a service tree is approximately 1.  The sum may
+	 * deviate a bit due to rounding errors and fluctuations caused by
+	 * cfqgs entering and leaving the service tree.
+	 */
+	unsigned int vfraction;
+
+	/*
+	 * There are two weights - (internal) weight is the weight of this
+	 * cfqg against the sibling cfqgs.  leaf_weight is the weight of
+	 * this cfqg against the child cfqgs.  For the root cfqg, both
+	 * weights are kept in sync for backward compatibility.
+	 */
 	unsigned int weight;
 	unsigned int new_weight;
 	unsigned int dev_weight;
+	unsigned int leaf_weight;
+	unsigned int new_leaf_weight;
+	unsigned int dev_leaf_weight;
+
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -248,14 +282,15 @@ struct cfq_group {
 	struct cfq_rb_root service_trees[2][3];
 	struct cfq_rb_root service_tree_idle;
 
-	unsigned long saved_workload_slice;
-	enum wl_type_t saved_workload;
-	enum wl_prio_t saved_serving_prio;
+	unsigned long saved_wl_slice;
+	enum wl_type_t saved_wl_type;
+	enum wl_class_t saved_wl_class;
 
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	struct cfq_ttime ttime;
-	struct cfqg_stats stats;
+	struct cfqg_stats stats;	/* stats for this cfqg */
+	struct cfqg_stats dead_stats;	/* stats pushed from dead children */
 };
 
 struct cfq_io_cq {
@@ -280,8 +315,8 @@ struct cfq_data {
 	/*
 	 * The priority currently being served
 	 */
-	enum wl_prio_t serving_prio;
-	enum wl_type_t serving_type;
+	enum wl_class_t serving_wl_class;
+	enum wl_type_t serving_wl_type;
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
@@ -353,17 +388,17 @@ struct cfq_data {
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 
-static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
-					    enum wl_prio_t prio,
+static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
+				  enum wl_class_t class,
 					    enum wl_type_t type)
 {
 	if (!cfqg)
 		return NULL;
 
-	if (prio == IDLE_WORKLOAD)
+	if (class == IDLE_WORKLOAD)
 		return &cfqg->service_tree_idle;
 
-	return &cfqg->service_trees[prio][type];
+	return &cfqg->service_trees[class][type];
 }
 
 enum cfqq_state_flags {
@@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
 
-	if (blkg_rwstat_sum(&stats->queued))
+	if (blkg_rwstat_total(&stats->queued))
 		return;
 
 	/*
@@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 	struct cfqg_stats *stats = &cfqg->stats;
 
 	blkg_stat_add(&stats->avg_queue_size_sum,
-		      blkg_rwstat_sum(&stats->queued));
+		      blkg_rwstat_total(&stats->queued));
 	blkg_stat_add(&stats->avg_queue_size_samples, 1);
 	cfqg_stats_update_group_wait_time(stats);
 }
@@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 }
 
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
+{
+	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
+
+	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
+}
+
 static inline void cfqg_get(struct cfq_group *cfqg)
 {
 	return blkg_get(cfqg_to_blkg(cfqg));
@@ -586,8 +628,9 @@ static inline void cfqg_put(struct cfq_group *cfqg)
 	char __pbuf[128];						\
 									\
 	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
-	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
-			  cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
+	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c