diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-01 09:02:41 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-01 09:02:41 -0700 |
commit | 8cf1a3fce0b95050b63d451c9d561da0da2aa4d6 (patch) | |
tree | 0dc7f93474c3be601a5893900db1418dfd60ba5d | |
parent | fcff06c438b60f415af5983efe92811d6aa02ad1 (diff) | |
parent | 80799fbb7d10c30df78015b3fa21f7ffcfc0eb2c (diff) |
Merge branch 'for-3.6/core' of git://git.kernel.dk/linux-block
Pull core block IO bits from Jens Axboe:
"The most complicated part if this is the request allocation rework by
Tejun, which has been queued up for a long time and has been in
for-next ditto as well.
There are a few commits from yesterday and today, mostly trivial and
obvious fixes. So I'm pretty confident that it is sound. It's also
smaller than usual."
* 'for-3.6/core' of git://git.kernel.dk/linux-block:
block: remove dead func declaration
block: add partition resize function to blkpg ioctl
block: uninitialized ioc->nr_tasks triggers WARN_ON
block: do not artificially constrain max_sectors for stacking drivers
blkcg: implement per-blkg request allocation
block: prepare for multiple request_lists
block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv
blkcg: inline bio_blkcg() and friends
block: allocate io_context upfront
block: refactor get_request[_wait]()
block: drop custom queue draining used by scsi_transport_{iscsi|fc}
mempool: add @gfp_mask to mempool_create_node()
blkcg: make root blkcg allocation use %GFP_KERNEL
blkcg: __blkg_lookup_create() doesn't need radix preload
-rw-r--r-- | Documentation/block/queue-sysfs.txt | 7 | ||||
-rw-r--r-- | block/blk-cgroup.c | 139 | ||||
-rw-r--r-- | block/blk-cgroup.h | 128 | ||||
-rw-r--r-- | block/blk-core.c | 209 | ||||
-rw-r--r-- | block/blk-ioc.c | 1 | ||||
-rw-r--r-- | block/blk-settings.c | 3 | ||||
-rw-r--r-- | block/blk-sysfs.c | 34 | ||||
-rw-r--r-- | block/blk-throttle.c | 3 | ||||
-rw-r--r-- | block/blk.h | 4 | ||||
-rw-r--r-- | block/bsg-lib.c | 53 | ||||
-rw-r--r-- | block/genhd.c | 20 | ||||
-rw-r--r-- | block/ioctl.c | 59 | ||||
-rw-r--r-- | block/partition-generic.c | 4 | ||||
-rw-r--r-- | drivers/scsi/scsi_transport_fc.c | 38 | ||||
-rw-r--r-- | drivers/scsi/scsi_transport_iscsi.c | 2 | ||||
-rw-r--r-- | include/linux/blkdev.h | 53 | ||||
-rw-r--r-- | include/linux/blkpg.h | 1 | ||||
-rw-r--r-- | include/linux/bsg-lib.h | 1 | ||||
-rw-r--r-- | include/linux/genhd.h | 57 | ||||
-rw-r--r-- | include/linux/mempool.h | 3 | ||||
-rw-r--r-- | mm/mempool.c | 12 |
21 files changed, 530 insertions, 301 deletions
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index d8147b336c3..6518a55273e 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice this amount, since it applies only to reads or writes (not the accumulated sum). +To avoid priority inversion through request starvation, a request +queue maintains a separate request pool per each cgroup when +CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such +per-block-cgroup request pool. IOW, if there are N block cgroups, +each request queue may have upto N request pools, each independently +regulated by nr_requests. + read_ahead_kb (RW) ------------------ Maximum number of kilobytes to read-ahead for filesystems on this block diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7dee617358..f3b44a65fc7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; -struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkcg, css); -} -EXPORT_SYMBOL_GPL(cgroup_to_blkcg); - -static struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); -} - -struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); - return task_blkcg(current); -} -EXPORT_SYMBOL_GPL(bio_blkcg); - static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(pd); } + blk_exit_rl(&blkg->rl); kfree(blkg); } @@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg) * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @q: request_queue the new blkg is associated with + * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. */ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, + gfp_t gfp_mask) { struct blkcg_gq *blkg; int i; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); if (!blkg) return NULL; @@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) blkg->blkcg = blkcg; blkg->refcnt = 1; + /* root blkg uses @q->root_rl, init rl only for !root blkgs */ + if (blkcg != &blkcg_root) { + if (blk_init_rl(&blkg->rl, q, gfp_mask)) + goto err_free; + blkg->rl.blkg = blkg; + } + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); - if (!pd) { - blkg_free(blkg); - return NULL; - } + pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + if (!pd) + goto err_free; blkg->pd[i] = pd; pd->blkg = blkg; @@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) } return blkg; + +err_free: + blkg_free(blkg); + return NULL; } static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, @@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) } EXPORT_SYMBOL_GPL(blkg_lookup); +/* + * If @new_blkg is %NULL, this function tries to allocate a new one as + * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + */ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) + struct request_queue *q, + struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int ret; @@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, blkg = __blkg_lookup(blkcg, q); if (blkg) { rcu_assign_pointer(blkcg->blkg_hint, blkg); - return blkg; + goto out_free; } /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) - return ERR_PTR(-EINVAL); + if (!css_tryget(&blkcg->css)) { + blkg = ERR_PTR(-EINVAL); + goto out_free; + } /* allocate */ - ret = -ENOMEM; - blkg = blkg_alloc(blkcg, q); - if (unlikely(!blkg)) - goto err_put; + if (!new_blkg) { + new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + if (unlikely(!new_blkg)) { + blkg = ERR_PTR(-ENOMEM); + goto out_put; + } + } + blkg = new_blkg; /* insert */ - ret = radix_tree_preload(GFP_ATOMIC); - if (ret) - goto err_free; - spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { @@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, } spin_unlock(&blkcg->lock); - radix_tree_preload_end(); - if (!ret) return blkg; -err_free: - blkg_free(blkg); -err_put: + + blkg = ERR_PTR(ret); +out_put: css_put(&blkcg->css); - return ERR_PTR(ret); +out_free: + blkg_free(new_blkg); + return blkg; } struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, @@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, */ if (unlikely(blk_queue_bypass(q))) return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); - return __blkg_lookup_create(blkcg, q); + return __blkg_lookup_create(blkcg, q, NULL); } EXPORT_SYMBOL_GPL(blkg_lookup_create); @@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg) } EXPORT_SYMBOL_GPL(__blkg_release); +/* + * The next function used by blk_queue_for_each_rl(). It's a bit tricky + * because the root blkg uses @q->root_rl instead of its own rl. + */ +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q) +{ + struct list_head *ent; + struct blkcg_gq *blkg; + + /* + * Determine the current blkg list_head. The first entry is + * root_rl which is off @q->blkg_list and mapped to the head. + */ + if (rl == &q->root_rl) { + ent = &q->blkg_list; + } else { + blkg = container_of(rl, struct blkcg_gq, rl); + ent = &blkg->q_node; + } + + /* walk to the next list_head, skip root blkcg */ + ent = ent->next; + if (ent == &q->root_blkg->q_node) + ent = ent->next; + if (ent == &q->blkg_list) + return NULL; + + blkg = container_of(ent, struct blkcg_gq, q_node); + return &blkg->rl; +} + static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q, struct blkcg_gq *blkg; struct blkg_policy_data *pd, *n; int cnt = 0, ret; + bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; + /* preallocations for root blkg */ + blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + blk_queue_bypass_start(q); /* make sure the root blkg exists and count the existing blkgs */ spin_lock_irq(q->queue_lock); rcu_read_lock(); - blkg = __blkg_lookup_create(&blkcg_root, q); + blkg = __blkg_lookup_create(&blkcg_root, q, blkg); rcu_read_unlock(); + if (preloaded) + radix_tree_preload_end(); + if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto out_unlock; } q->root_blkg = blkg; + q->root_rl.blkg = blkg; list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8ac457ce778..24597309e23 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,6 +17,7 @@ #include <linux/u64_stats_sync.h> #include <linux/seq_file.h> #include <linux/radix-tree.h> +#include <linux/blkdev.h> /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -93,6 +94,8 @@ struct blkcg_gq { struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; + /* request allocation list for this blkcg-q pair */ + struct request_list rl; /* reference count */ int refcnt; @@ -120,8 +123,6 @@ struct blkcg_policy { extern struct blkcg blkcg_root; -struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup); -struct blkcg *bio_blkcg(struct bio *bio); struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); @@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); +static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkcg, css); +} + +static inline struct blkcg *task_blkcg(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkcg, css); +} + +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_css) + return container_of(bio->bi_css, struct blkcg, css); + return task_blkcg(current); +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -234,6 +254,95 @@ static inline void blkg_put(struct blkcg_gq *blkg) } /** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} + +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. + */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + +/** * blkg_stat_add - add a value to a blkg_stat * @stat: target blkg_stat * @val: value to add @@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) #else /* CONFIG_BLK_CGROUP */ struct cgroup; +struct blkcg; struct blkg_policy_data { }; @@ -361,8 +471,6 @@ struct blkcg_gq { struct blkcg_policy { }; -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } @@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } +static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } @@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 93eb3e4f88c..dd134d834d5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); - drain |= q->rq.elvpriv; + drain |= q->nr_rqs_elvpriv; /* * Unfortunately, requests are queued at and tracked from @@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (drain_all) { drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { - drain |= q->rq.count[i]; + drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; drain |= !list_empty(&q->flush_queue[i]); } @@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) * left with hung waiters. We need to wake up those waiters. */ if (q->request_fn) { + struct request_list *rl; + spin_lock_irq(q->queue_lock); - for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) - wake_up_all(&q->rq.wait[i]); + + blk_queue_for_each_rl(rl, q) + for (i = 0; i < ARRAY_SIZE(rl->wait); i++) + wake_up_all(&rl->wait[i]); + spin_unlock_irq(q->queue_lock); } } @@ -517,28 +522,33 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static int blk_init_free_list(struct request_queue *q) +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask) { - struct request_list *rl = &q->rq; - if (unlikely(rl->rq_pool)) return 0; + rl->q = q; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - rl->elvpriv = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, q->node); - + mempool_free_slab, request_cachep, + gfp_mask, q->node); if (!rl->rq_pool) return -ENOMEM; return 0; } +void blk_exit_rl(struct request_list *rl) +{ + if (rl->rq_pool) + mempool_destroy(rl->rq_pool); +} + struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { return blk_alloc_queue_node(gfp_mask, -1); @@ -680,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - if (blk_init_free_list(q)) + if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) return NULL; q->request_fn = rfn; @@ -722,15 +732,15 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static inline void blk_free_request(struct request_queue *q, struct request *rq) +static inline void blk_free_request(struct request_list *rl, struct request *rq) { if (rq->cmd_flags & REQ_ELVPRIV) { - elv_put_request(q, rq); + elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); } - mempool_free(rq, q->rq.rq_pool); + mempool_free(rq, rl->rq_pool); } /* @@ -767,18 +777,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int sync) +static void __freed_request(struct request_list *rl, int sync) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; - if (rl->count[sync] < queue_congestion_off_threshold(q)) + /* + * bdi isn't aware of blkcg yet. As all async IOs end up root + * blkcg anyway, just use root blkcg state. + */ + if (rl == &q->root_rl && + rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); - blk_clear_queue_full(q, sync); + blk_clear_rl_full(rl, sync); } } @@ -786,19 +801,20 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, unsigned int flags) +static void freed_request(struct request_list *rl, unsigned int flags) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; int sync = rw_is_sync(flags); + q->nr_rqs[sync]--; rl->count[sync]--; if (flags & REQ_ELVPRIV) - rl->elvpriv--; + q->nr_rqs_elvpriv--; - __freed_request(q, sync); + __freed_request(rl, sync); if (unlikely(rl->starved[sync ^ 1])) - __freed_request(q, sync ^ 1); + __freed_request(rl, sync ^ 1); } /* @@ -837,8 +853,8 @@ static struct io_context *rq_ioc(struct bio *bio) } /** - * get_request - get a free request - * @q: request_queue to allocate request from + * __get_request - get a free request + * @rl: request list to allocate from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask @@ -850,20 +866,16 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns %NULL on failure, with @q->queue_lock held. * Returns !%NULL on success, with @q->queue_lock *not held*. */ -static struct request *get_request(struct request_queue *q, int rw_flags, - struct bio *bio, gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, int rw_flags, + struct bio *bio, gfp_t gfp_mask) { + struct request_queue *q = rl->q; struct request *rq; - struct request_list *rl = &q->rq; - struct elevator_type *et; - struct io_context *ioc; + struct elevator_type *et = q->elevator->type; + struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - bool retried = false; int may_queue; -retry: - et = q->elevator->type; - ioc = rq_ioc(bio); if (unlikely(blk_queue_dead(q))) return NULL; @@ -875,28 +887,14 @@ retry: if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[is_sync]+1 >= q->nr_requests) { /* - * We want ioc to record batching state. If it's - * not already there, creating a new one requires - * dropping queue_lock, which in turn requires - * retesting conditions to avoid queue hang. - */ - if (!ioc && !retried) { - spin_unlock_irq(q->queue_lock); - create_io_context(gfp_mask, q->node); - spin_lock_irq(q->queue_lock); - retried = true; - goto retry; - } - - /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". * This process will be allowed to complete a batch of * requests, others will be blocked. */ - if (!blk_queue_full(q, is_sync)) { + if (!blk_rl_full(rl, is_sync)) { ioc_set_batching(q, ioc); - blk_set_queue_full(q, is_sync); + blk_set_rl_full(rl, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { @@ -909,7 +907,12 @@ retry: } } } - blk_set_queue_congested(q, is_sync); + /* + * bdi isn't aware of blkcg yet. As all async IOs end up + * root blkcg anyway, just use root blkcg state. + */ + if (rl == &q->root_rl) + blk_set_queue_congested(q, is_sync); } /* @@ -920,6 +923,7 @@ retry: if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) return NULL; + q->nr_rqs[is_sync]++; rl->count[is_sync]++; rl->starved[is_sync] = 0; @@ -935,7 +939,7 @@ retry: */ if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { rw_flags |= REQ_ELVPRIV; - rl->elvpriv++; + q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } @@ -945,22 +949,19 @@ retry: spin_unlock_irq(q->queue_lock); /* allocate and init request */ - rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + rq = mempool_alloc(rl->rq_pool, gfp_mask); if (!rq) goto fail_alloc; blk_rq_init(q, rq); + blk_rq_set_rl(rq, rl); rq->cmd_flags = rw_flags | REQ_ALLOCED; /* init elvpriv */ if (rw_flags & REQ_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { - create_io_context(gfp_mask, q->node); - ioc = rq_ioc(bio); - if (!ioc) - goto fail_elvpriv; - - icq = ioc_create_icq(ioc, q, gfp_mask); + if (ioc) + icq = ioc_create_icq(ioc, q, gfp_mask); if (!icq) goto fail_elvpriv; } @@ -1000,7 +1001,7 @@ fail_elvpriv: rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); - rl->elvpriv--; + q->nr_rqs_elvpriv--; spin_unlock_irq(q->queue_lock); goto out; @@ -1013,7 +1014,7 @@ fail_alloc: * queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, rw_flags); + freed_request(rl, rw_flags); /* * in the very unlikely event that allocation failed and no @@ -1029,56 +1030,58 @@ rq_starved: } /** - * get_request_wait - get a free request with retry + * get_request - get a free request * @q: request_queue to allocate request from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask * - * Get a free request from @q. This function keeps retrying under memory - * pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this + * function keeps retrying under memory pressure and fails iff @q is dead. * * Must be callled with @q->queue_lock held and, * Returns %NULL on failure, with @q->queue_lock held. * Returns !%NULL on success, with @q->queue_lock *not held*. */ -static struct request *get_request_wait(struct request_queue *q, int rw_flags, - struct bio *bio) +static struct request *get_request(struct request_queue *q, int rw_flags, + struct bio *bio, gfp_t gfp_mask) { const bool is_sync = rw_is_sync(rw_flags) != 0; + DEFINE_WAIT(wait); + struct request_list *rl; struct request *rq; - rq = get_request(q, rw_flags, bio, GFP_NOIO); - while (!rq) { - DEFINE_WAIT(wait); - struct request_list *rl = &q->rq; - - if (unlikely(blk_queue_dead(q))) - return NULL; + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ +retry: + rq = __get_request(rl, rw_flags, bio, gfp_mask); + if (rq) + return rq; - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); + if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) { + blk_put_rl(rl); + return NULL; + } - trace_block_sleeprq(q, bio, rw_flags & 1); + /* wait on @rl and retry */ + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, + TASK_UNINTERRUPTIBLE); - spin_unlock_irq(q->queue_lock); - io_schedule(); + trace_block_sleeprq(q, bio, rw_flags & 1); - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - create_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, current->io_context); + spin_unlock_irq(q->queue_lock); + io_schedule(); - spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); + /* + * After sleeping, we become a "batching" process and will be able + * to allocate at least one request, and up to a big batch of them + * for a small period time. See ioc_batching, ioc_set_batching + */ + ioc_set_batching(q, current->io_context); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - }; + spin_lock_irq(q->queue_lock); + finish_wait(&rl->wait[is_sync], &wait); - return rq; + goto retry; } struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) @@ -1087,11 +1090,11 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) BUG_ON(rw != READ && rw != WRITE); + /* create ioc upfront */ + create_io_context(gfp_mask, q->node); + spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) - rq = get_request_wait(q, rw, NULL); - else - rq = get_request(q, rw, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (!rq) spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ @@ -1248,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (req->cmd_flags & REQ_ALLOCED) { unsigned int flags = req->cmd_flags; + struct request_list *rl = blk_rq_rl(req); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); - blk_free_request(q, req); - freed_request(q, flags); + blk_free_request(rl, req); + freed_request(rl, flags); + blk_put_rl(rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1481,7 +1486,7 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request_wait(q, rw_flags, bio); + req = get_request(q, rw_flags, bio, GFP_NOIO); if (unlikely(!req)) { bio_endio(bio, -ENODEV); /* @q is dead */ goto out_unlock; @@ -1702,6 +1707,14 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + /* + * Various block parts want %current->io_context and lazy ioc + * allocation ends up trading a lot of pain for a small amount of + * memory. Just allocate it upfront. This may fail and block + * layer knows how to live with it. + */ + create_io_context(GFP_ATOMIC, q->node); + if (blk_throtl_bio(q, bio)) return false; /* throttled, will be resubmitted later */ diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 893b8007c65..fab4cdd3f7b 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -244,6 +244,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) /* initialize */ atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); |