diff options
author | Jens Axboe <jens.axboe@oracle.com> | 2008-01-29 14:49:21 +0100 |
---|---|---|
committer | Jens Axboe <jens.axboe@oracle.com> | 2008-01-29 21:55:05 +0100 |
commit | a168ee84c90b39ece357da127ab388f2f64db19c (patch) | |
tree | acb5527d9835c06e4eb2c308850e74db857f7d64 /block/blk-core.c | |
parent | 9bf722598fcd51073974850ae026b44389430ecc (diff) |
block: first step of splitting ll_rw_blk, rename it
Then we retain history in blk-core.c
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'block/blk-core.c')
-rw-r--r-- | block/blk-core.c | 4457 |
1 files changed, 4457 insertions, 0 deletions
diff --git a/block/blk-core.c b/block/blk-core.c new file mode 100644 index 00000000000..1932a56f5e4 --- /dev/null +++ b/block/blk-core.c @@ -0,0 +1,4457 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1994, Karl Keyte: Added support for disk statistics + * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> + * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 + * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 + */ + +/* + * This handles all read/write requests to block devices + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/blktrace_api.h> +#include <linux/fault-inject.h> +#include <linux/scatterlist.h> + +/* + * for max sense size + */ +#include <scsi/scsi_cmnd.h> + +static void blk_unplug_work(struct work_struct *work); +static void blk_unplug_timeout(unsigned long data); +static void drive_stat_acct(struct request *rq, int new_io); +static void init_request_from_bio(struct request *req, struct bio *bio); +static int __make_request(struct request_queue *q, struct bio *bio); +static struct io_context *current_io_context(gfp_t gfp_flags, int node); +static void blk_recalc_rq_segments(struct request *rq); +static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio); + +/* + * For the allocated request tables + */ +static struct kmem_cache *request_cachep; + +/* + * For queue allocation + */ +static struct kmem_cache *requestq_cachep; + +/* + * For io context allocations + */ +static struct kmem_cache *iocontext_cachep; + +/* + * Controlling structure to kblockd + */ +static struct workqueue_struct *kblockd_workqueue; + +unsigned long blk_max_low_pfn, blk_max_pfn; + +EXPORT_SYMBOL(blk_max_low_pfn); +EXPORT_SYMBOL(blk_max_pfn); + +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + +/* Amount of time in which a process may batch requests */ +#define BLK_BATCH_TIME (HZ/50UL) + +/* Number of requests a "batching" process may submit */ +#define BLK_BATCH_REQ 32 + +/* + * Return the threshold (number of used requests) at which the queue is + * considered to be congested. It include a little hysteresis to keep the + * context switch rate down. + */ +static inline int queue_congestion_on_threshold(struct request_queue *q) +{ + return q->nr_congestion_on; +} + +/* + * The threshold at which a queue is considered to be uncongested + */ +static inline int queue_congestion_off_threshold(struct request_queue *q) +{ + return q->nr_congestion_off; +} + +static void blk_queue_congestion_threshold(struct request_queue *q) +{ + int nr; + + nr = q->nr_requests - (q->nr_requests / 8) + 1; + if (nr > q->nr_requests) + nr = q->nr_requests; + q->nr_congestion_on = nr; + + nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; + if (nr < 1) + nr = 1; + q->nr_congestion_off = nr; +} + +/** + * blk_get_backing_dev_info - get the address of a queue's backing_dev_info + * @bdev: device + * + * Locates the passed device's request queue and returns the address of its + * backing_dev_info + * + * Will return NULL if the request queue cannot be located. + */ +struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) +{ + struct backing_dev_info *ret = NULL; + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + ret = &q->backing_dev_info; + return ret; +} +EXPORT_SYMBOL(blk_get_backing_dev_info); + +/** + * blk_queue_prep_rq - set a prepare_request function for queue + * @q: queue + * @pfn: prepare_request function + * + * It's possible for a queue to register a prepare_request callback which + * is invoked before the request is handed to the request_fn. The goal of + * the function is to prepare a request for I/O, it can be used to build a + * cdb from the request data for instance. + * + */ +void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) +{ + q->prep_rq_fn = pfn; +} + +EXPORT_SYMBOL(blk_queue_prep_rq); + +/** + * blk_queue_merge_bvec - set a merge_bvec function for queue + * @q: queue + * @mbfn: merge_bvec_fn + * + * Usually queues have static limitations on the max sectors or segments that + * we can put in a request. Stacking drivers may have some settings that + * are dynamic, and thus we have to query the queue whether it is ok to + * add a new bio_vec to a bio at a given offset or not. If the block device + * has such limitations, it needs to register a merge_bvec_fn to control + * the size of bio's sent to it. Note that a block device *must* allow a + * single page to be added to an empty bio. The block device driver may want + * to use the bio_split() function to deal with these bio's. By default + * no merge_bvec_fn is defined for a queue, and only the fixed limits are + * honored. + */ +void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) +{ + q->merge_bvec_fn = mbfn; +} + +EXPORT_SYMBOL(blk_queue_merge_bvec); + +void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) +{ + q->softirq_done_fn = fn; +} + +EXPORT_SYMBOL(blk_queue_softirq_done); + +/** + * blk_queue_make_request - define an alternate make_request function for a device + * @q: the request queue for the device to be affected + * @mfn: the alternate make_request function + * + * Description: + * The normal way for &struct bios to be passed to a device + * driver is for them to be collected into requests on a request + * queue, and then to allow the device driver to select requests + * off that queue when it is ready. This works well for many block + * devices. However some block devices (typically virtual devices + * such as md or lvm) do not benefit from the processing on the + * request queue, and are served best by having the requests passed + * directly to them. This can be achieved by providing a function + * to blk_queue_make_request(). + * + * Caveat: + * The driver that does this *must* be able to deal appropriately + * with buffers in "highmemory". This can be accomplished by either calling + * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * blk_queue_bounce() to create a buffer in normal memory. + **/ +void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) +{ + /* + * set defaults + */ + q->nr_requests = BLKDEV_MAX_RQ; + blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); + blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); + q->make_request_fn = mfn; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + q->backing_dev_info.state = 0; + q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + blk_queue_max_sectors(q, SAFE_MAX_SECTORS); + blk_queue_hardsect_size(q, 512); + blk_queue_dma_alignment(q, 511); + blk_queue_congestion_threshold(q); + q->nr_batching = BLK_BATCH_REQ; + + q->unplug_thresh = 4; /* hmm */ + q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + if (q->unplug_delay == 0) + q->unplug_delay = 1; + + INIT_WORK(&q->unplug_work, blk_unplug_work); + + q->unplug_timer.function = blk_unplug_timeout; + q->unplug_timer.data = (unsigned long)q; + + /* + * by default assume old behaviour and bounce for any highmem page + */ + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); +} + +EXPORT_SYMBOL(blk_queue_make_request); + +static void rq_init(struct request_queue *q, struct request *rq) +{ + INIT_LIST_HEAD(&rq->queuelist); + INIT_LIST_HEAD(&rq->donelist); + + rq->errors = 0; + rq->bio = rq->biotail = NULL; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->ioprio = 0; + rq->buffer = NULL; + rq->ref_count = 1; + rq->q = q; + rq->special = NULL; + rq->data_len = 0; + rq->data = NULL; + rq->nr_phys_segments = 0; + rq->sense = NULL; + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->completion_data = NULL; + rq->next_rq = NULL; +} + +/** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @ordered: one of QUEUE_ORDERED_* + * @prepare_flush_fn: rq setup helper for cache flush ordered writes + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + **/ +int blk_queue_ordered(struct request_queue *q, unsigned ordered, + prepare_flush_fn *prepare_flush_fn) +{ + if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && + prepare_flush_fn == NULL) { + printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); + return -EINVAL; + } + + if (ordered != QUEUE_ORDERED_NONE && + ordered != QUEUE_ORDERED_DRAIN && + ordered != QUEUE_ORDERED_DRAIN_FLUSH && + ordered != QUEUE_ORDERED_DRAIN_FUA && + ordered != QUEUE_ORDERED_TAG && + ordered != QUEUE_ORDERED_TAG_FLUSH && + ordered != QUEUE_ORDERED_TAG_FUA) { + printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); + return -EINVAL; + } + + q->ordered = ordered; + q->next_ordered = ordered; + q->prepare_flush_fn = prepare_flush_fn; + + return 0; +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/* + * Cache flushing for ordered writes handling + */ +inline unsigned blk_ordered_cur_seq(struct request_queue *q) +{ + if (!q->ordseq) + return 0; + return 1 << ffz(q->ordseq); +} + +unsigned blk_ordered_req_seq(struct request *rq) +{ + struct request_queue *q = rq->q; + + BUG_ON(q->ordseq == 0); + + if (rq == &q->pre_flush_rq) + return QUEUE_ORDSEQ_PREFLUSH; + if (rq == &q->bar_rq) + return QUEUE_ORDSEQ_BAR; + if (rq == &q->post_flush_rq) + return QUEUE_ORDSEQ_POSTFLUSH; + + /* + * !fs requests don't need to follow barrier ordering. Always + * put them at the front. This fixes the following deadlock. + * + * http://thread.gmane.org/gmane.linux.kernel/537473 + */ + if (!blk_fs_request(rq)) + return QUEUE_ORDSEQ_DRAIN; + + if ((rq->cmd_flags & REQ_ORDERED_COLOR) == + (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) + return QUEUE_ORDSEQ_DRAIN; + else + return QUEUE_ORDSEQ_DONE; +} + +void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) +{ + struct request *rq; + + if (error && !q->orderr) + q->orderr = error; + + BUG_ON(q->ordseq & seq); + q->ordseq |= seq; + + if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) + return; + + /* + * Okay, sequence complete. + */ + q->ordseq = 0; + rq = q->orig_bar_rq; + + if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) + BUG(); +} + +static void pre_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); +} + +static void bar_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); +} + +static void post_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); +} + +static void queue_flush(struct request_queue *q, unsigned which) +{ + struct request *rq; + rq_end_io_fn *end_io; + + if (which == QUEUE_ORDERED_PREFLUSH) { + rq = &q->pre_flush_rq; + end_io = pre_flush_end_io; + } else { + rq = &q->post_flush_rq; + end_io = post_flush_end_io; + } + + rq->cmd_flags = REQ_HARDBARRIER; + rq_init(q, rq); + rq->elevator_private = NULL; + rq->elevator_private2 = NULL; + rq->rq_disk = q->bar_rq.rq_disk; + rq->end_io = end_io; + q->prepare_flush_fn(q, rq); + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); +} + +static inline struct request *start_ordered(struct request_queue *q, + struct request *rq) +{ + q->orderr = 0; + q->ordered = q->next_ordered; + q->ordseq |= QUEUE_ORDSEQ_STARTED; + + /* + * Prep proxy barrier request. + */ + blkdev_dequeue_request(rq); + q->orig_bar_rq = rq; + rq = &q->bar_rq; + rq->cmd_flags = 0; + rq_init(q, rq); + if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) + rq->cmd_flags |= REQ_RW; + if (q->ordered & QUEUE_ORDERED_FUA) + rq->cmd_flags |= REQ_FUA; + rq->elevator_private = NULL; + rq->elevator_private2 = NULL; + init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->end_io = bar_end_io; + + /* + * Queue ordered sequence. As we stack them at the head, we + * need to queue in reverse order. Note that we rely on that + * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs + * request gets inbetween ordered sequence. If this request is + * an empty barrier, we don't need to do a postflush ever since + * there will be no data written between the pre and post flush. + * Hence a single flush will suffice. + */ + if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) + queue_flush(q, QUEUE_ORDERED_POSTFLUSH); + else + q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + + if (q->ordered & QUEUE_ORDERED_PREFLUSH) { + queue_flush(q, QUEUE_ORDERED_PREFLUSH); + rq = &q->pre_flush_rq; + } else + q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; + + if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) + q->ordseq |= QUEUE_ORDSEQ_DRAIN; + else + rq = NULL; + + return rq; +} + +int blk_do_ordered(struct request_queue *q, struct request **rqp) +{ + struct request *rq = *rqp; + const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); + + if (!q->ordseq) { + if (!is_barrier) + return 1; + + if (q->next_ordered != QUEUE_ORDERED_NONE) { + *rqp = start_ordered(q, rq); + return 1; + } else { + /* + * This can happen when the queue switches to + * ORDERED_NONE while this request is on it. + */ + blkdev_dequeue_request(rq); + if (__blk_end_request(rq, -EOPNOTSUPP, + blk_rq_bytes(rq))) + BUG(); + *rqp = NULL; + return 0; + } + } + + /* + * Ordered sequence in progress + */ + + /* Special requests are not subject to ordering rules. */ + if (!blk_fs_request(rq) && + rq != &q->pre_flush_rq && rq != &q->post_flush_rq) + return 1; + + if (q->ordered & QUEUE_ORDERED_TAG) { + /* Ordered by tag. Blocking the next barrier is enough. */ + if (is_barrier && rq != &q->bar_rq) + *rqp = NULL; + } else { + /* Ordered by draining. Wait for turn. */ + WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); + if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) + *rqp = NULL; + } + + return 1; +} + +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, int error) +{ + struct request_queue *q = rq->q; + + if (&q->bar_rq != rq) { + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (unlikely(nbytes > bio->bi_size)) { + printk("%s: want %u bytes done, only %u left\n", + __FUNCTION__, nbytes, bio->bi_size); + nbytes = bio->bi_size; + } + + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); + if (bio->bi_size == 0) + bio_endio(bio, error); + } else { + + /* + * Okay, this is the barrier request in progress, just + * record the error; + */ + if (error && !q->orderr) + q->orderr = error; + } +} + +/** + * blk_queue_bounce_limit - set bounce buffer limit for queue + * @q: the request queue for the device + * @dma_addr: bus address limit + * + * Description: + * Different hardware can have different requirements as to what pages + * it can do I/O directly to. A low level driver can call + * blk_queue_bounce_limit to have lower memory pages allocated as bounce + * buffers for doing I/O to pages residing above @page. + **/ +void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) +{ + unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + int dma = 0; + + q->bounce_gfp = GFP_NOIO; +#if BITS_PER_LONG == 64 + /* Assume anything <= 4GB can be handled by IOMMU. + Actually some IOMMUs can handle everything, but I don't + know of a way to test this here. */ + if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) + dma = 1; + q->bounce_pfn = max_low_pfn; +#else + if (bounce_pfn < blk_max_low_pfn) + dma = 1; + q->bounce_pfn = bounce_pfn; +#endif + if (dma) { + init_emergency_isa_pool(); + q->bounce_gfp = GFP_NOIO | GFP_DMA; + q->bounce_pfn = bounce_pfn; + } +} + +EXPORT_SYMBOL(blk_queue_bounce_limit); + +/** + * blk_queue_max_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_sectors: max sectors in the usual 512b unit + * + * Description: + * Enables a low level driver to set an upper limit on the size of + * received requests. + **/ +void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) +{ + if ((max_sectors << 9) < PAGE_CACHE_SIZE) { + max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); + } + + if (BLK_DEF_MAX_SECTORS > max_sectors) + q->max_hw_sectors = q->max_sectors = max_sectors; + else { + q->max_sectors = BLK_DEF_MAX_SECTORS; + q->max_hw_sectors = max_sectors; + } +} + +EXPORT_SYMBOL(blk_queue_max_sectors); + +/** + * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * physical data segments in a request. This would be the largest sized + * scatter list the driver could handle. + **/ +void blk_queue_max_phys_segments(struct request_queue *q, + unsigned short max_segments) +{ + if (!max_segments) { + max_segments = 1; + printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); + } + + q->max_phys_segments = max_segments; +} + +EXPORT_SYMBOL(blk_queue_max_phys_segments); + +/** + * blk_queue_max_hw_segments - set max hw segments for a request for this queue + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * hw data segments in a request. This would be the largest number of + * address/length pairs the host adapter can actually give as once + * to the device. + **/ +void blk_queue_max_hw_segments(struct request_queue *q, + unsigned short max_segments) +{ + if (!max_segments) { + max_segments = 1; + printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); + } + + q->max_hw_segments = max_segments; +} + +EXPORT_SYMBOL(blk_queue_max_hw_segments); + +/** + * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg + * @q: the request queue for the device + * @max_size: max size of segment in bytes + * + * Description: + * Enables a low level driver to set an upper limit on the size of a + * coalesced segment + **/ +void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) +{ + if (max_size < PAGE_CACHE_SIZE) { + max_size = PAGE_CACHE_SIZE; + printk("%s: set to minimum %d\n", __FUNCTION__, max_size); + } + + q->max_segment_size = max_size; +} + +EXPORT_SYMBOL(blk_queue_max_segment_size); + +/** + * blk_queue_hardsect_size - set hardware sector size for the queue + * @q: the request queue for the device + * @size: the hardware sector size, in bytes + * + * Description: + * This should typically be set to the lowest possible sector size + * that the hardware can operate on (possible without reverting to + * even internal read-modify-write operations). Usually the default + * of 512 covers most hardware. + **/ +void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) +{ + q->hardsect_size = size; +} + +EXPORT_SYMBOL(blk_queue_hardsect_size); + +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + +/** + * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers + * @t: the stacking driver (top) + * @b: the underlying device (bottom) + **/ +void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) +{ + /* zero is "infinity" */ + t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); + + t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); + t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); + t->max_segment_size = min(t->max_segment_size,b->max_segment_size); + t->hardsect_size = max(t->hardsect_size,b->hardsect_size); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_stack_limits); + +/** + * blk_queue_dma_drain - Set up a drain buffer for excess dma. + * + * @q: the request queue for the device + * @buf: physically contiguous buffer + * @size: size of the buffer in bytes + * + * Some devices have excess DMA problems and can't simply discard (or + * zero fill) the unwanted piece of the transfer. They have to have a + * real area of memory to transfer it into. The use case for this is + * ATAPI devices in DMA mode. If the packet command causes a transfer + * bigger than the transfer size some HBAs will lock up if there + * aren't DMA elements to contain the excess transfer. What this API + * does is adjust the queue so that the buf is always appended + * silently to the scatterlist. + * + * Note: This routine adjusts max_hw_segments to make room for + * appending the drain buffer. If you call + * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after + * calling this routine, you must set the limit to one fewer than your + * device can support otherwise there won't be room for the drain + * buffer. + */ +int blk_queue_dma_drain(struct request_queue *q, void *buf, + unsigned int size) +{ + if (q->max_hw_segments < 2 || q->max_phys_segments < 2) + return -EINVAL; + /* make room for appending the drain */ + --q->max_hw_segments; + --q->max_phys_segments; + q->dma_drain_buffer = buf; + q->dma_drain_size = size; + + return 0; +} + +EXPORT_SYMBOL_GPL(blk_queue_dma_drain); + +/** + * blk_queue_segment_boundary - set boundary rules for segment merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) +{ + if (mask < PAGE_CACHE_SIZE - 1) { + mask = PAGE_CACHE_SIZE - 1; + printk("%s: set to minimum %lx\n", __FUNCTION__, mask); + } + + q->seg_boundary_mask = mask; +} + +EXPORT_SYMBOL(blk_queue_segment_boundary); + +/** + * blk_queue_dma_alignment - set dma length and memory alignment + * @q: the request queue for the device + * @mask: alignment mask + * + * description: + * set required memory and length aligment for direct dma transactions. + * this is used when buiding direct io requests for the queue. + * + **/ +void blk_queue_dma_alignment(struct request_queue *q, int mask) +{ + q->dma_alignment = mask; +} + +EXPORT_SYMBOL(blk_queue_dma_alignment); + +/** + * blk_queue_update_dma_alignment - update dma length and memory alignment + * @q: the request queue for the device + * @mask: alignment mask + * + * description: + * update required memory and length aligment for direct dma transactions. + * If the requested alignment is larger than the current alignment, then + * the current queue alignment is updated to the new value, otherwise it + * is left alone. The design of this is to allow multiple objects + * (driver, device, transport etc) to set their respective + * alignments without having them interfere. + * + **/ +void blk_queue_update_dma_alignment(struct request_queue *q, int mask) +{ + BUG_ON(mask > PAGE_SIZE); + + if (mask > q->dma_alignment) + q->dma_alignment = mask; +} + +EXPORT_SYMBOL(blk_queue_update_dma_alignment); + +/** + * blk_queue_find_tag - find a request by its tag and queue + * @q: The request queue for the device + * @tag: The tag of the request + * + * Notes: + * Should be used when a device returns a tag and you want to match + * it with a request. + * + * no locks need be held. + **/ +struct request *blk_queue_find_tag(struct request_queue *q, int tag) +{ + return blk_map_queue_find_tag(q->queue_tags, tag); +} + +EXPORT_SYMBOL(blk_queue_find_tag); + +/** + * __blk_free_tags - release a given set of tag maintenance info + * @bqt: the tag map to free + * + * Tries to free the specified @bqt@. Returns true if it was + * actually freed and false if there are still references using it + */ +static int __blk_free_tags(struct blk_queue_tag *bqt) +{ + int retval; + + retval = atomic_dec_and_test(&bqt->refcnt); + if (retval) { + BUG_ON(bqt->busy); + + kfree(bqt->tag_index); + bqt->tag_index = NULL; + + kfree(bqt->tag_map); + bqt->tag_map = NULL; + + kfree(bqt); + + } + + return retval; +} + +/** + * __blk_queue_free_tags - release tag maintenance info + * @q: the request queue for the device + * + * Notes: + * blk_cleanup_queue() will take care of calling this function, if tagging + * has been used. So there's no need to call this directly. + **/ +static void __blk_queue_free_tags(struct request_queue *q) +{ + struct blk_queue_tag *bqt = q->queue_tags; + + if (!bqt) + return; + + __blk_free_tags(bqt); + + q->queue_tags = NULL; + q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); +} + + +/** + * blk_free_tags - release a given set of tag maintenance info + * @bqt: the tag map to free + * + * For externally managed @bqt@ frees the map. Callers of this + * function must guarantee to have released all the queues that + * might have been using this tag map. + */ +void blk_free_tags(struct blk_queue_tag *bqt) +{ + if (unlikely(!__blk_free_tags(bqt))) + BUG(); +} +EXPORT_SYMBOL(blk_free_tags); + +/** + * blk_queue_free_tags - release tag maintenance info + * @q: the request queue for the device + * + * Notes: + * This is used to disabled tagged queuing to a device, yet leave + * queue in function. + **/ +void blk_queue_free_tags(struct request_queue *q) +{ + clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_free_tags); + +static int +init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) +{ + struct request **tag_index; + unsigned long *tag_map; + int nr_ulongs; + + if (q && depth > q->nr_requests * 2) { + depth = q->nr_requests * 2; + printk(KERN_ERR "%s: adjusted depth to %d\n", + __FUNCTION__, depth); + } + + tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); + if (!tag_index) + goto fail; + + nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; + tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); + if (!tag_map) + goto fail; + + tags->real_max_depth = depth; + tags->max_depth = depth; + tags->tag_index = tag_index; + tags->tag_map = tag_map; + + return 0; +fail: + kfree(tag_index); + return -ENOMEM; +} + +static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, + int depth) +{ + struct blk_queue_tag *tags; + + tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); + if (!tags) + goto fail; + + if (init_tag_map(q, tags, depth)) + goto fail; + + tags->busy = 0; + atomic_set(&tags->refcnt, 1); + return tags; +fail: + kfree(tags); + return NULL; +} + +/** + * blk_init_tags - initialize the tag info for an external tag map + * @depth: the maximum queue depth supported + * @tags: the tag to use + **/ +struct blk_queue_tag *blk_init_tags(int depth) +{ + return __blk_queue_init_tags(NULL, depth); +} +EXPORT_SYMBOL(blk_init_tags); + +/** + * blk_queue_init_tags - initialize the queue tag info + * @q: the request queue for the device + * @depth: the maximum queue depth supported + * @tags: the tag to use + **/ +int blk_queue_init_tags(struct request_queue *q, int depth, + struct blk_queue_tag *tags) +{ + int rc; + + BUG_ON(tags && q->queue_tags && tags != q->queue_tags); + + if (!tags && !q->queue_tags) { + tags = __blk_queue_init_tags(q, depth); + + if (!tags) + goto fail; + } else if (q->queue_tags) { + if ((rc = blk_queue_resize_tags(q, depth))) + return rc; + set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); + return 0; + } else + atomic_inc(&tags->refcnt); + + /* + * assign it, all done + */ + q->queue_tags = tags; + q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); + INIT_LIST_HEAD(&q->tag_busy_list); + return 0; +fail: + kfree(tags); + return -ENOMEM; +} + +EXPORT_SYMBOL(blk_queue_init_tags); + +/** + * blk_queue_resize_tags - change the queueing depth + * @q: the request queue for the device + * @new_depth: the new max command queueing depth + * + * Notes: + * Must be called with the queue lock held. + **/ +int blk_queue_resize_tags(struct request_queue *q, int new_depth) +{ + struct blk_queue_tag *bqt = q->queue_tags; + struct request **tag_index; + unsigned long *tag_map; + int max_depth, nr_ulongs; + + if (!bqt) + return -ENXIO; + + /* + * if we already have large enough real_max_depth. just + * adjust max_depth. *NOTE* as requests with tag value + * between new_depth and real_max_depth can be in-flight, tag + * map can not be shrunk blindly here. + */ + if (new_depth <= bqt->real_max_depth) { + bqt->max_depth = new_depth; + return 0; + } + + /* + * Currently cannot replace a shared tag map with a new + * one, so error out if this is the case + */ + if (atomic_read(&bqt->refcnt) != 1) + return -EBUSY; + + /* + * save the old state info, so we can copy it back + */ + tag_index = bqt->tag_index; + tag_map = bqt->tag_map; + max_depth = bqt->real_max_depth; + + if (init_tag_map(q, bqt, new_depth)) + return -ENOMEM; + + memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); + nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; + memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); + + kfree(tag_index); + kfree(tag_map); + return 0; +} + +EXPORT_SYMBOL(blk_queue_resize_tags); + +/** + * blk_queue_end_tag - end tag operations for a request + * @q: the request queue for the device + * @rq: the request that has completed + * + * Description: + * Typically called when end_that_request_first() returns 0, meaning + * all transfers have been done for a request. It's important to call + * this function before end_that_request_last(), as that will put the + * request back on the free list thus corrupting the internal tag list. + * + * Notes: + * queue lock must be held. + **/ +void blk_queue_end_tag(struct request_queue *q, struct request *rq) +{ + struct blk_queue_tag *bqt = q->queue_tags; + int tag = rq->tag; + + BUG_ON(tag == -1); + + if (unlikely(tag >= bqt->real_max_depth)) + /* + * This can happen after tag depth has been reduced. + * FIXME: how about a warning or info message here? + */ + return; + + list_del_init(&rq->queuelist); + rq->cmd_flags &= ~REQ_QUEUED; + rq->tag = -1; + + if (unlikely(bqt->tag_index[tag] == NULL)) + printk(KERN_ERR "%s: tag %d is missing\n", + __FUNCTION__, tag); + + bqt->tag_index[tag] = NULL; + + if (unlikely(!test_bit(tag, bqt->tag_map))) { + printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", + __FUNCTION__, tag); + return; + } + /* + * The tag_map bit acts as a lock for tag_index[bit], so we need + * unlock memory barrier semantics. + */ + clear_bit_unlock(tag, bqt->tag_map); + bqt->busy--; +} + +EXPORT_SYMBOL(blk_queue_end_tag); + +/** + * blk_queue_start_tag - find a free tag and assign it + * @q: the request queue for the device + * @rq: the block request that needs tagging + * + * Description: + * This can either be used as a stand-alone helper, or possibly be + * assigned as the queue &prep_rq_fn (in which case &struct request + * automagically gets a tag assigned). Note that this function + * assumes that any type of request can be queued! if this is not + * true for your device, you must check the request type before + * calling this function. The request will also be removed fro |