aboutsummaryrefslogtreecommitdiff
path: root/block/blk-core.c
diff options
context:
space:
mode:
authorJens Axboe <jens.axboe@oracle.com>2008-01-29 14:49:21 +0100
committerJens Axboe <jens.axboe@oracle.com>2008-01-29 21:55:05 +0100
commita168ee84c90b39ece357da127ab388f2f64db19c (patch)
treeacb5527d9835c06e4eb2c308850e74db857f7d64 /block/blk-core.c
parent9bf722598fcd51073974850ae026b44389430ecc (diff)
block: first step of splitting ll_rw_blk, rename it
Then we retain history in blk-core.c Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'block/blk-core.c')
-rw-r--r--block/blk-core.c4457
1 files changed, 4457 insertions, 0 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
new file mode 100644
index 00000000000..1932a56f5e4
--- /dev/null
+++ b/block/blk-core.c
@@ -0,0 +1,4457 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
+ * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
+ * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
+ * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
+ */
+
+/*
+ * This handles all read/write requests to block devices
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blktrace_api.h>
+#include <linux/fault-inject.h>
+#include <linux/scatterlist.h>
+
+/*
+ * for max sense size
+ */
+#include <scsi/scsi_cmnd.h>
+
+static void blk_unplug_work(struct work_struct *work);
+static void blk_unplug_timeout(unsigned long data);
+static void drive_stat_acct(struct request *rq, int new_io);
+static void init_request_from_bio(struct request *req, struct bio *bio);
+static int __make_request(struct request_queue *q, struct bio *bio);
+static struct io_context *current_io_context(gfp_t gfp_flags, int node);
+static void blk_recalc_rq_segments(struct request *rq);
+static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
+ struct bio *bio);
+
+/*
+ * For the allocated request tables
+ */
+static struct kmem_cache *request_cachep;
+
+/*
+ * For queue allocation
+ */
+static struct kmem_cache *requestq_cachep;
+
+/*
+ * For io context allocations
+ */
+static struct kmem_cache *iocontext_cachep;
+
+/*
+ * Controlling structure to kblockd
+ */
+static struct workqueue_struct *kblockd_workqueue;
+
+unsigned long blk_max_low_pfn, blk_max_pfn;
+
+EXPORT_SYMBOL(blk_max_low_pfn);
+EXPORT_SYMBOL(blk_max_pfn);
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+/* Amount of time in which a process may batch requests */
+#define BLK_BATCH_TIME (HZ/50UL)
+
+/* Number of requests a "batching" process may submit */
+#define BLK_BATCH_REQ 32
+
+/*
+ * Return the threshold (number of used requests) at which the queue is
+ * considered to be congested. It include a little hysteresis to keep the
+ * context switch rate down.
+ */
+static inline int queue_congestion_on_threshold(struct request_queue *q)
+{
+ return q->nr_congestion_on;
+}
+
+/*
+ * The threshold at which a queue is considered to be uncongested
+ */
+static inline int queue_congestion_off_threshold(struct request_queue *q)
+{
+ return q->nr_congestion_off;
+}
+
+static void blk_queue_congestion_threshold(struct request_queue *q)
+{
+ int nr;
+
+ nr = q->nr_requests - (q->nr_requests / 8) + 1;
+ if (nr > q->nr_requests)
+ nr = q->nr_requests;
+ q->nr_congestion_on = nr;
+
+ nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
+ if (nr < 1)
+ nr = 1;
+ q->nr_congestion_off = nr;
+}
+
+/**
+ * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
+ * @bdev: device
+ *
+ * Locates the passed device's request queue and returns the address of its
+ * backing_dev_info
+ *
+ * Will return NULL if the request queue cannot be located.
+ */
+struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
+{
+ struct backing_dev_info *ret = NULL;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q)
+ ret = &q->backing_dev_info;
+ return ret;
+}
+EXPORT_SYMBOL(blk_get_backing_dev_info);
+
+/**
+ * blk_queue_prep_rq - set a prepare_request function for queue
+ * @q: queue
+ * @pfn: prepare_request function
+ *
+ * It's possible for a queue to register a prepare_request callback which
+ * is invoked before the request is handed to the request_fn. The goal of
+ * the function is to prepare a request for I/O, it can be used to build a
+ * cdb from the request data for instance.
+ *
+ */
+void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
+{
+ q->prep_rq_fn = pfn;
+}
+
+EXPORT_SYMBOL(blk_queue_prep_rq);
+
+/**
+ * blk_queue_merge_bvec - set a merge_bvec function for queue
+ * @q: queue
+ * @mbfn: merge_bvec_fn
+ *
+ * Usually queues have static limitations on the max sectors or segments that
+ * we can put in a request. Stacking drivers may have some settings that
+ * are dynamic, and thus we have to query the queue whether it is ok to
+ * add a new bio_vec to a bio at a given offset or not. If the block device
+ * has such limitations, it needs to register a merge_bvec_fn to control
+ * the size of bio's sent to it. Note that a block device *must* allow a
+ * single page to be added to an empty bio. The block device driver may want
+ * to use the bio_split() function to deal with these bio's. By default
+ * no merge_bvec_fn is defined for a queue, and only the fixed limits are
+ * honored.
+ */
+void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
+{
+ q->merge_bvec_fn = mbfn;
+}
+
+EXPORT_SYMBOL(blk_queue_merge_bvec);
+
+void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
+{
+ q->softirq_done_fn = fn;
+}
+
+EXPORT_SYMBOL(blk_queue_softirq_done);
+
+/**
+ * blk_queue_make_request - define an alternate make_request function for a device
+ * @q: the request queue for the device to be affected
+ * @mfn: the alternate make_request function
+ *
+ * Description:
+ * The normal way for &struct bios to be passed to a device
+ * driver is for them to be collected into requests on a request
+ * queue, and then to allow the device driver to select requests
+ * off that queue when it is ready. This works well for many block
+ * devices. However some block devices (typically virtual devices
+ * such as md or lvm) do not benefit from the processing on the
+ * request queue, and are served best by having the requests passed
+ * directly to them. This can be achieved by providing a function
+ * to blk_queue_make_request().
+ *
+ * Caveat:
+ * The driver that does this *must* be able to deal appropriately
+ * with buffers in "highmemory". This can be accomplished by either calling
+ * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
+ * blk_queue_bounce() to create a buffer in normal memory.
+ **/
+void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
+{
+ /*
+ * set defaults
+ */
+ q->nr_requests = BLKDEV_MAX_RQ;
+ blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
+ blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+ q->make_request_fn = mfn;
+ q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ q->backing_dev_info.state = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+ blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
+ blk_queue_hardsect_size(q, 512);
+ blk_queue_dma_alignment(q, 511);
+ blk_queue_congestion_threshold(q);
+ q->nr_batching = BLK_BATCH_REQ;
+
+ q->unplug_thresh = 4; /* hmm */
+ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
+ if (q->unplug_delay == 0)
+ q->unplug_delay = 1;
+
+ INIT_WORK(&q->unplug_work, blk_unplug_work);
+
+ q->unplug_timer.function = blk_unplug_timeout;
+ q->unplug_timer.data = (unsigned long)q;
+
+ /*
+ * by default assume old behaviour and bounce for any highmem page
+ */
+ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
+}
+
+EXPORT_SYMBOL(blk_queue_make_request);
+
+static void rq_init(struct request_queue *q, struct request *rq)
+{
+ INIT_LIST_HEAD(&rq->queuelist);
+ INIT_LIST_HEAD(&rq->donelist);
+
+ rq->errors = 0;
+ rq->bio = rq->biotail = NULL;
+ INIT_HLIST_NODE(&rq->hash);
+ RB_CLEAR_NODE(&rq->rb_node);
+ rq->ioprio = 0;
+ rq->buffer = NULL;
+ rq->ref_count = 1;
+ rq->q = q;
+ rq->special = NULL;
+ rq->data_len = 0;
+ rq->data = NULL;
+ rq->nr_phys_segments = 0;
+ rq->sense = NULL;
+ rq->end_io = NULL;
+ rq->end_io_data = NULL;
+ rq->completion_data = NULL;
+ rq->next_rq = NULL;
+}
+
+/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q: the request queue
+ * @ordered: one of QUEUE_ORDERED_*
+ * @prepare_flush_fn: rq setup helper for cache flush ordered writes
+ *
+ * Description:
+ * For journalled file systems, doing ordered writes on a commit
+ * block instead of explicitly doing wait_on_buffer (which is bad
+ * for performance) can be a big win. Block drivers supporting this
+ * feature should call this function and indicate so.
+ *
+ **/
+int blk_queue_ordered(struct request_queue *q, unsigned ordered,
+ prepare_flush_fn *prepare_flush_fn)
+{
+ if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+ prepare_flush_fn == NULL) {
+ printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+ return -EINVAL;
+ }
+
+ if (ordered != QUEUE_ORDERED_NONE &&
+ ordered != QUEUE_ORDERED_DRAIN &&
+ ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+ ordered != QUEUE_ORDERED_DRAIN_FUA &&
+ ordered != QUEUE_ORDERED_TAG &&
+ ordered != QUEUE_ORDERED_TAG_FLUSH &&
+ ordered != QUEUE_ORDERED_TAG_FUA) {
+ printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+ return -EINVAL;
+ }
+
+ q->ordered = ordered;
+ q->next_ordered = ordered;
+ q->prepare_flush_fn = prepare_flush_fn;
+
+ return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/*
+ * Cache flushing for ordered writes handling
+ */
+inline unsigned blk_ordered_cur_seq(struct request_queue *q)
+{
+ if (!q->ordseq)
+ return 0;
+ return 1 << ffz(q->ordseq);
+}
+
+unsigned blk_ordered_req_seq(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ BUG_ON(q->ordseq == 0);
+
+ if (rq == &q->pre_flush_rq)
+ return QUEUE_ORDSEQ_PREFLUSH;
+ if (rq == &q->bar_rq)
+ return QUEUE_ORDSEQ_BAR;
+ if (rq == &q->post_flush_rq)
+ return QUEUE_ORDSEQ_POSTFLUSH;
+
+ /*
+ * !fs requests don't need to follow barrier ordering. Always
+ * put them at the front. This fixes the following deadlock.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/537473
+ */
+ if (!blk_fs_request(rq))
+ return QUEUE_ORDSEQ_DRAIN;
+
+ if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
+ (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
+ return QUEUE_ORDSEQ_DRAIN;
+ else
+ return QUEUE_ORDSEQ_DONE;
+}
+
+void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+{
+ struct request *rq;
+
+ if (error && !q->orderr)
+ q->orderr = error;
+
+ BUG_ON(q->ordseq & seq);
+ q->ordseq |= seq;
+
+ if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+ return;
+
+ /*
+ * Okay, sequence complete.
+ */
+ q->ordseq = 0;
+ rq = q->orig_bar_rq;
+
+ if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
+ BUG();
+}
+
+static void pre_flush_end_io(struct request *rq, int error)
+{
+ elv_completed_request(rq->q, rq);
+ blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
+
+static void bar_end_io(struct request *rq, int error)
+{
+ elv_completed_request(rq->q, rq);
+ blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
+
+static void post_flush_end_io(struct request *rq, int error)
+{
+ elv_completed_request(rq->q, rq);
+ blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
+
+static void queue_flush(struct request_queue *q, unsigned which)
+{
+ struct request *rq;
+ rq_end_io_fn *end_io;
+
+ if (which == QUEUE_ORDERED_PREFLUSH) {
+ rq = &q->pre_flush_rq;
+ end_io = pre_flush_end_io;
+ } else {
+ rq = &q->post_flush_rq;
+ end_io = post_flush_end_io;
+ }
+
+ rq->cmd_flags = REQ_HARDBARRIER;
+ rq_init(q, rq);
+ rq->elevator_private = NULL;
+ rq->elevator_private2 = NULL;
+ rq->rq_disk = q->bar_rq.rq_disk;
+ rq->end_io = end_io;
+ q->prepare_flush_fn(q, rq);
+
+ elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+}
+
+static inline struct request *start_ordered(struct request_queue *q,
+ struct request *rq)
+{
+ q->orderr = 0;
+ q->ordered = q->next_ordered;
+ q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+ /*
+ * Prep proxy barrier request.
+ */
+ blkdev_dequeue_request(rq);
+ q->orig_bar_rq = rq;
+ rq = &q->bar_rq;
+ rq->cmd_flags = 0;
+ rq_init(q, rq);
+ if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
+ rq->cmd_flags |= REQ_RW;
+ if (q->ordered & QUEUE_ORDERED_FUA)
+ rq->cmd_flags |= REQ_FUA;
+ rq->elevator_private = NULL;
+ rq->elevator_private2 = NULL;
+ init_request_from_bio(rq, q->orig_bar_rq->bio);
+ rq->end_io = bar_end_io;
+
+ /*
+ * Queue ordered sequence. As we stack them at the head, we
+ * need to queue in reverse order. Note that we rely on that
+ * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
+ * request gets inbetween ordered sequence. If this request is
+ * an empty barrier, we don't need to do a postflush ever since
+ * there will be no data written between the pre and post flush.
+ * Hence a single flush will suffice.
+ */
+ if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
+ queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+ else
+ q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+ elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+
+ if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+ queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+ rq = &q->pre_flush_rq;
+ } else
+ q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
+
+ if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+ q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+ else
+ rq = NULL;
+
+ return rq;
+}
+
+int blk_do_ordered(struct request_queue *q, struct request **rqp)
+{
+ struct request *rq = *rqp;
+ const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+
+ if (!q->ordseq) {
+ if (!is_barrier)
+ return 1;
+
+ if (q->next_ordered != QUEUE_ORDERED_NONE) {
+ *rqp = start_ordered(q, rq);
+ return 1;
+ } else {
+ /*
+ * This can happen when the queue switches to
+ * ORDERED_NONE while this request is on it.
+ */
+ blkdev_dequeue_request(rq);
+ if (__blk_end_request(rq, -EOPNOTSUPP,
+ blk_rq_bytes(rq)))
+ BUG();
+ *rqp = NULL;
+ return 0;
+ }
+ }
+
+ /*
+ * Ordered sequence in progress
+ */
+
+ /* Special requests are not subject to ordering rules. */
+ if (!blk_fs_request(rq) &&
+ rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
+ return 1;
+
+ if (q->ordered & QUEUE_ORDERED_TAG) {
+ /* Ordered by tag. Blocking the next barrier is enough. */
+ if (is_barrier && rq != &q->bar_rq)
+ *rqp = NULL;
+ } else {
+ /* Ordered by draining. Wait for turn. */
+ WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
+ if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
+ *rqp = NULL;
+ }
+
+ return 1;
+}
+
+static void req_bio_endio(struct request *rq, struct bio *bio,
+ unsigned int nbytes, int error)
+{
+ struct request_queue *q = rq->q;
+
+ if (&q->bar_rq != rq) {
+ if (error)
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ error = -EIO;
+
+ if (unlikely(nbytes > bio->bi_size)) {
+ printk("%s: want %u bytes done, only %u left\n",
+ __FUNCTION__, nbytes, bio->bi_size);
+ nbytes = bio->bi_size;
+ }
+
+ bio->bi_size -= nbytes;
+ bio->bi_sector += (nbytes >> 9);
+ if (bio->bi_size == 0)
+ bio_endio(bio, error);
+ } else {
+
+ /*
+ * Okay, this is the barrier request in progress, just
+ * record the error;
+ */
+ if (error && !q->orderr)
+ q->orderr = error;
+ }
+}
+
+/**
+ * blk_queue_bounce_limit - set bounce buffer limit for queue
+ * @q: the request queue for the device
+ * @dma_addr: bus address limit
+ *
+ * Description:
+ * Different hardware can have different requirements as to what pages
+ * it can do I/O directly to. A low level driver can call
+ * blk_queue_bounce_limit to have lower memory pages allocated as bounce
+ * buffers for doing I/O to pages residing above @page.
+ **/
+void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
+{
+ unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
+ int dma = 0;
+
+ q->bounce_gfp = GFP_NOIO;
+#if BITS_PER_LONG == 64
+ /* Assume anything <= 4GB can be handled by IOMMU.
+ Actually some IOMMUs can handle everything, but I don't
+ know of a way to test this here. */
+ if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
+ dma = 1;
+ q->bounce_pfn = max_low_pfn;
+#else
+ if (bounce_pfn < blk_max_low_pfn)
+ dma = 1;
+ q->bounce_pfn = bounce_pfn;
+#endif
+ if (dma) {
+ init_emergency_isa_pool();
+ q->bounce_gfp = GFP_NOIO | GFP_DMA;
+ q->bounce_pfn = bounce_pfn;
+ }
+}
+
+EXPORT_SYMBOL(blk_queue_bounce_limit);
+
+/**
+ * blk_queue_max_sectors - set max sectors for a request for this queue
+ * @q: the request queue for the device
+ * @max_sectors: max sectors in the usual 512b unit
+ *
+ * Description:
+ * Enables a low level driver to set an upper limit on the size of
+ * received requests.
+ **/
+void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
+{
+ if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
+ max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+ printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
+ }
+
+ if (BLK_DEF_MAX_SECTORS > max_sectors)
+ q->max_hw_sectors = q->max_sectors = max_sectors;
+ else {
+ q->max_sectors = BLK_DEF_MAX_SECTORS;
+ q->max_hw_sectors = max_sectors;
+ }
+}
+
+EXPORT_SYMBOL(blk_queue_max_sectors);
+
+/**
+ * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * @q: the request queue for the device
+ * @max_segments: max number of segments
+ *
+ * Description:
+ * Enables a low level driver to set an upper limit on the number of
+ * physical data segments in a request. This would be the largest sized
+ * scatter list the driver could handle.
+ **/
+void blk_queue_max_phys_segments(struct request_queue *q,
+ unsigned short max_segments)
+{
+ if (!max_segments) {
+ max_segments = 1;
+ printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
+ }
+
+ q->max_phys_segments = max_segments;
+}
+
+EXPORT_SYMBOL(blk_queue_max_phys_segments);
+
+/**
+ * blk_queue_max_hw_segments - set max hw segments for a request for this queue
+ * @q: the request queue for the device
+ * @max_segments: max number of segments
+ *
+ * Description:
+ * Enables a low level driver to set an upper limit on the number of
+ * hw data segments in a request. This would be the largest number of
+ * address/length pairs the host adapter can actually give as once
+ * to the device.
+ **/
+void blk_queue_max_hw_segments(struct request_queue *q,
+ unsigned short max_segments)
+{
+ if (!max_segments) {
+ max_segments = 1;
+ printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
+ }
+
+ q->max_hw_segments = max_segments;
+}
+
+EXPORT_SYMBOL(blk_queue_max_hw_segments);
+
+/**
+ * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
+ * @q: the request queue for the device
+ * @max_size: max size of segment in bytes
+ *
+ * Description:
+ * Enables a low level driver to set an upper limit on the size of a
+ * coalesced segment
+ **/
+void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
+{
+ if (max_size < PAGE_CACHE_SIZE) {
+ max_size = PAGE_CACHE_SIZE;
+ printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
+ }
+
+ q->max_segment_size = max_size;
+}
+
+EXPORT_SYMBOL(blk_queue_max_segment_size);
+
+/**
+ * blk_queue_hardsect_size - set hardware sector size for the queue
+ * @q: the request queue for the device
+ * @size: the hardware sector size, in bytes
+ *
+ * Description:
+ * This should typically be set to the lowest possible sector size
+ * that the hardware can operate on (possible without reverting to
+ * even internal read-modify-write operations). Usually the default
+ * of 512 covers most hardware.
+ **/
+void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
+{
+ q->hardsect_size = size;
+}
+
+EXPORT_SYMBOL(blk_queue_hardsect_size);
+
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ */
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+
+/**
+ * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
+ * @t: the stacking driver (top)
+ * @b: the underlying device (bottom)
+ **/
+void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
+{
+ /* zero is "infinity" */
+ t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
+ t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
+
+ t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
+ t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
+ t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
+ t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
+ if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+ clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_stack_limits);
+
+/**
+ * blk_queue_dma_drain - Set up a drain buffer for excess dma.
+ *
+ * @q: the request queue for the device
+ * @buf: physically contiguous buffer
+ * @size: size of the buffer in bytes
+ *
+ * Some devices have excess DMA problems and can't simply discard (or
+ * zero fill) the unwanted piece of the transfer. They have to have a
+ * real area of memory to transfer it into. The use case for this is
+ * ATAPI devices in DMA mode. If the packet command causes a transfer
+ * bigger than the transfer size some HBAs will lock up if there
+ * aren't DMA elements to contain the excess transfer. What this API
+ * does is adjust the queue so that the buf is always appended
+ * silently to the scatterlist.
+ *
+ * Note: This routine adjusts max_hw_segments to make room for
+ * appending the drain buffer. If you call
+ * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after
+ * calling this routine, you must set the limit to one fewer than your
+ * device can support otherwise there won't be room for the drain
+ * buffer.
+ */
+int blk_queue_dma_drain(struct request_queue *q, void *buf,
+ unsigned int size)
+{
+ if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
+ return -EINVAL;
+ /* make room for appending the drain */
+ --q->max_hw_segments;
+ --q->max_phys_segments;
+ q->dma_drain_buffer = buf;
+ q->dma_drain_size = size;
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
+
+/**
+ * blk_queue_segment_boundary - set boundary rules for segment merging
+ * @q: the request queue for the device
+ * @mask: the memory boundary mask
+ **/
+void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
+{
+ if (mask < PAGE_CACHE_SIZE - 1) {
+ mask = PAGE_CACHE_SIZE - 1;
+ printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
+ }
+
+ q->seg_boundary_mask = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_segment_boundary);
+
+/**
+ * blk_queue_dma_alignment - set dma length and memory alignment
+ * @q: the request queue for the device
+ * @mask: alignment mask
+ *
+ * description:
+ * set required memory and length aligment for direct dma transactions.
+ * this is used when buiding direct io requests for the queue.
+ *
+ **/
+void blk_queue_dma_alignment(struct request_queue *q, int mask)
+{
+ q->dma_alignment = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_dma_alignment);
+
+/**
+ * blk_queue_update_dma_alignment - update dma length and memory alignment
+ * @q: the request queue for the device
+ * @mask: alignment mask
+ *
+ * description:
+ * update required memory and length aligment for direct dma transactions.
+ * If the requested alignment is larger than the current alignment, then
+ * the current queue alignment is updated to the new value, otherwise it
+ * is left alone. The design of this is to allow multiple objects
+ * (driver, device, transport etc) to set their respective
+ * alignments without having them interfere.
+ *
+ **/
+void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
+{
+ BUG_ON(mask > PAGE_SIZE);
+
+ if (mask > q->dma_alignment)
+ q->dma_alignment = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_update_dma_alignment);
+
+/**
+ * blk_queue_find_tag - find a request by its tag and queue
+ * @q: The request queue for the device
+ * @tag: The tag of the request
+ *
+ * Notes:
+ * Should be used when a device returns a tag and you want to match
+ * it with a request.
+ *
+ * no locks need be held.
+ **/
+struct request *blk_queue_find_tag(struct request_queue *q, int tag)
+{
+ return blk_map_queue_find_tag(q->queue_tags, tag);
+}
+
+EXPORT_SYMBOL(blk_queue_find_tag);
+
+/**
+ * __blk_free_tags - release a given set of tag maintenance info
+ * @bqt: the tag map to free
+ *
+ * Tries to free the specified @bqt@. Returns true if it was
+ * actually freed and false if there are still references using it
+ */
+static int __blk_free_tags(struct blk_queue_tag *bqt)
+{
+ int retval;
+
+ retval = atomic_dec_and_test(&bqt->refcnt);
+ if (retval) {
+ BUG_ON(bqt->busy);
+
+ kfree(bqt->tag_index);
+ bqt->tag_index = NULL;
+
+ kfree(bqt->tag_map);
+ bqt->tag_map = NULL;
+
+ kfree(bqt);
+
+ }
+
+ return retval;
+}
+
+/**
+ * __blk_queue_free_tags - release tag maintenance info
+ * @q: the request queue for the device
+ *
+ * Notes:
+ * blk_cleanup_queue() will take care of calling this function, if tagging
+ * has been used. So there's no need to call this directly.
+ **/
+static void __blk_queue_free_tags(struct request_queue *q)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+
+ if (!bqt)
+ return;
+
+ __blk_free_tags(bqt);
+
+ q->queue_tags = NULL;
+ q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
+}
+
+
+/**
+ * blk_free_tags - release a given set of tag maintenance info
+ * @bqt: the tag map to free
+ *
+ * For externally managed @bqt@ frees the map. Callers of this
+ * function must guarantee to have released all the queues that
+ * might have been using this tag map.
+ */
+void blk_free_tags(struct blk_queue_tag *bqt)
+{
+ if (unlikely(!__blk_free_tags(bqt)))
+ BUG();
+}
+EXPORT_SYMBOL(blk_free_tags);
+
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q: the request queue for the device
+ *
+ * Notes:
+ * This is used to disabled tagged queuing to a device, yet leave
+ * queue in function.
+ **/
+void blk_queue_free_tags(struct request_queue *q)
+{
+ clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_free_tags);
+
+static int
+init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
+{
+ struct request **tag_index;
+ unsigned long *tag_map;
+ int nr_ulongs;
+
+ if (q && depth > q->nr_requests * 2) {
+ depth = q->nr_requests * 2;
+ printk(KERN_ERR "%s: adjusted depth to %d\n",
+ __FUNCTION__, depth);
+ }
+
+ tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+ if (!tag_index)
+ goto fail;
+
+ nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
+ tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
+ if (!tag_map)
+ goto fail;
+
+ tags->real_max_depth = depth;
+ tags->max_depth = depth;
+ tags->tag_index = tag_index;
+ tags->tag_map = tag_map;
+
+ return 0;
+fail:
+ kfree(tag_index);
+ return -ENOMEM;
+}
+
+static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
+ int depth)
+{
+ struct blk_queue_tag *tags;
+
+ tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
+ if (!tags)
+ goto fail;
+
+ if (init_tag_map(q, tags, depth))
+ goto fail;
+
+ tags->busy = 0;
+ atomic_set(&tags->refcnt, 1);
+ return tags;
+fail:
+ kfree(tags);
+ return NULL;
+}
+
+/**
+ * blk_init_tags - initialize the tag info for an external tag map
+ * @depth: the maximum queue depth supported
+ * @tags: the tag to use
+ **/
+struct blk_queue_tag *blk_init_tags(int depth)
+{
+ return __blk_queue_init_tags(NULL, depth);
+}
+EXPORT_SYMBOL(blk_init_tags);
+
+/**
+ * blk_queue_init_tags - initialize the queue tag info
+ * @q: the request queue for the device
+ * @depth: the maximum queue depth supported
+ * @tags: the tag to use
+ **/
+int blk_queue_init_tags(struct request_queue *q, int depth,
+ struct blk_queue_tag *tags)
+{
+ int rc;
+
+ BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+ if (!tags && !q->queue_tags) {
+ tags = __blk_queue_init_tags(q, depth);
+
+ if (!tags)
+ goto fail;
+ } else if (q->queue_tags) {
+ if ((rc = blk_queue_resize_tags(q, depth)))
+ return rc;
+ set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+ return 0;
+ } else
+ atomic_inc(&tags->refcnt);
+
+ /*
+ * assign it, all done
+ */
+ q->queue_tags = tags;
+ q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
+ INIT_LIST_HEAD(&q->tag_busy_list);
+ return 0;
+fail:
+ kfree(tags);
+ return -ENOMEM;
+}
+
+EXPORT_SYMBOL(blk_queue_init_tags);
+
+/**
+ * blk_queue_resize_tags - change the queueing depth
+ * @q: the request queue for the device
+ * @new_depth: the new max command queueing depth
+ *
+ * Notes:
+ * Must be called with the queue lock held.
+ **/
+int blk_queue_resize_tags(struct request_queue *q, int new_depth)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+ struct request **tag_index;
+ unsigned long *tag_map;
+ int max_depth, nr_ulongs;
+
+ if (!bqt)
+ return -ENXIO;
+
+ /*
+ * if we already have large enough real_max_depth. just
+ * adjust max_depth. *NOTE* as requests with tag value
+ * between new_depth and real_max_depth can be in-flight, tag
+ * map can not be shrunk blindly here.
+ */
+ if (new_depth <= bqt->real_max_depth) {
+ bqt->max_depth = new_depth;
+ return 0;
+ }
+
+ /*
+ * Currently cannot replace a shared tag map with a new
+ * one, so error out if this is the case
+ */
+ if (atomic_read(&bqt->refcnt) != 1)
+ return -EBUSY;
+
+ /*
+ * save the old state info, so we can copy it back
+ */
+ tag_index = bqt->tag_index;
+ tag_map = bqt->tag_map;
+ max_depth = bqt->real_max_depth;
+
+ if (init_tag_map(q, bqt, new_depth))
+ return -ENOMEM;
+
+ memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
+ nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
+ memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
+
+ kfree(tag_index);
+ kfree(tag_map);
+ return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_resize_tags);
+
+/**
+ * blk_queue_end_tag - end tag operations for a request
+ * @q: the request queue for the device
+ * @rq: the request that has completed
+ *
+ * Description:
+ * Typically called when end_that_request_first() returns 0, meaning
+ * all transfers have been done for a request. It's important to call
+ * this function before end_that_request_last(), as that will put the
+ * request back on the free list thus corrupting the internal tag list.
+ *
+ * Notes:
+ * queue lock must be held.
+ **/
+void blk_queue_end_tag(struct request_queue *q, struct request *rq)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+ int tag = rq->tag;
+
+ BUG_ON(tag == -1);
+
+ if (unlikely(tag >= bqt->real_max_depth))
+ /*
+ * This can happen after tag depth has been reduced.
+ * FIXME: how about a warning or info message here?
+ */
+ return;
+
+ list_del_init(&rq->queuelist);
+ rq->cmd_flags &= ~REQ_QUEUED;
+ rq->tag = -1;
+
+ if (unlikely(bqt->tag_index[tag] == NULL))
+ printk(KERN_ERR "%s: tag %d is missing\n",
+ __FUNCTION__, tag);
+
+ bqt->tag_index[tag] = NULL;
+
+ if (unlikely(!test_bit(tag, bqt->tag_map))) {
+ printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
+ __FUNCTION__, tag);
+ return;
+ }
+ /*
+ * The tag_map bit acts as a lock for tag_index[bit], so we need
+ * unlock memory barrier semantics.
+ */
+ clear_bit_unlock(tag, bqt->tag_map);
+ bqt->busy--;
+}
+
+EXPORT_SYMBOL(blk_queue_end_tag);
+
+/**
+ * blk_queue_start_tag - find a free tag and assign it
+ * @q: the request queue for the device
+ * @rq: the block request that needs tagging
+ *
+ * Description:
+ * This can either be used as a stand-alone helper, or possibly be
+ * assigned as the queue &prep_rq_fn (in which case &struct request
+ * automagically gets a tag assigned). Note that this function
+ * assumes that any type of request can be queued! if this is not
+ * true for your device, you must check the request type before
+ * calling this function. The request will also be removed fro