aboutsummaryrefslogtreecommitdiff
path: root/block/ll_rw_blk.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@g5.osdl.org>2005-11-07 08:32:39 -0800
committerLinus Torvalds <torvalds@g5.osdl.org>2005-11-07 08:32:39 -0800
commit333c47c847c90aaefde8b593054d9344106333b5 (patch)
treea4aec7b18ffe8d8dd88e027e5e4d84b2d838fe8a /block/ll_rw_blk.c
parent8f0cb147b2fb12427bf6abef7fed2b604557a41e (diff)
parentc6ea2ba7b8acdb6c4a883b2d38607c8078dff4ee (diff)
Merge branch 'block-dir' of git://brick.kernel.dk/data/git/linux-2.6-block
Diffstat (limited to 'block/ll_rw_blk.c')
-rw-r--r--block/ll_rw_blk.c3612
1 files changed, 3612 insertions, 0 deletions
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
new file mode 100644
index 00000000000..5f52e30b43f
--- /dev/null
+++ b/block/ll_rw_blk.c
@@ -0,0 +1,3612 @@
+/*
+ * linux/drivers/block/ll_rw_blk.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
+ * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
+ * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
+ * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
+ */
+
+/*
+ * This handles all read/write requests to block devices
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+
+/*
+ * for max sense size
+ */
+#include <scsi/scsi_cmnd.h>
+
+/*
+ * Forward declarations of static helpers that are referenced before their
+ * definitions (e.g. blk_queue_make_request() wires up the unplug handlers).
+ */
+static void blk_unplug_work(void *data);
+static void blk_unplug_timeout(unsigned long data);
+static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
+
+/*
+ * For the allocated request tables (a queue's flush_rq is also allocated
+ * from and returned to this cache, see blk_queue_ordered())
+ */
+static kmem_cache_t *request_cachep;
+
+/*
+ * For queue allocation
+ */
+static kmem_cache_t *requestq_cachep;
+
+/*
+ * For io context allocations
+ */
+static kmem_cache_t *iocontext_cachep;
+
+/*
+ * Wait queues that clear_queue_congested() wakes when a queue leaves
+ * congestion; indexed by the rw direction passed to that function.
+ */
+static wait_queue_head_t congestion_wqh[2] = {
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+ };
+
+/*
+ * Controlling structure to kblockd
+ */
+static struct workqueue_struct *kblockd_workqueue;
+
+/*
+ * Exported page frame number limits. blk_queue_bounce_limit() compares the
+ * requested bounce pfn against blk_max_low_pfn.
+ */
+unsigned long blk_max_low_pfn, blk_max_pfn;
+
+EXPORT_SYMBOL(blk_max_low_pfn);
+EXPORT_SYMBOL(blk_max_pfn);
+
+/* Amount of time in which a process may batch requests (HZ/50 jiffies) */
+#define BLK_BATCH_TIME (HZ/50UL)
+
+/*
+ * Number of requests a "batching" process may submit; used as the default
+ * for q->nr_batching in blk_queue_make_request()
+ */
+#define BLK_BATCH_REQ 32
+
+/*
+ * Return the threshold (number of used requests) at which the queue is
+ * considered to be congested. It includes a little hysteresis to keep the
+ * context switch rate down. The value itself is precomputed and stored in
+ * q->nr_congestion_on by blk_queue_congestion_threshold().
+ */
+static inline int queue_congestion_on_threshold(struct request_queue *q)
+{
+ return q->nr_congestion_on;
+}
+
+/*
+ * The threshold at which a queue is considered to be uncongested. Returns
+ * q->nr_congestion_off as precomputed by blk_queue_congestion_threshold().
+ */
+static inline int queue_congestion_off_threshold(struct request_queue *q)
+{
+ return q->nr_congestion_off;
+}
+
+/*
+ * Recompute the congestion on/off thresholds from q->nr_requests.
+ *
+ * "on" is nr_requests minus one eighth plus one, capped at nr_requests.
+ * "off" is nr_requests minus one eighth minus one sixteenth minus one,
+ * with a floor of 1.
+ */
+static void blk_queue_congestion_threshold(struct request_queue *q)
+{
+	int on_thresh, off_thresh;
+
+	on_thresh = q->nr_requests - (q->nr_requests / 8) + 1;
+	if (on_thresh > q->nr_requests)
+		on_thresh = q->nr_requests;
+
+	off_thresh = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
+	if (off_thresh < 1)
+		off_thresh = 1;
+
+	q->nr_congestion_on = on_thresh;
+	q->nr_congestion_off = off_thresh;
+}
+
+/*
+ * A queue has just left congestion for direction @rw. Clear the matching
+ * congested bit in the queue's backing_dev_info state and wake up anyone
+ * sleeping on the congestion wait queue for that direction.
+ */
+static void clear_queue_congested(request_queue_t *q, int rw)
+{
+	wait_queue_head_t *waiters = &congestion_wqh[rw];
+	enum bdi_state flag;
+
+	if (rw == WRITE)
+		flag = BDI_write_congested;
+	else
+		flag = BDI_read_congested;
+
+	clear_bit(flag, &q->backing_dev_info.state);
+	/* barrier between clearing the bit and inspecting the wait queue */
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(waiters))
+		wake_up(waiters);
+}
+
+/*
+ * A queue has just entered congestion for direction @rw. Flag that in the
+ * state word of the queue's backing_dev_info.
+ */
+static void set_queue_congested(request_queue_t *q, int rw)
+{
+	enum bdi_state flag;
+
+	if (rw == WRITE)
+		flag = BDI_write_congested;
+	else
+		flag = BDI_read_congested;
+
+	set_bit(flag, &q->backing_dev_info.state);
+}
+
+/**
+ * blk_get_backing_dev_info - look up the backing_dev_info of a block device
+ * @bdev: device
+ *
+ * Resolves @bdev to its request queue via bdev_get_queue() and hands back
+ * the address of the backing_dev_info embedded in that queue.
+ *
+ * Returns NULL when no request queue can be located for @bdev.
+ */
+struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
+{
+	request_queue_t *queue = bdev_get_queue(bdev);
+
+	if (!queue)
+		return NULL;
+
+	return &queue->backing_dev_info;
+}
+
+EXPORT_SYMBOL(blk_get_backing_dev_info);
+
+/**
+ * blk_queue_activity_fn - set the activity callback for a queue
+ * @q: queue
+ * @fn: callback stored in q->activity_fn (may be NULL)
+ * @data: opaque pointer stored in q->activity_data
+ *
+ * Only records the two values; blk_queue_make_request() calls this with
+ * NULL, NULL when it applies the queue defaults.
+ */
+void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
+{
+ q->activity_fn = fn;
+ q->activity_data = data;
+}
+
+EXPORT_SYMBOL(blk_queue_activity_fn);
+
+/**
+ * blk_queue_prep_rq - set a prepare_request function for queue
+ * @q: queue
+ * @pfn: prepare_request function, stored in q->prep_rq_fn
+ *
+ * It's possible for a queue to register a prepare_request callback which
+ * is invoked before the request is handed to the request_fn. The goal of
+ * the function is to prepare a request for I/O; it can be used to build a
+ * cdb from the request data, for instance.
+ *
+ */
+void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
+{
+ q->prep_rq_fn = pfn;
+}
+
+EXPORT_SYMBOL(blk_queue_prep_rq);
+
+/**
+ * blk_queue_merge_bvec - set a merge_bvec function for queue
+ * @q: queue
+ * @mbfn: merge_bvec_fn, stored in q->merge_bvec_fn
+ *
+ * Usually queues have static limitations on the max sectors or segments that
+ * we can put in a request. Stacking drivers may have some settings that
+ * are dynamic, and thus we have to query the queue whether it is ok to
+ * add a new bio_vec to a bio at a given offset or not. If the block device
+ * has such limitations, it needs to register a merge_bvec_fn to control
+ * the size of bios sent to it. Note that a block device *must* allow a
+ * single page to be added to an empty bio. The block device driver may want
+ * to use the bio_split() function to deal with these bios. By default
+ * no merge_bvec_fn is defined for a queue, and only the fixed limits are
+ * honored.
+ */
+void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
+{
+ q->merge_bvec_fn = mbfn;
+}
+
+EXPORT_SYMBOL(blk_queue_merge_bvec);
+
+/**
+ * blk_queue_make_request - define an alternate make_request function for a device
+ * @q: the request queue for the device to be affected
+ * @mfn: the alternate make_request function
+ *
+ * Description:
+ * The normal way for &struct bios to be passed to a device
+ * driver is for them to be collected into requests on a request
+ * queue, and then to allow the device driver to select requests
+ * off that queue when it is ready. This works well for many block
+ * devices. However some block devices (typically virtual devices
+ * such as md or lvm) do not benefit from the processing on the
+ * request queue, and are served best by having the requests passed
+ * directly to them. This can be achieved by providing a function
+ * to blk_queue_make_request().
+ *
+ * Besides installing @mfn, this resets the queue's limits, readahead,
+ * congestion thresholds, unplug parameters, bounce limit and activity
+ * callback to their defaults, as listed in the body below.
+ *
+ * Caveat:
+ * The driver that does this *must* be able to deal appropriately
+ * with buffers in "highmemory". This can be accomplished by either calling
+ * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
+ * blk_queue_bounce() to create a buffer in normal memory.
+ **/
+void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
+{
+ /*
+ * set defaults
+ */
+ q->nr_requests = BLKDEV_MAX_RQ;
+ blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
+ blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+ q->make_request_fn = mfn;
+ q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ q->backing_dev_info.state = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+ blk_queue_max_sectors(q, MAX_SECTORS);
+ blk_queue_hardsect_size(q, 512);
+ blk_queue_dma_alignment(q, 511);
+ /* must follow the nr_requests assignment above, which it reads */
+ blk_queue_congestion_threshold(q);
+ q->nr_batching = BLK_BATCH_REQ;
+
+ q->unplug_thresh = 4; /* hmm */
+ q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
+ /* the division rounds down to 0 when HZ is low; use at least 1 jiffy */
+ if (q->unplug_delay == 0)
+ q->unplug_delay = 1;
+
+ INIT_WORK(&q->unplug_work, blk_unplug_work, q);
+
+ /* only the handler and its argument are set here, the timer is not armed */
+ q->unplug_timer.function = blk_unplug_timeout;
+ q->unplug_timer.data = (unsigned long)q;
+
+ /*
+ * by default assume old behaviour and bounce for any highmem page
+ */
+ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
+
+ blk_queue_activity_fn(q, NULL, NULL);
+}
+
+EXPORT_SYMBOL(blk_queue_make_request);
+
+/*
+ * Reset the fields of @rq listed below to their initial values and bind the
+ * request to queue @q. Fields not listed here (for example flags,
+ * elevator_private, rq_disk and rl) are left untouched; callers such as
+ * blk_start_pre_flush() set those themselves after calling rq_init().
+ */
+static inline void rq_init(request_queue_t *q, struct request *rq)
+{
+ INIT_LIST_HEAD(&rq->queuelist);
+
+ rq->errors = 0;
+ rq->rq_status = RQ_ACTIVE;
+ rq->bio = rq->biotail = NULL;
+ rq->ioprio = 0;
+ rq->buffer = NULL;
+ rq->ref_count = 1;
+ rq->q = q;
+ rq->waiting = NULL;
+ rq->special = NULL;
+ rq->data_len = 0;
+ rq->data = NULL;
+ rq->nr_phys_segments = 0;
+ rq->sense = NULL;
+ rq->end_io = NULL;
+ rq->end_io_data = NULL;
+}
+
+/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q: the request queue
+ * @flag: one of QUEUE_ORDERED_NONE, QUEUE_ORDERED_TAG or
+ *        QUEUE_ORDERED_FLUSH, see below
+ *
+ * Description:
+ * For journalled file systems, doing ordered writes on a commit
+ * block instead of explicitly doing wait_on_buffer (which is bad
+ * for performance) can be a big win. Block drivers supporting this
+ * feature should call this function and indicate so.
+ *
+ * QUEUE_ORDERED_NONE frees any preallocated flush request.
+ * QUEUE_ORDERED_TAG only records the mode.
+ * QUEUE_ORDERED_FLUSH preallocates q->flush_rq if it does not exist
+ * yet. If that allocation fails the mode is left unchanged, so the
+ * flush code never runs with a NULL q->flush_rq.
+ * Any other value is reported and ignored.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+	switch (flag) {
+	case QUEUE_ORDERED_NONE:
+		if (q->flush_rq)
+			kmem_cache_free(request_cachep, q->flush_rq);
+		q->flush_rq = NULL;
+		q->ordered = flag;
+		break;
+	case QUEUE_ORDERED_TAG:
+		q->ordered = flag;
+		break;
+	case QUEUE_ORDERED_FLUSH:
+		if (!q->flush_rq)
+			q->flush_rq = kmem_cache_alloc(request_cachep,
+							GFP_KERNEL);
+		if (!q->flush_rq) {
+			/* do not advertise flush ordering without a flush request */
+			printk(KERN_ERR "blk_queue_ordered: unable to allocate flush request\n");
+			break;
+		}
+		q->ordered = flag;
+		break;
+	default:
+		printk("blk_queue_ordered: bad value %d\n", flag);
+		break;
+	}
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
+ * blk_queue_issue_flush_fn - set function for issuing a flush
+ * @q: the request queue
+ * @iff: the function to be called issuing the flush
+ *
+ * Description:
+ * If a driver supports issuing a flush command, it tells the block
+ * layer so by registering the function through this call. The pointer
+ * is stored in q->issue_flush_fn.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+ q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
+/*
+ * Cache flushing for ordered writes handling
+ *
+ * blk_pre_flush_end_io() is installed as the end_io handler of the flush
+ * request by blk_start_pre_flush(); flush_rq->end_io_data is the barrier
+ * request the flush was issued for. The barrier is marked as pre-flushed.
+ * If the flush completed without errors the barrier is requeued, otherwise
+ * the queue's end_flush_fn is called, the flush-in-progress bit is cleared
+ * and the queue is run again.
+ */
+static void blk_pre_flush_end_io(struct request *flush_rq)
+{
+ struct request *rq = flush_rq->end_io_data;
+ request_queue_t *q = rq->q;
+
+ elv_completed_request(q, flush_rq);
+
+ rq->flags |= REQ_BAR_PREFLUSH;
+
+ if (!flush_rq->errors)
+ elv_requeue_request(q, rq);
+ else {
+ q->end_flush_fn(q, flush_rq);
+ clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
+ q->request_fn(q);
+ }
+}
+
+/*
+ * end_io handler installed on the flush request by blk_start_post_flush().
+ * Marks the barrier request (flush_rq->end_io_data) as post-flushed, calls
+ * the queue's end_flush_fn, clears the flush-in-progress bit and runs the
+ * queue again.
+ */
+static void blk_post_flush_end_io(struct request *flush_rq)
+{
+ struct request *rq = flush_rq->end_io_data;
+ request_queue_t *q = rq->q;
+
+ elv_completed_request(q, flush_rq);
+
+ rq->flags |= REQ_BAR_POSTFLUSH;
+
+ q->end_flush_fn(q, flush_rq);
+ clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
+ q->request_fn(q);
+}
+
+/*
+ * Start the pre-flush for barrier request @rq on queue @q, using the
+ * queue's preallocated q->flush_rq.
+ *
+ * Returns NULL if a flush is already in progress (QUEUE_FLAG_FLUSH was
+ * already set), @rq itself if prepare_flush_fn reports that no flush is
+ * needed, and otherwise the flush request, which has been inserted at the
+ * front of the queue.
+ *
+ * NOTE(review): q->flush_rq is used without a NULL check; this assumes it
+ * was allocated earlier (see blk_queue_ordered()).
+ */
+struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
+{
+ struct request *flush_rq = q->flush_rq;
+
+ BUG_ON(!blk_barrier_rq(rq));
+
+ if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
+ return NULL;
+
+ rq_init(q, flush_rq);
+ flush_rq->elevator_private = NULL;
+ flush_rq->flags = REQ_BAR_FLUSH;
+ flush_rq->rq_disk = rq->rq_disk;
+ flush_rq->rl = NULL;
+
+ /*
+ * prepare_flush returns 0 if no flush is needed, just mark both
+ * pre and post flush as done in that case
+ */
+ if (!q->prepare_flush_fn(q, flush_rq)) {
+ rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
+ clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
+ return rq;
+ }
+
+ /*
+ * some drivers dequeue requests right away, some only after io
+ * completion. make sure the request is dequeued.
+ */
+ if (!list_empty(&rq->queuelist))
+ blkdev_dequeue_request(rq);
+
+ /* blk_pre_flush_end_io() finds the barrier through end_io_data */
+ flush_rq->end_io_data = rq;
+ flush_rq->end_io = blk_pre_flush_end_io;
+
+ __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
+ return flush_rq;
+}
+
+/*
+ * Issue the post-flush for barrier request @rq. The queue's flush request
+ * is reinitialised and, if prepare_flush_fn returns non-zero, inserted at
+ * the front of the queue with blk_post_flush_end_io() as its completion
+ * handler, after which the queue is run. If prepare_flush_fn returns 0
+ * nothing is queued.
+ *
+ * The caller in this file, __blk_complete_barrier_rq(), invokes this with
+ * the queue lock held.
+ */
+static void blk_start_post_flush(request_queue_t *q, struct request *rq)
+{
+ struct request *flush_rq = q->flush_rq;
+
+ BUG_ON(!blk_barrier_rq(rq));
+
+ rq_init(q, flush_rq);
+ flush_rq->elevator_private = NULL;
+ flush_rq->flags = REQ_BAR_FLUSH;
+ flush_rq->rq_disk = rq->rq_disk;
+ flush_rq->rl = NULL;
+
+ if (q->prepare_flush_fn(q, flush_rq)) {
+ flush_rq->end_io_data = rq;
+ flush_rq->end_io = blk_post_flush_end_io;
+
+ __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
+ q->request_fn(q);
+ }
+}
+
+/*
+ * Account @sectors completed sectors against barrier request @rq.
+ * @sectors is clamped to rq->nr_sectors before being subtracted. Returns
+ * the number of sectors still outstanding, so 0 means the data part of the
+ * barrier is complete. @q is not used.
+ */
+static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
+ int sectors)
+{
+ if (sectors > rq->nr_sectors)
+ sectors = rq->nr_sectors;
+
+ rq->nr_sectors -= sectors;
+ return rq->nr_sectors;
+}
+
+/*
+ * Common implementation behind blk_complete_barrier_rq() and
+ * blk_complete_barrier_rq_locked().
+ *
+ * Returns 0 when @rq needs no special barrier handling: the queue is not in
+ * QUEUE_ORDERED_FLUSH mode, @rq is not a filesystem barrier request, or its
+ * post-flush is already done. Otherwise @sectors are accounted against the
+ * request, the post-flush is started once no sectors remain, and 1 is
+ * returned. The queue lock is taken around the post-flush unless
+ * @queue_locked says the caller already holds it.
+ */
+static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
+				     int sectors, int queue_locked)
+{
+	unsigned long irq_flags = 0;
+
+	if (q->ordered != QUEUE_ORDERED_FLUSH ||
+	    !blk_fs_request(rq) || !blk_barrier_rq(rq) ||
+	    blk_barrier_postflush(rq))
+		return 0;
+
+	/* sectors still outstanding: nothing more to do yet */
+	if (blk_check_end_barrier(q, rq, sectors))
+		return 1;
+
+	if (queue_locked) {
+		blk_start_post_flush(q, rq);
+	} else {
+		spin_lock_irqsave(q->queue_lock, irq_flags);
+		blk_start_post_flush(q, rq);
+		spin_unlock_irqrestore(q->queue_lock, irq_flags);
+	}
+
+	return 1;
+}
+
+/**
+ * blk_complete_barrier_rq - complete possible barrier request
+ * @q: the request queue for the device
+ * @rq: the request
+ * @sectors: number of sectors to complete
+ *
+ * Description:
+ * Used in driver end_io handling to determine whether to postpone
+ * completion of a barrier request until a post flush has been done. This
+ * is the unlocked variant, used if the caller doesn't already hold the
+ * queue lock.
+ *
+ * Returns the result of __blk_complete_barrier_rq(): 0 if @rq needs no
+ * barrier handling, 1 if the barrier code has taken care of it.
+ **/
+int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
+{
+ return __blk_complete_barrier_rq(q, rq, sectors, 0);
+}
+EXPORT_SYMBOL(blk_complete_barrier_rq);
+
+/**
+ * blk_complete_barrier_rq_locked - complete possible barrier request
+ * @q: the request queue for the device
+ * @rq: the request
+ * @sectors: number of sectors to complete
+ *
+ * Description:
+ * See blk_complete_barrier_rq(). This variant must be used if the caller
+ * holds the queue lock.
+ *
+ * Returns the result of __blk_complete_barrier_rq(): 0 if @rq needs no
+ * barrier handling, 1 if the barrier code has taken care of it.
+ **/
+int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
+ int sectors)
+{
+ return __blk_complete_barrier_rq(q, rq, sectors, 1);
+}
+EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
+
+/**
+ * blk_queue_bounce_limit - set bounce buffer limit for queue
+ * @q: the request queue for the device
+ * @dma_addr: bus address limit
+ *
+ * Description:
+ * Different hardware can have different requirements as to what pages
+ * it can do I/O directly to. A low level driver can call
+ * blk_queue_bounce_limit to have lower memory pages allocated as bounce
+ * buffers for doing I/O to pages residing above @dma_addr. By default
+ * the block layer sets this to the highest numbered "low" memory page.
+ **/
+void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
+{
+ /* the limit is kept as a page frame number */
+ unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
+
+ /*
+ * set appropriate bounce gfp mask -- unfortunately we don't have a
+ * full 4GB zone, so we have to resort to low memory for any bounces.
+ * ISA has its own < 16MB zone.
+ */
+ if (bounce_pfn < blk_max_low_pfn) {
+ /* limits below the ISA bounce address are not supported */
+ BUG_ON(dma_addr < BLK_BOUNCE_ISA);
+ init_emergency_isa_pool();
+ q->bounce_gfp = GFP_NOIO | GFP_DMA;
+ } else
+ q->bounce_gfp = GFP_NOIO;
+
+ q->bounce_pfn = bounce_pfn;
+}
+
+EXPORT_SYMBOL(blk_queue_bounce_limit);
+
+/**
+ * blk_queue_max_sectors - set max sectors for a request for this queue
+ * @q: the request queue for the device
+ * @max_sectors: max sectors in the usual 512b unit
+ *
+ * Description:
+ * Lets a low level driver cap the size of the requests it receives.
+ * A value smaller than one page cache page is raised to one page
+ * worth of sectors and a message is logged. Both q->max_sectors and
+ * q->max_hw_sectors are set to the result.
+ **/
+void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
+{
+	unsigned short limit = max_sectors;
+
+	if ((limit << 9) < PAGE_CACHE_SIZE) {
+		limit = 1 << (PAGE_CACHE_SHIFT - 9);
+		printk("%s: set to minimum %d\n", __FUNCTION__, limit);
+	}
+
+	q->max_hw_sectors = limit;
+	q->max_sectors = q->max_hw_sectors;
+}
+
+EXPORT_SYMBOL(blk_queue_max_sectors);
+
+/**
+ * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * @q: the request queue for the device
+ * @max_segments: max number of segments
+ *
+ * Description:
+ * Lets a low level driver cap the number of physical data segments in
+ * a request, i.e. the largest scatter list the driver can handle. A
+ * value of zero is raised to 1 and a message is logged.
+ **/
+void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
+{
+	unsigned short nsegs = max_segments;
+
+	if (nsegs == 0) {
+		nsegs = 1;
+		printk("%s: set to minimum %d\n", __FUNCTION__, nsegs);
+	}
+
+	q->max_phys_segments = nsegs;
+}
+
+EXPORT_SYMBOL(blk_queue_max_phys_segments);
+
+/**
+ * blk_queue_max_hw_segments - set max hw segments for a request for this queue
+ * @q: the request queue for the device
+ * @max_segments: max number of segments
+ *
+ * Description:
+ * Lets a low level driver cap the number of hw data segments in a
+ * request, i.e. the largest number of address/length pairs the host
+ * adapter can hand to the device at once. A value of zero is raised
+ * to 1 and a message is logged.
+ **/
+void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
+{
+	unsigned short nsegs = max_segments;
+
+	if (nsegs == 0) {
+		nsegs = 1;
+		printk("%s: set to minimum %d\n", __FUNCTION__, nsegs);
+	}
+
+	q->max_hw_segments = nsegs;
+}
+
+EXPORT_SYMBOL(blk_queue_max_hw_segments);
+
+/**
+ * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
+ * @q: the request queue for the device
+ * @max_size: max size of segment in bytes
+ *
+ * Description:
+ * Enables a low level driver to set an upper limit on the size of a
+ * coalesced segment. Values below PAGE_CACHE_SIZE are raised to
+ * PAGE_CACHE_SIZE and a message is logged.
+ **/
+void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
+{
+	if (max_size < PAGE_CACHE_SIZE) {
+		max_size = PAGE_CACHE_SIZE;
+		/* max_size is unsigned int, so print it with %u */
+		printk("%s: set to minimum %u\n", __FUNCTION__, max_size);
+	}
+
+	q->max_segment_size = max_size;
+}
+
+EXPORT_SYMBOL(blk_queue_max_segment_size);
+
+/**
+ * blk_queue_hardsect_size - set hardware sector size for the queue
+ * @q: the request queue for the device
+ * @size: the hardware sector size, in bytes
+ *
+ * Description:
+ * This should typically be set to the lowest possible sector size
+ * that the hardware can operate on (possibly without resorting to
+ * even internal read-modify-write operations). Usually the default
+ * of 512 covers most hardware. The value is stored unchecked in
+ * q->hardsect_size.
+ **/
+void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
+{
+ q->hardsect_size = size;
+}
+
+EXPORT_SYMBOL(blk_queue_hardsect_size);
+
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ *
+ * The arguments and the whole expansion are parenthesized so the macro can
+ * be used inside larger expressions. Each argument may be evaluated more
+ * than once, so do not pass expressions with side effects.
+ */
+#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min((l), (r))))
+
+/**
+ * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
+ * @t: the stacking driver (top)
+ * @b: the underlying device (bottom)
+ *
+ * Tightens @t so that it does not exceed what @b supports: the sector and
+ * segment limits take the smaller value of the two queues, while the
+ * hardware sector size takes the larger one. Only @t is modified.
+ **/
+void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
+{
+ /* zero is "infinity" */
+ t->max_sectors = t->max_hw_sectors =
+ min_not_zero(t->max_sectors,b->max_sectors);
+
+ /* for the remaining limits zero gets no special treatment */
+ t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
+ t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
+ t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
+ t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
+}
+
+EXPORT_SYMBOL(blk_queue_stack_limits);
+
+/**
+ * blk_queue_segment_boundary - set boundary rules for segment merging
+ * @q: the request queue for the device
+ * @mask: the memory boundary mask
+ *
+ * A mask smaller than PAGE_CACHE_SIZE - 1 is raised to that value and a
+ * message is logged. The result is stored in q->seg_boundary_mask.
+ **/
+void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
+{
+	unsigned long boundary = mask;
+
+	if (boundary < PAGE_CACHE_SIZE - 1) {
+		boundary = PAGE_CACHE_SIZE - 1;
+		printk("%s: set to minimum %lx\n", __FUNCTION__, boundary);
+	}
+
+	q->seg_boundary_mask = boundary;
+}
+
+EXPORT_SYMBOL(blk_queue_segment_boundary);
+
+/**
+ * blk_queue_dma_alignment - set dma length and memory alignment
+ * @q: the request queue for the device
+ * @mask: alignment mask
+ *
+ * Description:
+ * Set the required memory and length alignment for direct dma
+ * transactions. This is used when building direct io requests for the
+ * queue. The mask is stored unchecked in q->dma_alignment.
+ *
+ **/
+void blk_queue_dma_alignment(request_queue_t *q, int mask)
+{
+ q->dma_alignment = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_dma_alignment);
+
+/**
+ * blk_queue_find_tag - find a request by its tag and queue
+ * @q: The request queue for the device
+ * @tag: The tag of the request
+ *
+ * Notes:
+ * Should be used when a device returns a tag and you want to match
+ * it with a request.
+ *
+ * Returns the request stored for @tag, or NULL if the queue has no tag
+ * map or @tag lies outside 0 .. real_max_depth - 1.
+ *
+ * no locks need be held.
+ **/
+struct request *blk_queue_find_tag(request_queue_t *q, int tag)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+
+	/* a negative tag would index before the start of tag_index[] */
+	if (unlikely(bqt == NULL || tag < 0 || tag >= bqt->real_max_depth))
+		return NULL;
+
+	return bqt->tag_index[tag];
+}
+
+EXPORT_SYMBOL(blk_queue_find_tag);
+
+/**
+ * __blk_queue_free_tags - release tag maintenance info
+ * @q: the request queue for the device
+ *
+ * Notes:
+ * blk_cleanup_queue() will take care of calling this function, if tagging
+ * has been used. So there's no need to call this directly.
+ *
+ * Drops this queue's reference on the tag map and frees the map once the
+ * last reference is gone. The queue is detached from the map and its
+ * QUEUE_FLAG_QUEUED bit is cleared in either case.
+ **/
+static void __blk_queue_free_tags(request_queue_t *q)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+
+ if (!bqt)
+ return;
+
+ if (atomic_dec_and_test(&bqt->refcnt)) {
+ /* no request may still hold a tag when the map goes away */
+ BUG_ON(bqt->busy);
+ BUG_ON(!list_empty(&bqt->busy_list));
+
+ kfree(bqt->tag_index);
+ bqt->tag_index = NULL;
+
+ kfree(bqt->tag_map);
+ bqt->tag_map = NULL;
+
+ kfree(bqt);
+ }
+
+ q->queue_tags = NULL;
+ q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
+}
+
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q: the request queue for the device
+ *
+ * Notes:
+ * This is used to disable tagged queuing to a device, yet leave the
+ * queue in function. Only the QUEUE_FLAG_QUEUED bit is cleared; the
+ * tag map itself is not freed here.
+ **/
+void blk_queue_free_tags(request_queue_t *q)
+{
+ clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_free_tags);
+
+/*
+ * Allocate and zero the tag index array and the tag bitmap for @depth tags
+ * and install them in @tags. @depth is capped at twice q->nr_requests, with
+ * a message logged when the cap applies. On success both max_depth and
+ * real_max_depth of @tags are set to the (possibly capped) depth and 0 is
+ * returned. On allocation failure @tags is left untouched and -ENOMEM is
+ * returned.
+ */
+static int
+init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
+{
+	struct request **index;
+	unsigned long *map;
+	int map_longs;
+
+	if (depth > q->nr_requests * 2) {
+		depth = q->nr_requests * 2;
+		printk(KERN_ERR "%s: adjusted depth to %d\n",
+				__FUNCTION__, depth);
+	}
+
+	index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+	if (!index)
+		return -ENOMEM;
+
+	/* one bit per tag, rounded up to whole longs */
+	map_longs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
+	map = kmalloc(map_longs * sizeof(unsigned long), GFP_ATOMIC);
+	if (!map) {
+		kfree(index);
+		return -ENOMEM;
+	}
+
+	memset(index, 0, depth * sizeof(struct request *));
+	memset(map, 0, map_longs * sizeof(unsigned long));
+
+	tags->tag_index = index;
+	tags->tag_map = map;
+	tags->max_depth = depth;
+	tags->real_max_depth = depth;
+
+	return 0;
+}
+
+/**
+ * blk_queue_init_tags - initialize the queue tag info
+ * @q: the request queue for the device
+ * @depth: the maximum queue depth supported
+ * @tags: the tag to use
+ *
+ * Three cases are handled:
+ * - neither @tags nor q->queue_tags is set: a new tag map is allocated;
+ * - q->queue_tags is already set: the existing map is resized to @depth;
+ * - only @tags is set: the caller's map is shared and its refcount raised.
+ *
+ * Returns 0 on success, -ENOMEM if allocating a new map fails, or the
+ * error returned by blk_queue_resize_tags().
+ **/
+int blk_queue_init_tags(request_queue_t *q, int depth,
+ struct blk_queue_tag *tags)
+{
+ int rc;
+
+ /* a queue cannot be switched from one tag map to a different one */
+ BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+ if (!tags && !q->queue_tags) {
+ tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
+ if (!tags)
+ goto fail;
+
+ if (init_tag_map(q, tags, depth))
+ goto fail;
+
+ INIT_LIST_HEAD(&tags->busy_list);
+ tags->busy = 0;
+ atomic_set(&tags->refcnt, 1);
+ } else if (q->queue_tags) {
+ if ((rc = blk_queue_resize_tags(q, depth)))
+ return rc;
+ set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+ return 0;
+ } else
+ atomic_inc(&tags->refcnt);
+
+ /*
+ * assign it, all done
+ */
+ q->queue_tags = tags;
+ q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
+ return 0;
+fail:
+ /* only reached from the branch that allocated tags itself */
+ kfree(tags);
+ return -ENOMEM;
+}
+
+EXPORT_SYMBOL(blk_queue_init_tags);
+
+/**
+ * blk_queue_resize_tags - change the queueing depth
+ * @q: the request queue for the device
+ * @new_depth: the new max command queueing depth
+ *
+ * Returns 0 on success, -ENXIO if the queue has no tag map, or -ENOMEM if
+ * a larger map cannot be allocated (the old map is kept in that case).
+ *
+ * Notes:
+ * Must be called with the queue lock held.
+ **/
+int blk_queue_resize_tags(request_queue_t *q, int new_depth)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+ struct request **tag_index;
+ unsigned long *tag_map;
+ int max_depth, nr_ulongs;
+
+ if (!bqt)
+ return -ENXIO;
+
+ /*
+ * if we already have large enough real_max_depth. just
+ * adjust max_depth. *NOTE* as requests with tag value
+ * between new_depth and real_max_depth can be in-flight, tag
+ * map can not be shrunk blindly here.
+ */
+ if (new_depth <= bqt->real_max_depth) {
+ bqt->max_depth = new_depth;
+ return 0;
+ }
+
+ /*
+ * save the old state info, so we can copy it back
+ */
+ tag_index = bqt->tag_index;
+ tag_map = bqt->tag_map;
+ max_depth = bqt->real_max_depth;
+
+ if (init_tag_map(q, bqt, new_depth))
+ return -ENOMEM;
+
+ /*
+ * NOTE(review): init_tag_map() may cap the depth at 2 * q->nr_requests.
+ * The copies below use the old real_max_depth and so assume the new
+ * arrays are at least that large -- confirm this always holds.
+ */
+ memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
+ nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
+ memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
+
+ kfree(tag_index);
+ kfree(tag_map);
+ return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_resize_tags);
+
+/**
+ * blk_queue_end_tag - end tag operations for a request
+ * @q: the request queue for the device
+ * @rq: the request that has completed
+ *
+ * Description:
+ * Typically called when end_that_request_first() returns 0, meaning
+ * all transfers have been done for a request. It's important to call
+ * this function before end_that_request_last(), as that will put the
+ * request back on the free list thus corrupting the internal tag list.
+ *
+ * Notes:
+ * queue lock must be held.
+ * q->queue_tags is dereferenced without a NULL check, so the queue
+ * must have a tag map.
+ **/
+void blk_queue_end_tag(request_queue_t *q, struct request *rq)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+ int tag = rq->tag;
+
+ /* -1 marks a request that holds no tag */
+ BUG_ON(tag == -1);
+
+ if (unlikely(tag >= bqt->real_max_depth))
+ /*
+ * This can happen after tag depth has been reduced.
+ * FIXME: how about a warning or info message here?
+ */
+ return;
+
+ if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
+ printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
+ __FUNCTION__, tag);
+ return;
+ }
+
+ /* take the request off the busy list and mark it untagged */
+ list_del_init(&rq->queuelist);
+ rq->flags &= ~REQ_QUEUED;
+ rq->tag = -1;
+
+ if (unlikely(bqt->tag_index[tag] == NULL))
+ printk(KERN_ERR "%s: tag %d is missing\n",
+ __FUNCTION__, tag);
+
+ bqt->tag_index[tag] = NULL;
+ bqt->busy--;
+}
+
+EXPORT_SYMBOL(blk_queue_end_tag);
+
+/**
+ * blk_queue_start_tag - find a free tag and assign it
+ * @q: the request queue for the device
+ * @rq: the block request that needs tagging
+ *
+ * Description:
+ * This can either be used as a stand-alone helper, or possibly be
+ * assigned as the queue &prep_rq_fn (in which case &struct request
+ * automagically gets a tag assigned). Note that this function
+ * assumes that any type of request can be queued! if this is not
+ * true for your device, you must check the request type before
+ * calling this function. The request will also be removed from
+ * the request queue, so it's the drivers responsibility to readd
+ * it if it should need to be restarted for some reason.
+ *
+ * Returns 0 if a tag was assigned, 1 if no tag below max_depth is free.
+ * Calling this on a request that is already tagged is a BUG().
+ *
+ * Notes:
+ * queue lock must be held.
+ **/
+int blk_queue_start_tag(request_queue_t *q, struct request *rq)
+{
+	struct blk_queue_tag *bqt = q->queue_tags;
+	int tag;
+
+	if (unlikely((rq->flags & REQ_QUEUED))) {
+		/* terminate the line so it is not merged with the BUG() output */
+		printk(KERN_ERR
+		       "%s: request %p for device [%s] already tagged %d\n",
+		       __FUNCTION__, rq,
+		       rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
+		BUG();
+	}
+
+	tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
+	if (tag >= bqt->max_depth)
+		return 1;
+
+	__set_bit(tag, bqt->tag_map);
+
+	rq->flags |= REQ_QUEUED;
+	rq->tag = tag;
+	bqt->tag_index[tag] = rq;
+	/* move the request from the request queue onto the busy list */
+	blkdev_dequeue_request(rq);
+	list_add(&rq->queuelist, &bqt->busy_list);
+	bqt->busy++;
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_start_tag);
+
+/**
+ * blk_queue_invalidate_tags - invalidate all pending tags
+ * @q: the request queue for the device
+ *
+ * Description:
+ * Hardware conditions may dictate a need to stop all pending requests.
+ * In this case, we will safely clear the block side of the tag queue and
+ * readd all requests to the request queue in the right order.
+ *
+ * Notes:
+ * queue lock must be held.
+ **/
+void blk_queue_invalidate_tags(request_queue_t *q)
+{
+ struct blk_queue_tag *bqt = q->queue_tags;
+ struct list_head *tmp, *n;
+ struct request *rq;
+
+ /* the _safe iterator is needed because entries are unlinked in the loop */
+ list_for_each_safe(tmp, n, &bqt->busy_list) {
+ rq = list_entry_rq(tmp);
+
+ if (rq->tag == -1) {
+ /* on the busy list without a tag: just unlink it */
+ printk(KERN_ERR
+ "%s: bad tag found on list\n", __FUNCTION__);
+ list_del_init(&rq->queuelist);
+ rq->flags &= ~REQ_QUEUED;
+ } else
+ blk_queue_end_tag(q, rq);
+
+ /* put the request back on the request queue as not yet started */
+ rq->flags &= ~REQ_STARTED;
+ __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
+ }
+}
+
+EXPORT_SYMBOL(blk_queue_invalidate_tags);
+
+/*
+ * Printable names of the request flag bits, indexed by bit number. Bits
+ * without an entry here are printed numerically by blk_dump_rq_flags().
+ */
+static char *rq_flags[] = {
+	"REQ_RW",
+	"REQ_FAILFAST",
+	"REQ_SORTED",
+	"REQ_SOFTBARRIER",
+	"REQ_HARDBARRIER",
+	"REQ_CMD",
+	"REQ_NOMERGE",
+	"REQ_STARTED",
+	"REQ_DONTPREP",
+	"REQ_QUEUED",
+	"REQ_ELVPRIV",
+	"REQ_PC",
+	"REQ_BLOCK_PC",
+	"REQ_SENSE",
+	"REQ_FAILED",
+	"REQ_QUIET",
+	"REQ_SPECIAL",
+	"REQ_DRIVE_CMD",
+	"REQ_DRIVE_TASK",
+	"REQ_DRIVE_TASKFILE",
+	"REQ_PREEMPT",
+	"REQ_PM_SUSPEND",
+	"REQ_PM_RESUME",
+	"REQ_PM_SHUTDOWN",
+};
+
+/*
+ * Dump the state of @rq to the kernel log, prefixed by @msg: the names of
+ * the flag bits that are set, the sector and size counters, the bio and
+ * buffer pointers and, for packet commands, the command bytes.
+ */
+void blk_dump_rq_flags(struct request *rq, char *msg)
+{
+	int bit;
+
+	printk("%s: dev %s: flags = ", msg,
+		rq->rq_disk ? rq->rq_disk->disk_name : "?");
+	for (bit = 0; bit < __REQ_NR_BITS; bit++) {
+		if (!(rq->flags & (1 << bit)))
+			continue;
+		/*
+		 * never index past the name table, even if more flag bits
+		 * exist than it has entries
+		 */
+		if (bit < (int)ARRAY_SIZE(rq_flags))
+			printk("%s ", rq_flags[bit]);
+		else
+			printk("bit%d ", bit);
+	}
+
+	printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
+						       rq->nr_sectors,
+						       rq->current_nr_sectors);
+	printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
+
+	if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
+		printk("cdb: ");
+		for (bit = 0; bit < sizeof(rq->cmd); bit++)
+			printk("%02x ", rq->cmd[bit]);
+		printk("\n");
+	}
+}
+
+EXPORT_SYMBOL(blk_dump_rq_flags);
+
+void blk_recount_segments(request_queue_t *q, struct bio *bio)
+{
+ struct bio_vec *bv, *bvprv = NULL;
+ int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
+ int high, highprv = 1;
+
+ if (unlikely(!bio->bi_io_vec))
+ return;
+
+ cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
+ hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
+ bio_for_each_segment(bv, bio, i) {
+ /*
+ * the trick here is making sure that a high page is never
+ * considered part of another segment, since that might
+ * change with the bounce page.
+ */
+ high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
+ if (high || highprv)
+ goto new_hw_segment;
+ if (cluster) {
+ if (seg_size + bv->bv_len > q->max_segment_size)
+ goto new_segment;
+ if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
+ goto new_segment;
+ if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+ goto new_segment;
+ if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+ goto new_hw_segment;
+
+ seg_size += bv->bv_len;
+ hw_seg_size += bv->bv_len;
+ bvprv = bv;
+ continue;
+ }
+new_segment:
+ if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
+ !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
+ hw_seg_size += bv->bv_len;
+ } else {
+new_hw_segment:
+ if (hw_seg_size > bio->bi_hw_front_size)
+ bio->bi_hw_front_size = hw_seg_size;
+ hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
+ nr_hw_segs++;
+ }
+
+ nr_phys_segs++;
+ bvprv = bv;
+ seg_size = bv->bv_len;
+ highprv = high;
+ }
+ if (hw_seg_size > bio->bi_hw_back_size)
+ bio->bi_hw_back_size = hw_seg_size;
+ if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size)
+ bio->bi_hw_front_size = hw_seg_size;
+ bio->bi_phys_segments = nr_phys_segs;
+ bio->bi_hw_s