diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-14 17:55:15 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-14 17:55:15 -0700 |
commit | 355bbd8cb82e60a592f6cd86ce6dbe5677615cf4 (patch) | |
tree | 23678e50ad4687f1656edc972388ee8014e7b89d | |
parent | 39695224bd84dc4be29abad93a0ec232a16fc519 (diff) | |
parent | 746cd1e7e4a555ddaee53b19a46e05c9c61eaf09 (diff) | |
Merge branch 'for-2.6.32' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.32' of git://git.kernel.dk/linux-2.6-block: (29 commits)
block: use blkdev_issue_discard in blk_ioctl_discard
Make DISCARD_BARRIER and DISCARD_NOBARRIER writes instead of reads
block: don't assume device has a request list backing in nr_requests store
block: Optimal I/O limit wrapper
cfq: choose a new next_req when a request is dispatched
Seperate read and write statistics of in_flight requests
aoe: end barrier bios with EOPNOTSUPP
block: trace bio queueing trial only when it occurs
block: enable rq CPU completion affinity by default
cfq: fix the log message after dispatched a request
block: use printk_once
cciss: memory leak in cciss_init_one()
splice: update mtime and atime on files
block: make blk_iopoll_prep_sched() follow normal 0/1 return convention
cfq-iosched: get rid of must_alloc flag
block: use interrupts disabled version of raise_softirq_irqoff()
block: fix comment in blk-iopoll.c
block: adjust default budget for blk-iopoll
block: fix long lines in block/blk-iopoll.c
block: add blk-iopoll, a NAPI like approach for block devices
...
42 files changed, 729 insertions, 284 deletions
diff --git a/block/Makefile b/block/Makefile index 6c54ed0ff75..ba74ca6bfa1 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 30022b4e2f6..6593ab39cfe 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err) clear_bit(BIO_UPTODATE, &bio->bi_flags); } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } @@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: DISCARD_FL_* flags to control behaviour * * Description: - * Issue a discard request for the sectors in question. Does not wait. + * Issue a discard request for the sectors in question. */ -int blkdev_issue_discard(struct block_device *bdev, - sector_t sector, sector_t nr_sects, gfp_t gfp_mask) +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags) { - struct request_queue *q; - struct bio *bio; + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = flags & DISCARD_FL_BARRIER ? 
+ DISCARD_BARRIER : DISCARD_NOBARRIER; int ret = 0; - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); if (!q) return -ENXIO; @@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev, return -EOPNOTSUPP; while (nr_sects && !ret) { - bio = bio_alloc(gfp_mask, 0); + struct bio *bio = bio_alloc(gfp_mask, 0); if (!bio) return -ENOMEM; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; + if (flags & DISCARD_FL_WAIT) + bio->bi_private = &wait; bio->bi_sector = sector; @@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev, bio->bi_size = nr_sects << 9; nr_sects = 0; } + bio_get(bio); - submit_bio(DISCARD_BARRIER, bio); + submit_bio(type, bio); + + if (flags & DISCARD_FL_WAIT) + wait_for_completion(&wait); - /* Check if it failed immediately */ if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) diff --git a/block/blk-core.c b/block/blk-core.c index e695634882a..8135228e4b2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); else { part_round_stats(cpu, part); - part_inc_in_flight(part); + part_inc_in_flight(part, rw); } part_stat_unlock(); @@ -1031,7 +1031,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, if (part->in_flight) { __part_stat_add(cpu, part, time_in_queue, - part->in_flight * (now - part->stamp)); + part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1112,31 +1112,27 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_type = REQ_TYPE_FS; /* - * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) + * Inherit FAILFAST from bio (for read-ahead, and explicit + * FAILFAST). FAILFAST flags are identical for req and bio. 
*/ - if (bio_rw_ahead(bio)) - req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | - REQ_FAILFAST_DRIVER); - if (bio_failfast_dev(bio)) - req->cmd_flags |= REQ_FAILFAST_DEV; - if (bio_failfast_transport(bio)) - req->cmd_flags |= REQ_FAILFAST_TRANSPORT; - if (bio_failfast_driver(bio)) - req->cmd_flags |= REQ_FAILFAST_DRIVER; - - if (unlikely(bio_discard(bio))) { + if (bio_rw_flagged(bio, BIO_RW_AHEAD)) + req->cmd_flags |= REQ_FAILFAST_MASK; + else + req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; + + if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { req->cmd_flags |= REQ_DISCARD; - if (bio_barrier(bio)) + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_SOFTBARRIER; req->q->prepare_discard_fn(req->q, req); - } else if (unlikely(bio_barrier(bio))) + } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) req->cmd_flags |= REQ_HARDBARRIER; - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; - if (bio_rw_meta(bio)) + if (bio_rw_flagged(bio, BIO_RW_META)) req->cmd_flags |= REQ_RW_META; - if (bio_noidle(bio)) + if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; @@ -1151,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); + return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); } static int __make_request(struct request_queue *q, struct bio *bio) @@ -1160,11 +1156,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const int sync = bio_sync(bio); - const int unplug = bio_unplug(bio); + const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); + const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; - if (bio_barrier(bio) && 
bio_has_data(bio) && + if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) && (q->next_ordered == QUEUE_ORDERED_NONE)) { bio_endio(bio, -EOPNOTSUPP); return 0; @@ -1178,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1191,6 +1188,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) trace_block_bio_backmerge(q, bio); + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bytes; @@ -1210,6 +1210,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) trace_block_bio_frontmerge(q, bio); + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { + blk_rq_set_mixed_merge(req); + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= ff; + } + bio->bi_next = req->bio; req->bio = bio; @@ -1457,19 +1463,20 @@ static inline void __generic_make_request(struct bio *bio) if (old_sector != -1) trace_block_remap(q, bio, old_dev, old_sector); - trace_block_bio_queue(q, bio); - old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_discard(bio) && !q->prepare_discard_fn) { + if (bio_rw_flagged(bio, BIO_RW_DISCARD) && + !q->prepare_discard_fn) { err = -EOPNOTSUPP; goto end_io; } + trace_block_bio_queue(q, bio); + ret = q->make_request_fn(q, bio); } while (ret); @@ -1654,6 +1661,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); +/** + * blk_rq_err_bytes - determine number of bytes till the next failure boundary + * @rq: request to examine + * + * Description: + * A request could be merge of IOs which require different failure + * handling. 
This function determines the number of bytes which + * can be failed from the beginning of the request without + * crossing into area which need to be retried further. + * + * Return: + * The number of bytes to fail. + * + * Context: + * queue_lock must be held. + */ +unsigned int blk_rq_err_bytes(const struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + unsigned int bytes = 0; + struct bio *bio; + + if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + return blk_rq_bytes(rq); + + /* + * Currently the only 'mixing' which can happen is between + * different fastfail types. We can safely fail portions + * which have all the failfast bits that the first one has - + * the ones which are at least as eager to fail as the first + * one. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + if ((bio->bi_rw & ff) != ff) + break; + bytes += bio->bi_size; + } + + /* this could lead to infinite loop */ + BUG_ON(blk_rq_bytes(rq) && !bytes); + return bytes; +} +EXPORT_SYMBOL_GPL(blk_rq_err_bytes); + static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { @@ -1687,7 +1738,7 @@ static void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rw); part_stat_unlock(); } @@ -1807,8 +1858,15 @@ void blk_dequeue_request(struct request *rq) * and to it is freed is accounted as io that is in progress at * the driver side. */ - if (blk_account_rq(rq)) + if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; + /* + * Mark this device as supporting hardware queuing, if + * we have more IOs in flight than 4. 
+ */ + if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) + set_bit(QUEUE_FLAG_CQ, &q->queue_flags); + } } /** @@ -2000,6 +2058,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (blk_fs_request(req) || blk_discard_rq(req)) req->__sector += total_bytes >> 9; + /* mixed attributes always follow the first bio */ + if (req->cmd_flags & REQ_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; + } + /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. @@ -2179,6 +2243,25 @@ bool blk_end_request_cur(struct request *rq, int error) EXPORT_SYMBOL(blk_end_request_cur); /** + * blk_end_request_err - Finish a request till the next failure boundary. + * @rq: the request to finish till the next failure boundary for + * @error: must be negative errno + * + * Description: + * Complete @rq till the next failure boundary. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool blk_end_request_err(struct request *rq, int error) +{ + WARN_ON(error >= 0); + return blk_end_request(rq, error, blk_rq_err_bytes(rq)); +} +EXPORT_SYMBOL_GPL(blk_end_request_err); + +/** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: %0 for success, < %0 for error @@ -2237,12 +2320,31 @@ bool __blk_end_request_cur(struct request *rq, int error) } EXPORT_SYMBOL(__blk_end_request_cur); +/** + * __blk_end_request_err - Finish a request till the next failure boundary. + * @rq: the request to finish till the next failure boundary for + * @error: must be negative errno + * + * Description: + * Complete @rq till the next failure boundary. Must be called + * with queue lock held. 
+ * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool __blk_end_request_err(struct request *rq, int error) +{ + WARN_ON(error >= 0); + return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); +} +EXPORT_SYMBOL_GPL(__blk_end_request_err); + void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and - we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ - rq->cmd_flags |= (bio->bi_rw & 3); + /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ + rq->cmd_flags |= bio->bi_rw & REQ_RW; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c new file mode 100644 index 00000000000..ca564202ed7 --- /dev/null +++ b/block/blk-iopoll.c @@ -0,0 +1,227 @@ +/* + * Functions related to interrupt-poll handling in the block layer. This + * is similar to NAPI for network devices. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/blk-iopoll.h> +#include <linux/delay.h> + +#include "blk.h" + +int blk_iopoll_enabled = 1; +EXPORT_SYMBOL(blk_iopoll_enabled); + +static unsigned int blk_iopoll_budget __read_mostly = 256; + +static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); + +/** + * blk_iopoll_sched - Schedule a run of the iopoll handler + * @iop: The parent iopoll structure + * + * Description: + * Add this blk_iopoll structure to the pending poll list and trigger the + * raise of the blk iopoll softirq. The driver must already have gotten a + * succesful return from blk_iopoll_sched_prep() before calling this. 
+ **/ +void blk_iopoll_sched(struct blk_iopoll *iop) +{ + unsigned long flags; + + local_irq_save(flags); + list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_iopoll_sched); + +/** + * __blk_iopoll_complete - Mark this @iop as un-polled again + * @iop: The parent iopoll structure + * + * Description: + * See blk_iopoll_complete(). This function must be called with interrupts + * disabled. + **/ +void __blk_iopoll_complete(struct blk_iopoll *iop) +{ + list_del(&iop->list); + smp_mb__before_clear_bit(); + clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(__blk_iopoll_complete); + +/** + * blk_iopoll_complete - Mark this @iop as un-polled again + * @iop: The parent iopoll structure + * + * Description: + * If a driver consumes less than the assigned budget in its run of the + * iopoll handler, it'll end the polled mode by calling this function. The + * iopoll handler will not be invoked again before blk_iopoll_sched_prep() + * is called. + **/ +void blk_iopoll_complete(struct blk_iopoll *iopoll) +{ + unsigned long flags; + + local_irq_save(flags); + __blk_iopoll_complete(iopoll); + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_iopoll_complete); + +static void blk_iopoll_softirq(struct softirq_action *h) +{ + struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + int rearm = 0, budget = blk_iopoll_budget; + unsigned long start_time = jiffies; + + local_irq_disable(); + + while (!list_empty(list)) { + struct blk_iopoll *iop; + int work, weight; + + /* + * If softirq window is exhausted then punt. + */ + if (budget <= 0 || time_after(jiffies, start_time)) { + rearm = 1; + break; + } + + local_irq_enable(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. 
+ */ + iop = list_entry(list->next, struct blk_iopoll, list); + + weight = iop->weight; + work = 0; + if (test_bit(IOPOLL_F_SCHED, &iop->state)) + work = iop->poll(iop, weight); + + budget -= work; + + local_irq_disable(); + + /* + * Drivers must not modify the iopoll state, if they + * consume their assigned weight (or more, some drivers can't + * easily just stop processing, they have to complete an + * entire mask of commands).In such cases this code + * still "owns" the iopoll instance and therefore can + * move the instance around on the list at-will. + */ + if (work >= weight) { + if (blk_iopoll_disable_pending(iop)) + __blk_iopoll_complete(iop); + else + list_move_tail(&iop->list, list); + } + } + + if (rearm) + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + + local_irq_enable(); +} + +/** + * blk_iopoll_disable - Disable iopoll on this @iop + * @iop: The parent iopoll structure + * + * Description: + * Disable io polling and wait for any pending callbacks to have completed. + **/ +void blk_iopoll_disable(struct blk_iopoll *iop) +{ + set_bit(IOPOLL_F_DISABLE, &iop->state); + while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) + msleep(1); + clear_bit(IOPOLL_F_DISABLE, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_disable); + +/** + * blk_iopoll_enable - Enable iopoll on this @iop + * @iop: The parent iopoll structure + * + * Description: + * Enable iopoll on this @iop. Note that the handler run will not be + * scheduled, it will only mark it as active. + **/ +void blk_iopoll_enable(struct blk_iopoll *iop) +{ + BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); + smp_mb__before_clear_bit(); + clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_enable); + +/** + * blk_iopoll_init - Initialize this @iop + * @iop: The parent iopoll structure + * @weight: The default weight (or command completion budget) + * @poll_fn: The handler to invoke + * + * Description: + * Initialize this blk_iopoll structure. 
Before being actively used, the + * driver must call blk_iopoll_enable(). + **/ +void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) +{ + memset(iop, 0, sizeof(*iop)); + INIT_LIST_HEAD(&iop->list); + iop->weight = weight; + iop->poll = poll_fn; + set_bit(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_init); + +static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + &__get_cpu_var(blk_cpu_iopoll)); + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = { + .notifier_call = blk_iopoll_cpu_notify, +}; + +static __init int blk_iopoll_setup(void) +{ + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); + + open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); + register_hotcpu_notifier(&blk_iopoll_cpu_notifier); + return 0; +} +subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-merge.c b/block/blk-merge.c index e1999679a4d..99cb5cf1f44 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 1; } +/** + * blk_rq_set_mixed_merge - mark a request as mixed merge + * @rq: request to mark as mixed merge + * + * Description: + * @rq is about to be mixed merged. Make sure the attributes + * which can be mixed are set in each bio and mark @rq as mixed + * merged. 
+ */ +void blk_rq_set_mixed_merge(struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + struct bio *bio; + + if (rq->cmd_flags & REQ_MIXED_MERGE) + return; + + /* + * @rq will no longer represent mixable attributes for all the + * contained bios. It will just track those of the first one. + * Distributes the attributs to each bio. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && + (bio->bi_rw & REQ_FAILFAST_MASK) != ff); + bio->bi_rw |= ff; + } + rq->cmd_flags |= REQ_MIXED_MERGE; +} + static void blk_account_io_merge(struct request *req) { if (blk_do_io_stat(req)) { @@ -321,7 +351,7 @@ static void blk_account_io_merge(struct request *req) part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rq_data_dir(req)); part_stat_unlock(); } @@ -350,12 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (blk_integrity_rq(req) != blk_integrity_rq(next)) return 0; - /* don't merge requests of different failfast settings */ - if (blk_failfast_dev(req) != blk_failfast_dev(next) || - blk_failfast_transport(req) != blk_failfast_transport(next) || - blk_failfast_driver(req) != blk_failfast_driver(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -366,6 +390,19 @@ static int attempt_merge(struct request_queue *q, struct request *req, return 0; /* + * If failfast settings disagree or any of the two is already + * a mixed merge, mark both as mixed before proceeding. This + * makes sure that all involved bios have mixable attributes + * set properly. 
+ */ + if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + (req->cmd_flags & REQ_FAILFAST_MASK) != + (next->cmd_flags & REQ_FAILFAST_MASK)) { + blk_rq_set_mixed_merge(req); + blk_rq_set_mixed_merge(next); + } + + /* * At this point we have either done a back merge * or front merge. We need the smaller start_time of * the merged requests to be the current request diff --git a/block/blk-settings.c b/block/blk-settings.c index 476d8706507..83413ff8373 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -428,6 +428,25 @@ void blk_queue_io_min(struct request_queue *q, unsigned int min) EXPORT_SYMBOL(blk_queue_io_min); /** + * blk_limits_io_opt - set optimal request size for a device + * @limits: the queue limits + * @opt: smallest I/O size in bytes + * + * Description: + * Storage devices may report an optimal I/O size, which is the + * device's preferred unit for sustained I/O. This is rarely reported + * for disk drives. For RAID arrays it is usually the stripe width or + * the internal track size. A properly aligned multiple of + * optimal_io_size is the preferred request size for workloads where + * sustained throughput is desired. 
+ */ +void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) +{ + limits->io_opt = opt; +} +EXPORT_SYMBOL(blk_limits_io_opt); + +/** * blk_queue_io_opt - set optimal request size for the queue * @q: the request queue for the device * @opt: optimal request size in bytes @@ -442,7 +461,7 @@ EXPORT_SYMBOL(blk_queue_io_min); */ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { - q->limits.io_opt = opt; + blk_limits_io_opt(&q->limits, opt); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d3aa2aadb3e..b78c9c3e267 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -40,7 +40,12 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) { struct request_list *rl = &q->rq; unsigned long nr; - int ret = queue_var_store(&nr, page, count); + int ret; + + if (!q->request_fn) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; diff --git a/block/blk.h b/block/blk.h index 3fae6add543..5ee3d7e72fe 100644 --- a/block/blk.h +++ b/block/blk.h @@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); void blk_recalc_rq_segments(struct request *rq); +void blk_rq_set_mixed_merge(struct request *rq); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fd7080ed793..0e3814b662a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -134,13 +134,8 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - /* - * Used to track any pending rt requests so we can pre-empt current - * non-RT cfqq in service when this value is non-zero. 
- */ - unsigned int busy_rt_queues; - int rq_in_driver; + int rq_in_driver[2]; int sync_flight; /* @@ -191,7 +186,6 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ - CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ @@ -218,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ CFQ_CFQQ_FNS(on_rr); CFQ_CFQQ_FNS(wait_request); CFQ_CFQQ_FNS(must_dispatch); -CFQ_CFQQ_FNS(must_alloc); CFQ_CFQQ_FNS(must_alloc_slice); CFQ_CFQQ_FNS(fifo_expire); CFQ_CFQQ_FNS(idle_window); @@ -239,6 +232,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); +static inline int rq_in_driver(struct cfq_data *cfqd) +{ + return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; +} + static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, int is_sync) { @@ -257,7 +255,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, */ static inline int cfq_bio_sync(struct bio *bio) { - if (bio_data_dir(bio) == READ || bio_sync(bio)) + if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO)) return 1; return 0; @@ -648,8 +646,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -673,8 +669,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; } /* @@ -760,9 +754,9 @@ static void 
cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver++; + cfqd->rq_in_driver[rq_is_sync(rq)]++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); + rq_in_driver(cfqd)); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -770,11 +764,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; + const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; + WARN_ON(!cfqd->rq_in_driver[sync]); + cfqd->rq_in_driver[sync]--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); + rq_in_driver(cfqd)); } static void cfq_remove_request(struct request *rq) @@ -1080,7 +1075,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * still requests with the driver, don't idle */ - if (cfqd->rq_in_driver) + if (rq_in_driver(cfqd)) return; /* @@ -1115,6 +1110,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; elv_dispatch_sort(q, rq); @@ -1179,20 +1175,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto expire; /* - * If we have a RT cfqq waiting, then we pre-empt the current non-rt - * cfqq. - */ - if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { - /* - * We simulate this as cfqq timed out so that it gets to bank - * the remaining of its time slice. - */ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - goto new_queue; - } - - /* * The active queue has requests and isn't expired, allow it to * dispatch. 
*/ @@ -1312,6 +1294,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) return 0; /* + * Drain async requests before we start sync IO + */ + if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + return 0; + + /* * If this is an async queue and we have sync IO in flight, let it wait */ if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) @@ -1362,7 +1350,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) cfq_slice_expired(cfqd, 0); } - cfq_log(cfqd, "dispatched a request"); + cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); return 1; } @@ -2130,11 +2118,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = cfqd->rq_in_driver; + if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) + cfqd->rq_in_driver_peak = rq_in_driver(cfqd); if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -2161,9 +2149,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver); + WARN_ON(!cfqd->rq_in_driver[sync]); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; + cfqd->rq_in_driver[sync]--; cfqq->dispatched--; if (cfq_cfqq_sync(cfqq)) @@ -2197,7 +2185,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_arm_slice_timer(cfqd); } - if (!cfqd->rq_in_driver) + if (!rq_in_driver(cfqd)) cfq_schedule_dispatch(cfqd); } @@ -2229,8 +2217,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) static inline int __cfq_may_queue(struct cfq_queue *cfqq) { - if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && - !cfq_cfqq_must_alloc_slice(cfqq)) { + if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 
cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } @@ -2317,7 +2304,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) } cfqq->allocated[rw]++; - cfq_clear_cfqq_must_alloc(cfqq); atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); diff --git a/block/elevator.c b/block/elevator.c index 2d511f9105e..1975b619c86 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) /* * Don't merge file system requests and discard requests */ - if (bio_discard(bio) != bio_discard(rq->bio)) + if (bio_rw_flagged(bio, BIO_RW_DISCARD) != + bio_rw_flagged(rq->bio, BIO_RW_DISCARD)) return 0; /* @@ -100,19 +101,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_integrity(bio) != blk_integrity_rq(rq)) return 0; - /* - * Don't merge if failfast settings don't match. - * - * FIXME: The negation in front of each condition is necessary - * because bio and request flags use different bit positions - * and the accessors return those bits directly. This - * ugliness will soon go away. 
- */ - if (!bio_failfast_dev(bio) != !blk_failfast_dev(rq) || - !bio_failfast_transport(bio) != !blk_failfast_transport(rq) || - !bio_failfast_driver(bio) != !blk_failfast_driver(rq)) - return 0; - if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/block/genhd.c b/block/genhd.c index f4c64c2b303..5b76bf55d05 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[1]), (unsigned long long)part_stat_read(hd, sectors[1]), jiffies_to_msecs(part_stat_read(hd, ticks[1])), - hd->in_flight, + part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); @@ -1215,6 +1217,16 @@ void put_disk(struct gendisk *disk) EXPORT_SYMBOL(put_disk); +static void set_disk_ro_uevent(struct gendisk *gd, int ro) +{ + char event[] = "DISK_RO=1"; + char *envp[] = { event, NULL }; + + if (!ro) + event[8] = '0'; + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + void set_device_ro(struct block_device *bdev, int flag) { bdev->bd_part->policy = flag; @@ -1227,8 +1239,12 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - 
disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); + if (disk->part0.policy != flag) { + set_disk_ro_uevent(disk, flag); + disk->part0.policy = flag; + } + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) part->policy = flag; disk_part_iter_exit(&piter); diff --git a/block/ioctl.c b/block/ioctl.c index 500e4c73cc5..d3e6b5827a3 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev) return res; } -static void blk_ioc_discard_endio(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - complete(bio->bi_private); -} - static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len) { - struct request_queue *q = bdev_get_queue(bdev); - int ret = 0; - if (start & 511) return -EINVAL; if (len & 511) @@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; - - if (!q->prepare_discard_fn) - return -EOPNOTSUPP; - - while (len && !ret) { - DECLARE_COMPLETION_ONSTACK(wait); - struct bio *bio; - - bio = bio_alloc(GFP_KERNEL, 0); - - bio->bi_end_io = blk_ioc_discard_endio; - bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio->bi_sector = start; - - if (len > queue_max_hw_sectors(q)) { - bio->bi_size = queue_max_hw_sectors(q) << 9; - len -= queue_max_hw_sectors(q); - start += queue_max_hw_sectors(q); - } else { - bio->bi_size = len << 9; - len = 0; - } - submit_bio(DISCARD_NOBARRIER, bio); - - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; + return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, + DISCARD_FL_WAIT); } static int put_ushort(unsigned long arg, 
unsigned short val) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 95d344971ed..b6cd571adbf 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -172,6 +172,9 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio) BUG(); bio_endio(bio, -ENXIO); return 0; + } else if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { + bio_endio(bio, -EOPNOTSUPP); + return 0; } else if (bio->bi_io_vec == NULL) { printk(KERN_ERR "aoe: bi_io_vec is NULL\n"); BUG(); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index a52cc7fe45e..0589dfbbd7d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3889,7 +3889,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, int j = 0; int rc; int dac, return_code; - InquiryData_struct *inq_buff = NULL; + InquiryData_struct *inq_buff; if (reset_devices) { /* Reset the controller with a PCI power-cycle */ @@ -4029,6 +4029,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, printk(KERN_WARNING "cciss: unable to determine firmware" " version of controller\n"); } + kfree(inq_buff); cciss_procinit(i); @@ -4045,7 +4046,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, return 1; clean4: - kfree(inq_buff); kfree(hba[i]->cmd_pool_bits); if (hba[i]->cmd_pool) pci_free_consistent(hba[i]->pdev, diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5757188cd1f..bbb79441d89 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -475,7 +475,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; if (bio_rw(bio) == WRITE) { - int barrier = bio_barrier(bio); + bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); struct file *file = lo->lo_backing_file; if (barrier) { diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 911dfd98d81..9f3518c515a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -219,8 +219,6 @@ static int 
pcd_sector; /* address of next requested sector */ static int pcd_count; /* number of blocks still to do */ static char *pcd_buf; /* buffer for request in progress */ -static int pcd_warned; /* Have we logged a phase warning ? */ - /* kernel glue structures */ static int pcd_block_open(struct block_device *bdev, fmode_t mode) @@ -417,12 +415,10 @@ static int pcd_completion(struct pcd_unit *cd, char *buf, char *fun) printk ("%s: %s: Unexpected phase %d, d=%d, k=%d\n", cd->name, fun, p, d, k); - if ((verbose < 2) && !pcd_warned) { - pcd_warned = 1; - printk - ("%s: WARNING: ATAPI phase errors\n", - cd->name); - } + if (verbose < 2) + printk_once( + "%s: WARNING: ATAPI phase errors\n", + cd->name); mdelay(1); } if (k++ > PCD_TMO) { diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index da403b6a7f4..f5cd2e83ebc 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1564,15 +1564,13 @@ static int carm_init_shm(struct carm_host *host) static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) { - static unsigned int printed_version; struct carm_host *host; unsigned int pci_dac; int rc; struct request_queue *q; unsigned int i; - if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 390d69bb7c4..b441ce3832e 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -416,15 +416,9 @@ retry: goto retry; } if (we.max_disk > (MAX_DISKNO - 1)) { - static int warned; - - if (warned == 0) { - warned++; - printk(VIOD_KERN_INFO - "Only examining the first %d " - "of %d disks connected\n", - MAX_DISKNO, we.max_disk + 1); - } + printk_once(VIOD_KERN_INFO + "Only examining the first %d of %d disks connected\n", + MAX_DISKNO, we.max_disk + 1); } /* Send the close event to OS/400. 
We DON'T expect a response */ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 33f179e66bf..cc9dc79b078 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1129,7 +1129,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, if (error == -EOPNOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) goto out; if (unlikely(error)) { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3e563d25173..e0efc1adcaf 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (!error) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) return error; if (error == -EOPNOTSUPP) @@ -336,7 +336,7 @@ static void stripe_io_hints(struct dm_target *ti, unsigned chunk_size = (sc->chunk_mask + 1) << 9; blk_limits_io_min(limits, chunk_size); - limits->io_opt = chunk_size * sc->stripes; + blk_limits_io_opt(limits, chunk_size * sc->stripes); } static struct target_type stripe_target = { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b4845b14740..eee28fac210 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -130,7 +130,7 @@ struct mapped_device { /* * A list of ios that arrived while we were suspended. 
*/ - atomic_t pending; + atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; struct bio_list deferred; @@ -453,13 +453,14 @@ static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; int cpu; + int rw = bio_data_dir(io->bio); io->start_time = jiffies; cpu = part_stat_lock(); part_round_stats(cpu, &dm_disk(md)->part0); part_stat_unlock(); - dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending); + dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); } static void end_io_acct(struct dm_io *io) @@ -479,8 +480,9 @@ static void end_io_acct(struct dm_io *io) * After this is decremented the bio must not be touched if it is * a barrier. */ - dm_disk(md)->part0.in_flight = pending = - atomic_dec_return(&md->pending); + dm_disk(md)->part0.in_flight[rw] = pending = + atomic_dec_return(&md->pending[rw]); + pending += atomic_read(&md->pending[rw^0x1]); /* nudge anyone waiting on suspend queue */ if (!pending) @@ -586,7 +588,7 @@ static void dec_pending(struct dm_io *io, int error) */ spin_lock_irqsave(&md->deferred_lock, flags); if (__noflush_suspending(md)) { - if (!bio_barrier(io->bio)) + if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER)) bio_list_add_head(&md->deferred, io->bio); } else @@ -598,7 +600,7 @@ static void dec_pending(struct dm_io *io, int error) io_error = io->error; bio = io->bio; - if (bio_barrier(bio)) { + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { /* * There can be just one barrier request so we use * a per-device variable for error reporting. @@ -1209,7 +1211,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) ci.map = dm_get_table(md); if (unlikely(!ci.map)) { - if (!bio_barrier(bio)) + if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) bio_io_error(bio); else if (!md->barrier_error) @@ -1321,7 +1323,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio) * we have to queue this io for later. 
*/ if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || - unlikely(bio_barrier(bio))) { + unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { up_read(&md->io_lock); if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && @@ -1344,7 +1346,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q->queuedata; - if (unlikely(bio_barrier(bio))) { + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { bio_endio(bio, -EOPNOTSUPP); return 0; } @@ -1785,7 +1787,8 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad_disk; - atomic_set(&md->pending, 0); + atomic_set(&md->pending[0], 0); + atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); @@ -2088,7 +2091,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) break; } spin_unlock_irqrestore(q->queue_lock, flags); - } else if (!atomic_read(&md->pending)) + } else if (!atomic_read(&md->pending[0]) && + !atomic_read(&md->pending[1])) break; if (interruptible == TASK_INTERRUPTIBLE && @@ -2164,7 +2168,7 @@ static void dm_wq_work(struct work_struct *work) if (dm_request_based(md)) generic_make_request(c); else { - if (bio_barrier(c)) + if (bio_rw_flagged(c, BIO_RW_BARRIER)) process_barrier(md, c); else __split_and_process_bio(md, c); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5fe39c2a3d2..ea484290544 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -288,7 +288,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) sector_t start_sector; int cpu; - if (unlikely(bio_barrier(bio))) { + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { bio_endio(bio, -EOPNOTSUPP); return 0; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7140909f666..89e76819f61 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -90,7 +90,7 @@ static void multipath_end_request(struct 
bio *bio, int error) if (uptodate) multipath_end_bh_io(mp_bh, 0); - else if (!bio_rw_ahead(bio)) { + else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) { /* * oops, IO error: */ @@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) const int rw = bio_data_dir(bio); int cpu; - if (unlikely(bio_barrier(bio))) { + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { bio_endio(bio, -EOPNOTSUPP); return 0; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 898e2bdfee4..f845ed98fec 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -448,7 +448,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) const int rw = bio_data_dir(bio); int cpu; - if (unlikely(bio_barrier(bio))) { + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { bio_endio(bio, -EOPNOTSUPP); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8726fd7ebce..ff7ed333599 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -782,8 +782,9 @@ static int make_request(struct request_queue *q, struct bio * bio) struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - const int do_sync = bio_sync(bio); - int cpu, do_barriers; + const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + int cpu; + bool do_barriers; mdk_rdev_t *blocked_rdev; /* @@ -797,7 +798,8 @@ static int make_request(struct request_queue *q, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ - if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { + if (unlikely(!mddev->barriers_work && + bio_rw_flagged(bio, BIO_RW_BARRIER))) { if (rw == WRITE) md_write_end(mddev); bio_endio(bio, -EOPNOTSUPP); @@ -925,7 +927,7 @@ static int make_request(struct request_queue *q, struct bio * bio) atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio_barrier(bio); + do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER); if (do_barriers) 
set_bit(R1BIO_Barrier, &r1_bio->state); @@ -1600,7 +1602,7 @@ static void raid1d(mddev_t *mddev) * We already have a nr_pending reference on these rdevs. */ int i; - const int do_sync = bio_sync(r1_bio->master_bio); + const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); clear_bit(R1BIO_BarrierRetry, &r1_bio->state); clear_bit(R1BIO_Barrier, &r1_bio->state); for (i=0; i < conf->raid_disks; i++) @@ -1654,7 +1656,7 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { - const int do_sync = bio_sync(r1_bio->master_bio); + const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3d9020cf6f6..d0a2152e064 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -796,12 +796,12 @@ static int make_request(struct request_queue *q, struct bio * bio) int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); - const int do_sync = bio_sync(bio); + const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; - if (unlikely(bio_barrier(bio))) { + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { bio_endio(bio, -EOPNOTSUPP); return 0; } @@ -1610,7 +1610,7 @@ static void raid10d(mddev_t *mddev) raid_end_bio_io(r10_bio); bio_put(bio); } else { - const int do_sync = bio_sync(r10_bio->master_bio); + const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO); bio_put(bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b8a2c5dc67b..826eb346735 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3606,7 +3606,7 @@ static int make_request(struct request_queue *q, struct bio * bi) const int rw = bio_data_dir(bi); int cpu, remaining; - if (unlikely(bio_barrier(bi))) { + 
if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { bio_endio(bi, -EOPNOTSUPP); return 0; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 662024d8694..5987da85710 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -898,8 +898,10 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_print_sense("", cmd); scsi_print_command(cmd); } - blk_end_request_all(req, -EIO); - scsi_next_command(cmd); + if (blk_end_request_err(req, -EIO)) + scsi_requeue_command(q, cmd); + else + scsi_next_command(cmd); break; case ACTION_REPREP: /* Unprep the request and put it back at the head of the queue. diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c index 84724187ec3..ac8577358ba 100644 --- a/drivers/staging/dst/dcore.c +++ b/drivers/staging/dst/dcore.c @@ -112,8 +112,9 @@ static int dst_request(struct request_queue *q, struct bio *bio) * I worked with. * * Empty barriers are not allowed anyway, see 51fd77bd9f512 - * for example, although later it was changed to bio_discard() - * only, which does not work in this case. + * for example, although later it was changed to + * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not + * work in this case. 
*/ //err = -EOPNOTSUPP; err = 0; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72a2b9c28e9..535f85ba104 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, + DISCARD_FL_BARRIER); } #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5dbefd11b4a..5cf405b0828 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -260,7 +260,7 @@ loop_lock: num_run++; batch_run++; - if (bio_sync(cur)) + if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) num_sync_run++; if (need_resched()) { @@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 18d3a28554a..28c590b7c9d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, goto start_new_extent; if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, - nr_sects, GFP_NOFS); + nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -871,7 +872,8 @@ start_new_extent: } } if (nr_sects) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ea4e6cb29e1..619ba99dfe3 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long 
long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/fs/splice.c b/fs/splice.c index 819023733f8..7394e9e1753 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, len = left; ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret > 0) + if (ret > 0) { *ppos += ret; + file_accessed(in); + } return ret; } @@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ret = file_remove_suid(out); - if (!ret) + if (!ret) { + file_update_time(out); ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } mutex_unlock(&inode->i_mutex); } while (ret > 0); splice_from_pipe_end(pipe, &sd); 
diff --git a/include/linux/bio.h b/include/linux/bio.h index 2892b710771..5be93f18d84 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -142,56 +142,51 @@ struct bio { * * bit 0 -- data direction * If not set, bio is a read from device. If set, it's a write to device. - * bit 1 -- rw-ahead when set - * bit 2 -- barrier + * bit 1 -- fail fast device errors + * bit 2 -- fail fast transport errors + * bit 3 -- fail fast driver errors + * bit 4 -- rw-ahead when set + * bit 5 -- barrier * Insert a serialization point in the IO queue, forcing previously * submitted IO to be completed before this one is issued. - * bit 3 -- synchronous I/O hint. - * bit 4 -- Unplug the device immediately after submitting this bio. - * bit 5 -- metadata request + * bit 6 -- synchronous I/O hint. + * bit 7 -- Unplug the device immediately after submitting this bio. + * bit 8 -- metadata request * Used for tracing to differentiate metadata and data IO. May also * get some preferential treatment in the IO scheduler - * bit 6 -- discard sectors + * bit 9 -- discard sectors * Informs the lower level device that this range of sectors is no longer * used by the file system and may thus be freed by the device. Used * for flash based storage. - * bit 7 -- fail fast device errors - * bit 8 -- fail fast transport errors - * bit 9 -- fail fast driver errors * Don't want driver retries for any fast fail whatever the reason. * bit 10 -- Tell the IO scheduler not to wait for more requests after this one has been submitted, even if it is a SYNC request. 
*/ -#define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ -#define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ -#define BIO_RW_BARRIER 2 -#define BIO_RW_SYNCIO 3 -#define BIO_RW_UNPLUG 4 -#define BIO_RW_META 5 -#define BIO_RW_DISCARD 6 -#define BIO_RW_FAILFAST_DEV 7 -#define BIO_RW_FAILFAST_TRANSPORT 8 -#define BIO_RW_FAILFAST_DRIVER 9 -#define BIO_RW_NOIDLE 10 - -#define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) +enum bio_rw_flags { + BIO_RW, + BIO_RW_FAILFAST_DEV, + BIO_RW_FAILFAST_TRANSPORT, + BIO_RW_FAILFAST_DRIVER, + /* above flags must match REQ_* */ + BIO_RW_AHEAD, + BIO_RW_BARRIER, + BIO_RW_SYNCIO, + BIO_RW_UNPLUG, + BIO_RW_META, + BIO_RW_DISCARD, + BIO_RW_NOIDLE, +}; /* - * Old defines, these should eventually be replaced by direct usage of - * bio_rw_flagged() + * First four bits must match between bio->bi_rw and rq->cmd_flags, make + * that explicit here. */ -#define bio_barrier(bio) bio_rw_flagged(bio, BIO_RW_BARRIER) -#define bio_sync(bio) bio_rw_flagged(bio, BIO_RW_SYNCIO) -#define bio_unplug(bio) bio_rw_flagged(bio, BIO_RW_UNPLUG) -#define bio_failfast_dev(bio) bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV) -#define bio_failfast_transport(bio) \ - bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT) -#define bio_failfast_driver(bio) \ - bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER) -#define bio_rw_ahead(bio) bio_rw_flagged(bio, BIO_RW_AHEAD) -#define bio_rw_meta(bio) bio_rw_flagged(bio, BIO_RW_META) -#define bio_discard(bio) bio_rw_flagged(bio, BIO_RW_DISCARD) -#define bio_noidle(bio) bio_rw_flagged(bio, BIO_RW_NOIDLE) +#define BIO_RW_RQ_MASK 0xf + +static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag) +{ + return (bio->bi_rw & (1 << flag)) != 0; +} /* * upper 16 bits of bi_rw define the io priority of this bio @@ -216,7 +211,7 @@ struct bio { #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define 
bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) +#define bio_empty_barrier(bio) (bio_rw_flagged(bio, BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD)) static inline unsigned int bio_cur_bytes(struct bio *bio) { diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h new file mode 100644 index 00000000000..308734d3d4a --- /dev/null +++ b/include/linux/blk-iopoll.h @@ -0,0 +1,48 @@ +#ifndef BLK_IOPOLL_H +#define BLK_IOPOLL_H + +struct blk_iopoll; +typedef int (blk_iopoll_fn)(struct blk_iopoll *, int); + +struct blk_iopoll { + struct list_head list; + unsigned long state; + unsigned long data; + int weight; + int max; + blk_iopoll_fn *poll; +}; + +enum { + IOPOLL_F_SCHED = 0, + IOPOLL_F_DISABLE = 1, +}; + +/* + * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating + * that we were the first to acquire this iop for scheduling. If this iop + * is currently disabled, return "failure". + */ +static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop) +{ + if (!test_bit(IOPOLL_F_DISABLE, &iop->state)) + return test_and_set_bit(IOPOLL_F_SCHED, &iop->state); + + return 1; +} + +static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop) +{ + return test_bit(IOPOLL_F_DISABLE, &iop->state); +} + +extern void blk_iopoll_sched(struct blk_iopoll *); +extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *); +extern void blk_iopoll_complete(struct blk_iopoll *); +extern void __blk_iopoll_complete(struct blk_iopoll *); +extern void blk_iopoll_enable(struct blk_iopoll *); +extern void blk_iopoll_disable(struct blk_iopoll *); + +extern int blk_iopoll_enabled; + +#endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69103e053c9..e23a86cae5a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -86,13 +86,14 @@ enum { }; /* - * request type modified bits. first two bits match BIO_RW* bits, important + * request type modified bits. 
first four bits match BIO_RW* bits, important */ enum rq_flag_bits { __REQ_RW, /* not set, read. set, write */ __REQ_FAILFAST_DEV, /* no driver retries of device errors */ __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ + /* above flags must match BIO_RW_* */ __REQ_DISCARD, /* request to discard sectors */ __REQ_SORTED, /* elevator knows about this request */ __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ @@ -114,6 +115,7 @@ enum rq_flag_bits { __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NOIDLE, /* Don't anticipate more IO after this one */ __REQ_IO_STAT, /* account I/O stat */ + __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_NR_BITS, /* stops here */ }; @@ -142,6 +144,10 @@ enum rq_flag_bits { #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_NOIDLE (1 << __REQ_NOIDLE) #define REQ_IO_STAT (1 << __REQ_IO_STAT) +#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) + +#define REQ_FAILFAST_MASK (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \ + REQ_FAILFAST_DRIVER) #define BLK_MAX_CDB 16 @@ -453,10 +459,12 @@ struct request_queue #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ +#define QUEUE_FLAG_CQ 16 /* hardware does queuing */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ - (1 << QUEUE_FLAG_STACKABLE)) + (1 << QUEUE_FLAG_STACKABLE) | \ + (1 << QUEUE_FLAG_SAME_COMP)) static inline int queue_is_locked(struct request_queue *q) { @@ -575,6 +583,7 @@ enum { #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) +#define blk_queue_queuing(q) test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define 
blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) @@ -828,11 +837,13 @@ static inline void blk_run_address_space(struct address_space *mapping) } /* - * blk_rq_pos() : the current sector - * blk_rq_bytes() : bytes left in the entire request - * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_sectors() : sectors left in the entire request - * blk_rq_cur_sectors() : sectors left in the current segment + * blk_rq_pos() : the current sector + * blk_rq_bytes() : bytes left in the entire request + * blk_rq_cur_bytes() : bytes left in the current segment + * blk_rq_err_bytes() : bytes left till the next error boundary + * blk_rq_sectors() : sectors left in the entire request + * blk_rq_cur_sectors() : sectors left in the current segment + * blk_rq_err_sectors() : sectors left till the next error boundary */ static inline sector_t blk_rq_pos(const struct request *rq) { @@ -849,6 +860,8 @@ static inline int blk_rq_cur_bytes(const struct request *rq) return rq->bio ? bio_cur_bytes(rq->bio) : 0; } +extern unsigned int blk_rq_err_bytes(const struct request *rq); + static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> 9; @@ -859,6 +872,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_rq_err_sectors(const struct request *rq) +{ + return blk_rq_err_bytes(rq) >> 9; +} + /* * Request issue related functions. 
*/ @@ -885,10 +903,12 @@ extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); extern bool blk_end_request_cur(struct request *rq, int error); +extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); +extern bool __blk_end_request_err(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); @@ -915,6 +935,7 @@ extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); +extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_default_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, @@ -977,15 +998,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, } extern int blkdev_issue_flush(struct block_device *, sector_t *); -extern int blkdev_issue_discard(struct block_device *, - sector_t sector, sector_t nr_sects, gfp_t); +#define DISCARD_FL_WAIT 0x01 /* wait for completion */ +#define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ +extern int blkdev_issue_discard(struct block_device *, sector_t sector, + sector_t nr_sects, gfp_t, int flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks) { block <<= (sb->s_blocksize_bits - 9); nr_blocks <<= (sb->s_blocksize_bits - 9); - return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, 
GFP_KERNEL); + return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, + DISCARD_FL_BARRIER); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); diff --git a/include/linux/fs.h b/include/linux/fs.h index 37f53216998..b21cf6b9c80 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -161,8 +161,8 @@ struct inodes_stat_t { * These aren't really reads or writes, they pass down information about * parts of device that are now unused by the file system. */ -#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) -#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) +#define DISCARD_NOBARRIER (WRITE | (1 << BIO_RW_DISCARD)) +#define DISCARD_BARRIER (DISCARD_NOBARRIER | (1 << BIO_RW_BARRIER)) #define SEL_IN 1 #define SEL_OUT 2 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 45fc320a53c..44263cb2712 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -98,7 +98,7 @@ struct hd_struct { int make_it_fail; #endif unsigned long stamp; - int in_flight; + int in_flight[2]; #ifdef CONFIG_SMP struct disk_stats *dkstats; #else @@ -322,18 +322,23 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct hd_struct *part) +static inline void part_inc_in_flight(struct hd_struct *part, int rw) { - part->in_flight++; + part->in_flight[rw]++; if (part->partno) - part_to_disk(part)->part0.in_flight++; + part_to_disk(part)->part0.in_flight[rw]++; } -static inline void part_dec_in_flight(struct hd_struct *part) +static inline void part_dec_in_flight(struct hd_struct *part, int rw) { - part->in_flight--; + part->in_flight[rw]--; if (part->partno) - part_to_disk(part)->part0.in_flight--; + part_to_disk(part)->part0.in_flight[rw]--; +} + +static inline int part_in_flight(struct hd_struct *part) +{ + return part->in_flight[0] + part->in_flight[1]; } /* 
block/blk-core.c */ @@ -546,6 +551,8 @@ extern ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf); #ifdef CONFIG_FAIL_MAKE_REQUEST extern ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 1ac57e522a1..8e9e151f811 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -348,6 +348,7 @@ enum NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, + BLOCK_IOPOLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3125cff1c57..6bb59f70740 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -91,6 +91,7 @@ extern int sysctl_nr_trim_pages; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +extern int blk_iopoll_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -997,7 +998,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - + { + .ctl_name = CTL_UNNUMBERED, + .procname = "blk_iopoll", + .data = &blk_iopoll_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/swapfile.c b/mm/swapfile.c index 8ffdc0d23c5..74f1102e874 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si) } err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL); + nr_blocks, GFP_KERNEL, + DISCARD_FL_BARRIER); if (err) break; @@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 
9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO)) + nr_blocks, GFP_NOIO, + DISCARD_FL_BARRIER)) break; } |