diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-block | 64 | ||||
-rw-r--r-- | block/blk-cgroup.c | 200 | ||||
-rw-r--r-- | block/blk-cgroup.h | 40 | ||||
-rw-r--r-- | block/blk-core.c | 32 | ||||
-rw-r--r-- | block/blk-exec.c | 2 | ||||
-rw-r--r-- | block/blk-flush.c | 16 | ||||
-rw-r--r-- | block/blk-ioc.c | 3 | ||||
-rw-r--r-- | block/blk-lib.c | 82 | ||||
-rw-r--r-- | block/blk-settings.c | 9 | ||||
-rw-r--r-- | block/blk-sysfs.c | 3 | ||||
-rw-r--r-- | block/blk-throttle.c | 313 | ||||
-rw-r--r-- | block/blk.h | 23 | ||||
-rw-r--r-- | block/cfq-iosched.c | 232 | ||||
-rw-r--r-- | block/elevator.c | 11 | ||||
-rw-r--r-- | drivers/ata/libata-scsi.c | 13 | ||||
-rw-r--r-- | drivers/block/paride/pcd.c | 2 | ||||
-rw-r--r-- | drivers/cdrom/viocd.c | 4 | ||||
-rw-r--r-- | drivers/ide/ide-cd.c | 3 | ||||
-rw-r--r-- | drivers/scsi/sr.c | 2 | ||||
-rw-r--r-- | fs/block_dev.c | 17 | ||||
-rw-r--r-- | fs/partitions/check.c | 8 | ||||
-rw-r--r-- | include/linux/blk_types.h | 2 | ||||
-rw-r--r-- | include/linux/blkdev.h | 15 | ||||
-rw-r--r-- | include/linux/genhd.h | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 4 |
25 files changed, 785 insertions, 317 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 4873c759d53..c1eb41cb987 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -142,3 +142,67 @@ Description: with the previous I/O request are enabled. When set to 2, all merge tries are disabled. The default value is 0 - which enables all types of merge tries. + +What: /sys/block/<disk>/discard_alignment +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may + internally allocate space in units that are bigger than + the exported logical block size. The discard_alignment + parameter indicates how many bytes the beginning of the + device is offset from the internal allocation unit's + natural alignment. + +What: /sys/block/<disk>/<partition>/discard_alignment +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may + internally allocate space in units that are bigger than + the exported logical block size. The discard_alignment + parameter indicates how many bytes the beginning of the + partition is offset from the internal allocation unit's + natural alignment. + +What: /sys/block/<disk>/queue/discard_granularity +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may + internally allocate space using units that are bigger + than the logical block size. The discard_granularity + parameter indicates the size of the internal allocation + unit in bytes if reported by the device. Otherwise the + discard_granularity will be set to match the device's + physical block size. A discard_granularity of 0 means + that the device does not support discard functionality. + +What: /sys/block/<disk>/queue/discard_max_bytes +Date: May 2011 +Contact: Martin K. 
Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may have + internal limits on the number of bytes that can be + trimmed or unmapped in a single operation. Some storage + protocols also have inherent limits on the number of + blocks that can be described in a single command. The + discard_max_bytes parameter is set by the device driver + to the maximum number of bytes that can be discarded in + a single operation. Discard requests issued to the + device must not exceed this limit. A discard_max_bytes + value of 0 means that the device does not support + discard functionality. + +What: /sys/block/<disk>/queue/discard_zeroes_data +Date: May 2011 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Devices that support discard functionality may return + stale or random data when a previously discarded block + is read back. This can cause problems if the filesystem + expects discarded blocks to be explicitly cleared. If a + device reports that it deterministically returns zeroes + when a discarded area is read the discard_zeroes_data + parameter will be set to one. Otherwise it will be 0 and + the result of reading a discarded area is undefined. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 471fdcc5df8..07371cfdfae 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -385,25 +385,40 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, spin_lock_irqsave(&blkg->stats_lock, flags); blkg->stats.time += time; +#ifdef CONFIG_DEBUG_BLK_CGROUP blkg->stats.unaccounted_time += unaccounted_time; +#endif spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); +/* + * should be called under rcu read lock or queue lock to make sure blkg pointer + * is valid. 
+ */ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) { - struct blkio_group_stats *stats; + struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - stats->sectors += bytes >> 9; - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, - sync); - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. + */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + stats_cpu->sectors += bytes >> 9; + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], + 1, direction, sync); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], + bytes, direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); @@ -426,18 +441,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, } EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); +/* Merged stats are per cpu. */ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync) { + struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, - sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. 
+ */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, + direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); +/* + * This function allocates the per cpu stats for blkio_group. Should be called + * from sleepable context as alloc_percpu() requires that. + */ +int blkio_alloc_blkg_stats(struct blkio_group *blkg) +{ + /* Allocate memory for per cpu stats */ + blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); + if (!blkg->stats_cpu) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid) @@ -508,6 +549,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); +static void blkio_reset_stats_cpu(struct blkio_group *blkg) +{ + struct blkio_group_stats_cpu *stats_cpu; + int i, j, k; + /* + * Note: On 64 bit arch this should not be an issue. This has the + * possibility of returning some inconsistent value on 32bit arch + * as 64bit update on 32bit is non atomic. Taking care of this + * corner case makes code very complicated, like sending IPIs to + * cpus, taking care of stats of offline cpus etc. + * + * reset stats is anyway more of a debug feature and this sounds a + * corner case. So I am not complicating the code yet until and + * unless this becomes a real issue.
+ */ + for_each_possible_cpu(i) { + stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); + stats_cpu->sectors = 0; + for(j = 0; j < BLKIO_STAT_CPU_NR; j++) + for (k = 0; k < BLKIO_STAT_TOTAL; k++) + stats_cpu->stat_arr_cpu[j][k] = 0; + } +} + static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -552,7 +617,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) } #endif spin_unlock(&blkg->stats_lock); + + /* Reset Per cpu stats which don't take blkg->stats_lock */ + blkio_reset_stats_cpu(blkg); } + spin_unlock_irq(&blkcg->lock); return 0; } @@ -598,6 +667,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, return val; } + +static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, + enum stat_type_cpu type, enum stat_sub_type sub_type) +{ + int cpu; + struct blkio_group_stats_cpu *stats_cpu; + u64 val = 0, tval; + + for_each_possible_cpu(cpu) { + unsigned int start; + stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); + + do { + start = u64_stats_fetch_begin(&stats_cpu->syncp); + if (type == BLKIO_STAT_CPU_SECTORS) + tval = stats_cpu->sectors; + else + tval = stats_cpu->stat_arr_cpu[type][sub_type]; + } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); + + val += tval; + } + + return val; +} + +static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) +{ + uint64_t disk_total, val; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_CPU_SECTORS) { + val = blkio_read_stat_cpu(blkg, type, 0); + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); + } + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + val = blkio_read_stat_cpu(blkg, type, sub_type); + cb->fill(cb, key_str, val); + } + + disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + + 
blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); + + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + /* This should be called with blkg->stats_lock held */ static uint64_t blkio_get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) @@ -609,9 +731,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, if (type == BLKIO_STAT_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.time, cb, dev); - if (type == BLKIO_STAT_SECTORS) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.sectors, cb, dev); #ifdef CONFIG_DEBUG_BLK_CGROUP if (type == BLKIO_STAT_UNACCOUNTED_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, @@ -1075,8 +1194,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, } static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, - struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, - bool show_total) + struct cftype *cft, struct cgroup_map_cb *cb, + enum stat_type type, bool show_total, bool pcpu) { struct blkio_group *blkg; struct hlist_node *n; @@ -1087,10 +1206,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, if (blkg->dev) { if (!cftype_blkg_same_policy(cft, blkg)) continue; - spin_lock_irq(&blkg->stats_lock); - cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, - type); - spin_unlock_irq(&blkg->stats_lock); + if (pcpu) + cgroup_total += blkio_get_stat_cpu(blkg, cb, + blkg->dev, type); + else { + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, + blkg->dev, type); + spin_unlock_irq(&blkg->stats_lock); + } } } if (show_total) @@ -1114,47 +1238,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, switch(name) { case BLKIO_PROP_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_TIME, 0); + BLKIO_STAT_TIME, 0, 0); case BLKIO_PROP_sectors: return 
blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SECTORS, 0); + BLKIO_STAT_CPU_SECTORS, 0, 1); case BLKIO_PROP_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); case BLKIO_PROP_io_serviced: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); + BLKIO_STAT_CPU_SERVICED, 1, 1); case BLKIO_PROP_io_service_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_TIME, 1); + BLKIO_STAT_SERVICE_TIME, 1, 0); case BLKIO_PROP_io_wait_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_WAIT_TIME, 1); + BLKIO_STAT_WAIT_TIME, 1, 0); case BLKIO_PROP_io_merged: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_MERGED, 1); + BLKIO_STAT_CPU_MERGED, 1, 1); case BLKIO_PROP_io_queued: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_QUEUED, 1); + BLKIO_STAT_QUEUED, 1, 0); #ifdef CONFIG_DEBUG_BLK_CGROUP case BLKIO_PROP_unaccounted_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_UNACCOUNTED_TIME, 0); + BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); case BLKIO_PROP_dequeue: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_DEQUEUE, 0); + BLKIO_STAT_DEQUEUE, 0, 0); case BLKIO_PROP_avg_queue_size: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_AVG_QUEUE_SIZE, 0); + BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); case BLKIO_PROP_group_wait_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_GROUP_WAIT_TIME, 0); + BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); case BLKIO_PROP_idle_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_IDLE_TIME, 0); + BLKIO_STAT_IDLE_TIME, 0, 0); case BLKIO_PROP_empty_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_EMPTY_TIME, 0); + BLKIO_STAT_EMPTY_TIME, 0, 0); #endif default: BUG(); @@ -1164,10 +1288,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, switch(name){ case BLKIO_THROTL_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, - 
BLKIO_STAT_SERVICE_BYTES, 1); + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); case BLKIO_THROTL_io_serviced: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); + BLKIO_STAT_CPU_SERVICED, 1, 1); default: BUG(); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index c774930cc20..a71d2904ffb 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -14,6 +14,7 @@ */ #include <linux/cgroup.h> +#include <linux/u64_stats_sync.h> enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ @@ -36,22 +37,15 @@ enum stat_type { * request completion for IOs doen by this cgroup. This may not be * accurate when NCQ is turned on. */ BLKIO_STAT_SERVICE_TIME = 0, - /* Total bytes transferred */ - BLKIO_STAT_SERVICE_BYTES, - /* Total IOs serviced, post merge */ - BLKIO_STAT_SERVICED, /* Total time spent waiting in scheduler queue in ns */ BLKIO_STAT_WAIT_TIME, - /* Number of IOs merged */ - BLKIO_STAT_MERGED, /* Number of IOs queued up */ BLKIO_STAT_QUEUED, /* All the single valued stats go below this */ BLKIO_STAT_TIME, - BLKIO_STAT_SECTORS, +#ifdef CONFIG_DEBUG_BLK_CGROUP /* Time not charged to this cgroup */ BLKIO_STAT_UNACCOUNTED_TIME, -#ifdef CONFIG_DEBUG_BLK_CGROUP BLKIO_STAT_AVG_QUEUE_SIZE, BLKIO_STAT_IDLE_TIME, BLKIO_STAT_EMPTY_TIME, @@ -60,6 +54,18 @@ enum stat_type { #endif }; +/* Per cpu stats */ +enum stat_type_cpu { + BLKIO_STAT_CPU_SECTORS, + /* Total bytes transferred */ + BLKIO_STAT_CPU_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_CPU_SERVICED, + /* Number of IOs merged */ + BLKIO_STAT_CPU_MERGED, + BLKIO_STAT_CPU_NR +}; + enum stat_sub_type { BLKIO_STAT_READ = 0, BLKIO_STAT_WRITE, @@ -116,11 +122,11 @@ struct blkio_cgroup { struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; - uint64_t sectors; - /* Time not charged to this cgroup */ - uint64_t unaccounted_time; uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef 
CONFIG_DEBUG_BLK_CGROUP + /* Time not charged to this cgroup */ + uint64_t unaccounted_time; + /* Sum of number of IOs queued across all samples */ uint64_t avg_queue_size_sum; /* Count of samples taken for average */ @@ -145,6 +151,13 @@ struct blkio_group_stats { #endif }; +/* Per cpu blkio group stats */ +struct blkio_group_stats_cpu { + uint64_t sectors; + uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; + struct u64_stats_sync syncp; +}; + struct blkio_group { /* An rcu protected unique identifier for the group */ void *key; @@ -160,6 +173,8 @@ struct blkio_group { /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; struct blkio_group_stats stats; + /* Per cpu stats pointer */ + struct blkio_group_stats_cpu __percpu *stats_cpu; }; struct blkio_policy_node { @@ -295,6 +310,7 @@ extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid); +extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -322,6 +338,8 @@ static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid) {} +static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } + static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index 3fe00a14822..c8303e9d919 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -569,8 +569,6 @@ int blk_get_queue(struct request_queue *q) static inline void blk_free_request(struct request_queue *q, struct request *rq) { - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); mempool_free(rq, 
q->rq.rq_pool); @@ -1110,14 +1108,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; - /* - * Debug stuff, kill later - */ - if (!rq_mergeable(req)) { - blk_dump_rq_flags(req, "back"); - return false; - } - if (!ll_back_merge_fn(q, req, bio)) return false; @@ -1132,6 +1122,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); return true; } @@ -1141,14 +1132,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, const int ff = bio->bi_rw & REQ_FAILFAST_MASK; sector_t sector; - /* - * Debug stuff, kill later - */ - if (!rq_mergeable(req)) { - blk_dump_rq_flags(req, "front"); - return false; - } - if (!ll_front_merge_fn(q, req, bio)) return false; @@ -1173,6 +1156,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); return true; } @@ -1258,14 +1242,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) el_ret = elv_merge(q, &req, bio); if (el_ret == ELEVATOR_BACK_MERGE) { - BUG_ON(req->cmd_flags & REQ_ON_PLUG); if (bio_attempt_back_merge(q, req, bio)) { if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; } } else if (el_ret == ELEVATOR_FRONT_MERGE) { - BUG_ON(req->cmd_flags & REQ_ON_PLUG); if (bio_attempt_front_merge(q, req, bio)) { if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); @@ -1320,10 +1302,6 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } - /* - * Debug flag, kill later - */ - req->cmd_flags |= REQ_ON_PLUG; list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); } else { @@ -1550,7 +1528,8 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - blk_throtl_bio(q, &bio); + if 
(blk_throtl_bio(q, &bio)) + goto end_io; /* * If bio = NULL, bio has been throttled and will be submitted @@ -2748,7 +2727,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); - BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); BUG_ON(!rq->q); if (rq->q != q) { /* @@ -2760,8 +2738,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) depth = 0; spin_lock(q->queue_lock); } - rq->cmd_flags &= ~REQ_ON_PLUG; - /* * rq is already accounted, so use raw insert */ diff --git a/block/blk-exec.c b/block/blk-exec.c index 81e31819a59..8a0e7ec056e 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -56,7 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, spin_lock_irq(q->queue_lock); __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be plugged+unplugged */ + /* the queue is stopped so it won't be run */ if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); spin_unlock_irq(q->queue_lock); diff --git a/block/blk-flush.c b/block/blk-flush.c index 6c9b5e189e6..bb21e4c36f7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error) } /* - * Moving a request silently to empty queue_head may stall the - * queue. Kick the queue in those cases. This function is called - * from request completion path and calling directly into - * request_fn may confuse the driver. Always use kblockd. + * Kick the queue to avoid stall for two cases: + * 1. Moving a request silently to empty queue_head may stall the + * queue. + * 2. When flush request is running in non-queueable queue, the + * queue is held. Restart the queue after flush request is finished + * to avoid stall. + * This function is called from request completion path and calling + * directly into request_fn may confuse the driver. Always use + * kblockd.
*/ - if (queued) + if (queued || q->flush_queue_delayed) blk_run_queue_async(q); + q->flush_queue_delayed = 0; } /** diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b791022beef..c898049dafd 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + ret->cgroup_changed = 0; +#endif } return ret; diff --git a/block/blk-lib.c b/block/blk-lib.c index 25de73e4759..78e627e2581 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,17 +9,20 @@ #include "blk.h" -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } +struct bio_batch { + atomic_t done; + unsigned long flags; + struct completion *wait; +}; - if (bio->bi_private) - complete(bio->bi_private); +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + if (err && (err != -EOPNOTSUPP)) + clear_bit(BIO_UPTODATE, &bb->flags); + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); bio_put(bio); } @@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; + struct bio_batch bb; struct bio *bio; int ret = 0; @@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, type |= REQ_SECURE; } - while (nr_sects && !ret) { + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + while (nr_sects) { bio = bio_alloc(gfp_mask, 1); if (!bio) { ret = -ENOMEM; @@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } bio->bi_sector = sector; - bio->bi_end_io = 
blkdev_discard_end_io; + bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; - bio->bi_private = &wait; + bio->bi_private = &bb; if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; @@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, nr_sects = 0; } - bio_get(bio); + atomic_inc(&bb.done); submit_bio(type, bio); + } + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) wait_for_completion(&wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } + if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -EIO; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); -struct bio_batch -{ - atomic_t done; - unsigned long flags; - struct completion *wait; -}; - -static void bio_batch_end_io(struct bio *bio, int err) -{ - struct bio_batch *bb = bio->bi_private; - - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bb->flags); - else - clear_bit(BIO_UPTODATE, &bb->flags); - } - if (bb) - if (atomic_dec_and_test(&bb->done)) - complete(bb->wait); - bio_put(bio); -} - /** * blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bb.flags = 1 << BIO_UPTODATE; bb.wait = &wait; -submit: ret = 0; while (nr_sects != 0) { bio = bio_alloc(gfp_mask, @@ -168,9 +151,6 @@ submit: while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - if (sz == 0) - /* bio has maximum size possible */ - break; ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); nr_sects -= ret >> 9; sector += ret >> 9; @@ -190,16 +170,6 @@ submit: /* One of bios in the batch was completed with error.*/ ret = -EIO; - if (ret) - goto out; - - if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { - ret = -EOPNOTSUPP; - goto out; - } - if (nr_sects != 0) - goto submit; -out: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); 
diff --git a/block/blk-settings.c b/block/blk-settings.c index 1fa76929359..fa1eb0449a0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; - lim->discard_zeroes_data = -1; + lim->discard_zeroes_data = 1; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_set_default_limits(&q->limits); blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); + q->limits.discard_zeroes_data = 0; /* * by default assume old behaviour and bounce for any highmem page @@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush) } EXPORT_SYMBOL_GPL(blk_queue_flush); +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + q->flush_not_queueable = !queueable; +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bd236313f35..d935bd859c8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_discard_sectors << 9, page); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_discard_sectors << 9); } static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 252a81a306f..a62be8d0dc1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -78,6 +78,8 @@ struct throtl_grp { /* Some throttle limits got updated for the group */ 
int limits_changed; + + struct rcu_head rcu_head; }; struct throtl_data @@ -88,7 +90,7 @@ struct throtl_data /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; - struct throtl_grp root_tg; + struct throtl_grp *root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) return tg; } |