 Documentation/block/cfq-iosched.txt        |  45
 Documentation/cgroups/blkio-controller.txt |  28
 block/blk-cgroup.c                         |   2
 block/blk-core.c                           |   6
 block/blk-sysfs.c                          |   1
 block/blk.h                                |   8
 block/cfq-iosched.c                        | 103
 block/elevator.c                           |  44
 drivers/block/cciss.c                      |  11
 drivers/block/loop.c                       |   2
 drivers/block/mg_disk.c                    |   3
 drivers/s390/char/tape_block.c             |   3
 fs/bio-integrity.c                         |   4
 fs/fs-writeback.c                          |   2
 include/linux/elevator.h                   |   1
 lib/scatterlist.c                          |  14
 mm/backing-dev.c                           |   7
 17 files changed, 238 insertions(+), 46 deletions(-)
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
new file mode 100644
index 00000000000..e578feed6d8
--- /dev/null
+++ b/Documentation/block/cfq-iosched.txt
@@ -0,0 +1,45 @@
+CFQ ioscheduler tunables
+========================
+
+slice_idle
+----------
+This specifies how long CFQ should idle for next request on certain cfq queues
+(for sequential workloads) and service trees (for random workloads) before
+queue is expired and CFQ selects next queue to dispatch from.
+
+By default slice_idle is a non-zero value. That means by default we idle on
+queues/service trees. This can be very helpful on highly seeky media like
+single spindle SATA/SAS disks where we can cut down on overall number of
+seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues/service tree
+level and one should see an overall improved throughput on faster storage
+devices like multiple SATA/SAS disks in hardware RAID configuration. The down
+side is that isolation provided from WRITES also goes down and notion of
+IO priority becomes weaker.
+
+So depending on storage and workload, it might be useful to set slice_idle=0.
+In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
+keeping slice_idle enabled should be useful. For any configurations where
+there are multiple spindles behind single LUN (Host based hardware RAID
+controller or for storage arrays), setting slice_idle=0 might end up in better
+throughput and acceptable latencies.
+
+CFQ IOPS Mode for group scheduling
+===================================
+Basic CFQ design is to provide priority based time slices. Higher priority
+process gets bigger time slice and lower priority process gets smaller time
+slice. Measuring time becomes harder if storage is fast and supports NCQ and
+it would be better to dispatch multiple requests from multiple cfq queues in
+request queue at a time. In such scenario, it is not possible to measure time
+consumed by single queue accurately.
+
+What is possible though is to measure number of requests dispatched from a
+single queue and also allow dispatch from multiple cfq queue at the same time.
+This effectively becomes the fairness in terms of IOPS (IO operations per
+second).
+
+If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
+to IOPS mode and starts providing fairness in terms of number of requests
+dispatched. Note that this mode switching takes effect only for group
+scheduling. For non-cgroup users nothing should change.
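The slice_idle knob documented above is exposed per device under /sys/block/<disk>/queue/iosched/, as the blkio-controller.txt changes further down describe. As a rough illustration (not part of the patch), a small userspace helper that flips a CFQ device into IOPS mode by writing 0 to slice_idle could look like the following; the device name sda is only an assumed example and would normally be a parameter:

/* Sketch: write 0 to slice_idle so CFQ can switch to IOPS accounting on
 * NCQ-capable storage. The sysfs path layout comes from the documentation
 * in this patch; "sda" is an illustrative device name, not part of it.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/iosched/slice_idle";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);		/* device missing, or not using CFQ */
		return 1;
	}
	fprintf(f, "0\n");		/* 0 = no queue idling */
	fclose(f);
	return 0;
}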
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 48e0b21b005..6919d62591d 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -217,6 +217,7 @@ Details of cgroup files
 CFQ sysfs tunable
 =================
 /sys/block/<disk>/queue/iosched/group_isolation
+-----------------------------------------------
 
 If group_isolation=1, it provides stronger isolation between groups at the
 expense of throughput. By default group_isolation is 0. In general that
@@ -243,6 +244,33 @@ By default one should run with group_isolation=0. If that is not sufficient
 and one wants stronger isolation between groups, then set group_isolation=1
 but this will come at cost of reduced throughput.
 
+/sys/block/<disk>/queue/iosched/slice_idle
+------------------------------------------
+On a faster hardware CFQ can be slow, especially with sequential workload.
+This happens because CFQ idles on a single queue and single queue might not
+drive deeper request queue depths to keep the storage busy. In such scenarios
+one can try setting slice_idle=0 and that would switch CFQ to IOPS
+(IO operations per second) mode on NCQ supporting hardware.
+
+That means CFQ will not idle between cfq queues of a cfq group and hence be
+able to driver higher queue depth and achieve better throughput. That also
+means that cfq provides fairness among groups in terms of IOPS and not in
+terms of disk time.
+
+/sys/block/<disk>/queue/iosched/group_idle
+------------------------------------------
+If one disables idling on individual cfq queues and cfq service trees by
+setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
+on the group in an attempt to provide fairness among groups.
+
+By default group_idle is same as slice_idle and does not do anything if
+slice_idle is enabled.
+
+One can experience an overall throughput drop if you have created multiple
+groups and put applications in that group which are not driving enough
+IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
+on individual groups and throughput should improve.
+
 What works
 ==========
 - Currently only sync IO queues are support. All the buffered writes are
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a6809645d21..2fef1ef931a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -966,7 +966,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 
 	/* Currently we do not support hierarchy deeper than two level (0,1) */
 	if (parent != cgroup->top_cgroup)
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EPERM);
 
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
diff --git a/block/blk-core.c b/block/blk-core.c
index ee1a1e7e63c..32a1c123dfb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1198,9 +1198,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret;
 	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
-	const bool sync = (bio->bi_rw & REQ_SYNC);
-	const bool unplug = (bio->bi_rw & REQ_UNPLUG);
-	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	const bool sync = !!(bio->bi_rw & REQ_SYNC);
+	const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
+	const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
 	if ((bio->bi_rw & REQ_HARDBARRIER) &&
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 001ab18078f..0749b89c688 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -511,6 +511,7 @@ int blk_register_queue(struct gendisk *disk)
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
 		kobject_del(&q->kobj);
 		blk_trace_remove_sysfs(disk_to_dev(disk));
+		kobject_put(&dev->kobj);
 		return ret;
 	}
 
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e..d6b911ac002 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 
 static inline int blk_cpu_to_group(int cpu)
 {
+	int group = NR_CPUS;
 #ifdef CONFIG_SCHED_MC
 	const struct cpumask *mask = cpu_coregroup_mask(cpu);
-	return cpumask_first(mask);
+	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	return cpumask_first(topology_thread_cpumask(cpu));
+	group = cpumask_first(topology_thread_cpumask(cpu));
 #else
 	return cpu;
 #endif
+	if (likely(group < NR_CPUS))
+		return group;
+	return cpu;
 }
 
 /*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index eb4086f7dfe..f65c6f01c47 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
+static int cfq_group_idle = HZ / 125;
 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
 static const int cfq_hist_divisor = 4;
 
@@ -147,6 +148,8 @@ struct cfq_queue {
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
 	struct cfq_group *orig_cfqg;
+	/* Number of sectors dispatched from queue in single dispatch round */
+	unsigned long nr_sectors;
 };
 
 /*
@@ -198,6 +201,8 @@ struct cfq_group {
 	struct hlist_node cfqd_node;
 	atomic_t ref;
 #endif
+	/* number of requests that are on the dispatch list or inside driver */
+	int dispatched;
 };
 
 /*
@@ -271,6 +276,7 @@ struct cfq_data {
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
+	unsigned int cfq_group_idle;
 	unsigned int cfq_latency;
 	unsigned int cfq_group_isolation;
 
@@ -378,6 +384,21 @@ CFQ_CFQQ_FNS(wait_busy);
 			&cfqg->service_trees[i][j]: NULL) \
 
+static inline bool iops_mode(struct cfq_data *cfqd)
+{
+	/*
+	 * If we are not idling on queues and it is a NCQ drive, parallel
+	 * execution of requests is on and measuring time is not possible
+	 * in most of the cases until and unless we drive shallower queue
+	 * depths and that becomes a performance bottleneck. In such cases
+	 * switch to start providing fairness in terms of number of IOs.
+	 */
+	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
+		return true;
+	else
+		return false;
+}
+
 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
 {
 	if (cfq_class_idle(cfqq))
@@ -906,7 +927,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 			slice_used = cfqq->allocated_slice;
 	}
 
-	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
 	return slice_used;
 }
 
@@ -914,19 +934,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	unsigned int used_sl, charge_sl;
+	unsigned int used_sl, charge;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
 
 	BUG_ON(nr_sync < 0);
-	used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
+	used_sl = charge = cfq_cfqq_slice_usage(cfqq);
 
-	if (!cfq_cfqq_sync(cfqq) && !nr_sync)
-		charge_sl = cfqq->allocated_slice;
+	if (iops_mode(cfqd))
+		charge = cfqq->slice_dispatch;
+	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
+		charge = cfqq->allocated_slice;
 
 	/* Can't update vdisktime while group is on service tree */
 	cfq_rb_erase(&cfqg->rb_node, st);
-	cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
+	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
 	__cfq_group_service_tree_add(st, cfqg);
 
 	/* This group is being expired. Save the context */
@@ -940,6 +962,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
+	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
+			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
+			iops_mode(cfqd), cfqq->nr_sectors);
 	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
 	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
 }
@@ -1587,6 +1612,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 		cfqq->allocated_slice = 0;
 		cfqq->slice_end = 0;
 		cfqq->slice_dispatch = 0;
+		cfqq->nr_sectors = 0;
 
 		cfq_clear_cfqq_wait_request(cfqq);
 		cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1839,6 +1865,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(!service_tree);
 	BUG_ON(!service_tree->count);
 
+	if (!cfqd->cfq_slice_idle)
+		return false;
+
 	/* We never do for idle class queues. */
 	if (prio == IDLE_WORKLOAD)
 		return false;
@@ -1863,7 +1892,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 	struct cfq_io_context *cic;
-	unsigned long sl;
+	unsigned long sl, group_idle = 0;
 
 	/*
 	 * SSD device without seek penalty, disable idling. But only do so
@@ -1879,8 +1908,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	/*
 	 * idle is disabled, either manually or by past process history
 	 */
-	if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
-		return;
+	if (!cfq_should_idle(cfqd, cfqq)) {
+		/* no queue idling. Check for group idling */
+		if (cfqd->cfq_group_idle)
+			group_idle = cfqd->cfq_group_idle;
+		else
+			return;
+	}
 
 	/*
 	 * still active requests from this queue, don't idle
@@ -1907,13 +1941,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		return;
 	}
 
+	/* There are other queues in the group, don't do group idle */
+	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
+		return;
+
 	cfq_mark_cfqq_wait_request(cfqq);
 
-	sl = cfqd->cfq_slice_idle;
+	if (group_idle)
+		sl = cfqd->cfq_group_idle;
+	else
+		sl = cfqd->cfq_slice_idle;
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
-	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
+	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
+			group_idle ? 1 : 0);
 }
 
 /*
@@ -1929,9 +1971,11 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
+	(RQ_CFQG(rq))->dispatched++;
 	elv_dispatch_sort(q, rq);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
+	cfqq->nr_sectors += blk_rq_sectors(rq);
 	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
 					rq_data_dir(rq), rq_is_sync(rq));
 }
@@ -2198,7 +2242,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 			cfqq = NULL;
 			goto keep_queue;
 		} else
-			goto expire;
+			goto check_group_idle;
 	}
 
 	/*
@@ -2226,8 +2270,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	 * flight or is idling for a new request, allow either of these
 	 * conditions to happen (or time out) before selecting a new queue.
 	 */
-	if (timer_pending(&cfqd->idle_slice_timer) ||
-	    (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
+	if (timer_pending(&cfqd->idle_slice_timer)) {
+		cfqq = NULL;
+		goto keep_queue;
+	}
+
+	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
+		cfqq = NULL;
+		goto keep_queue;
+	}
+
+	/*
+	 * If group idle is enabled and there are requests dispatched from
+	 * this group, wait for requests to complete.
+	 */
+check_group_idle:
+	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
+	    && cfqq->cfqg->dispatched) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
@@ -3375,6 +3434,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
+	(RQ_CFQG(rq))->dispatched--;
 	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
 			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
 			rq_data_dir(rq), rq_is_sync(rq));
@@ -3404,7 +3464,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		 * the queue.
 		 */
 		if (cfq_should_wait_busy(cfqd, cfqq)) {
-			cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
+			unsigned long extend_sl = cfqd->cfq_slice_idle;
+			if (!cfqd->cfq_slice_idle)
+				extend_sl = cfqd->cfq_group_idle;
+			cfqq->slice_end = jiffies + extend_sl;
 			cfq_mark_cfqq_wait_busy(cfqq);
 			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
 		}
@@ -3850,6 +3913,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
+	cfqd->cfq_group_idle = cfq_group_idle;
 	cfqd->cfq_latency = 1;
 	cfqd->cfq_group_isolation = 0;
 	cfqd->hw_tag = -1;
@@ -3922,6 +3986,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3954,6 +4019,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
 		UINT_MAX, 0);
 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3975,6 +4041,7 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_async),
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
+	CFQ_ATTR(group_idle),
 	CFQ_ATTR(low_latency),
 	CFQ_ATTR(group_isolation),
 	__ATTR_NULL
@@ -4028,6 +4095,12 @@ static int __init cfq_init(void)
 	if (!cfq_slice_idle)
 		cfq_slice_idle = 1;
 
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	if (!cfq_group_idle)
+		cfq_group_idle = 1;
+#else
+		cfq_group_idle = 0;
+#endif
 	if (cfq_slab_setup())
 		return -ENOMEM;
 
diff --git a/block/elevator.c b/block/elevator.c
index ec585c9554d..205b09a5bd9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1009,18 +1009,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old_elevator, *e;
 	void *data;
+	int err;
 
 	/*
 	 * Allocate new elevator
 	 */
 	e = elevator_alloc(q, new_e);
 	if (!e)
-		return 0;
+		return -ENOMEM;
 
 	data = elevator_init_queue(q, e);
 	if (!data) {
 		kobject_put(&e->kobj);
-		return 0;
+		return -ENOMEM;
 	}
 
 	/*
@@ -1043,7 +1044,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 
 	__elv_unregister_queue(old_elevator);
 
-	if (elv_register_queue(q))
+	err = elv_register_queue(q);
+	if (err)
 		goto fail_register;
 
 	/*
@@ -1056,7 +1058,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 
 	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
 
-	return 1;
+	return 0;
 
 fail_register:
 	/*
@@ -1071,17 +1073,19 @@ fail_register:
 	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
 	spin_unlock_irq(q->queue_lock);
 
-	return 0;
+	return err;
 }
 
-ssize_t elv_iosched_store(struct request_queue *q, const char *name,
-			  size_t count)
+/*
+ * Switch this queue to the given IO scheduler.
+ */
+int elevator_change(struct request_queue *q, const char *name)
 {
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
 
 	if (!q->elevator)
-		return count;
+		return -ENXIO;
 
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	e = elevator_get(strstrip(elevator_name));
@@ -1092,13 +1096,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 
 	if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
 		elevator_put(e);
-		return count;
+		return 0;
 	}
 
-	if (!elevator_switch(q, e))
-		printk(KERN_ERR "elevator: switch to %s failed\n",
-							elevator_name);
-	return count;
+	return elevator_switch(q, e);
+}
+EXPORT_SYMBOL(elevator_change);
+
+ssize_t elv_iosched_store(struct request_queue *q, const char *name,
+			  size_t count)
+{
+	int ret;
+
+	if (!q->elevator)
+		return count;
+
+	ret = elevator_change(q, name);
+	if (!ret)
+		return count;
+
+	printk(KERN_ERR "elevator: switch to %s failed\n", name);
+	return ret;
 }
 
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 31064df1370..6124c2fd2d3 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -297,6 +297,8 @@ static void enqueue_cmd_and_start_io(ctlr_info_t *h,
 	spin_lock_irqsave(&h->lock, flags);
 	addQ(&h->reqQ, c);
 	h->Qdepth++;
+	if (h->Qdepth > h->maxQsinceinit)
+		h->maxQsinceinit = h->Qdepth;
 	start_io(h);
 	spin_unlock_irqrestore(&h->lock, flags);
 }
@@ -4519,6 +4521,12 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
 	misc_fw_support = readl(&cfgtable->misc_fw_support);
 	use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
 
+	/* The doorbell reset seems to cause lockups on some Smart
+	 * Arrays (e.g. P410, P410i, maybe others).  Until this is
+	 * fixed or at least isolated, avoid the doorbell reset.
+	 */
+	use_doorbell = 0;
+
 	rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
 	if (rc)
 		goto unmap_cfgtable;
@@ -4712,6 +4720,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	h->scatter_list = kmalloc(h->max_commands *
 						sizeof(struct scatterlist *),
 						GFP_KERNEL);
+	if (!h->scatter_list)
+		goto clean4;
+
 	for (k = 0; k < h->nr_cmds; k++) {
 		h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
 							h->maxsgentries,
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f3c636d2371..91797bbbe70 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -477,7 +477,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		bool barrier = (bio->bi_rw & REQ_HARDBARRIER);
+		bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
 		struct file *file = lo->lo_backing_file;
 
 		if (barrier) {
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index b82c5ce5e9d..76fa3deaee8 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -974,8 +974,7 @@ static int mg_probe(struct platform_device *plat_dev)
 	host->breq->queuedata = host;
 
 	/* mflash is random device, thanx for the noop */
-	elevator_exit(host->breq->elevator);
-	err = elevator_init(host->breq, "noop");
+	err = elevator_change(host->breq, "noop");
 	if (err) {
 		printk(KERN_ERR "%s:%d (elevator_init) fail\n",
 				__func__, __LINE__);
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index b7de02525ec..85cf607fc78 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -217,8 +217,7 @@ tapeblock_setup_device(struct tape_device * device)
 	if (!blkdat->request_queue)
 		return -ENOMEM;
 
-	elevator_exit(blkdat->request_queue->elevator);
-	rc = elevator_init(blkdat->request_queue, "noop");
+	rc = elevator_change(blkdat->request_queue, "noop");
 	if (rc)
 		goto cleanup_queue;
 
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c..4d0ff5ee27b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
 
 	/* Allocate kernel buffer for protection data */
 	len = sectors * blk_integrity_tuple_size(bi);
-	buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
 	if (unlikely(buf == NULL)) {
 		printk(KERN_ERR "could not allocate integrity buffer\n");
-		return -EIO;
+		return -ENOMEM;
 	}
 
 	end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7d9d06ba184..81e086d8aa5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -808,7 +808,7 @@ int bdi_writeback_thread(void *data)
 			wb->last_active = jiffies;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&bdi->work_list)) {
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
 			__set_current_state(TASK_RUNNING);
 			continue;
 		}
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2c958f4fce1..926b50322a4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -136,6 +136,7 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 
 extern int elevator_init(struct request_queue *, char *);
 extern void elevator_exit(struct elevator_queue *);
+extern int elevator_change(struct request_queue *, const char *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
 
 /*
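The elevator_change() helper declared just above is the interface the mg_disk and tape_block hunks earlier switch to, replacing the old elevator_exit()/elevator_init() pair. A rough sketch of how a block driver might call it follows; the function name, queue pointer and error handling are illustrative assumptions, not taken from the patch:

/* Sketch only: ask for the noop scheduler via the elevator_change()
 * helper added by this patch. "example_setup_queue" and "q" stand in
 * for whatever the driver already has.
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>

static int example_setup_queue(struct request_queue *q)
{
	int err;

	err = elevator_change(q, "noop");	/* 0 on success, -errno on failure */
	if (err)
		pr_err("example: switch to noop failed (%d)\n", err);

	return err;
}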
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index a5ec42868f9..4ceb05d772a 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -248,8 +248,18 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents,
 		left -= sg_size;
 
 		sg = alloc_fn(alloc_size, gfp_mask);
-		if (unlikely(!sg))
-			return -ENOMEM;
+		if (unlikely(!sg)) {
+			/*
+			 * Adjust entry count to reflect that the last
+			 * entry of the previous table won't be used for
+			 * linkage.  Without this, sg_kfree() may get
+			 * confused.
+			 */
+			if (prv)
+				table->nents = ++table->orig_nents;
+
+			return -ENOMEM;
+		}
 
 		sg_init_table(sg, alloc_size);
 		table->nents = table->orig_nents += sg_size;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eaa4a5bbe06..c2bf86f470e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -445,8 +445,8 @@ static int bdi_forker_thread(void *ptr)
 		switch (action) {
 		case FORK_THREAD:
 			__set_current_state(TASK_RUNNING);
-			task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
-					   dev_name(bdi->dev));
+			task = kthread_create(bdi_writeback_thread, &bdi->wb,
+					      "flush-%s", dev_name(bdi->dev));
 			if (IS_ERR(task)) {
 				/*
 				 * If thread creation fails, force writeout of
@@ -457,10 +457,13 @@ static int bdi_forker_thread(void *ptr)
 				/*
 				 * The spinlock makes sure we do not lose
 				 * wake-ups when racing with 'bdi_queue_work()'.
+				 * And as soon as the bdi thread is visible, we
+				 * can start it.
 				 */
 				spin_lock_bh(&bdi->wb_lock);
 				bdi->wb.task = task;
 				spin_unlock_bh(&bdi->wb_lock);
+				wake_up_process(task);
 			}
 			break;
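The mm/backing-dev.c change above swaps kthread_run() for kthread_create() plus an explicit wake_up_process(), so the forker publishes bdi->wb.task under wb_lock before the new writeback thread can run. A generic sketch of that create-publish-wake pattern, with made-up names (my_dev, my_start_thread, my_thread_fn) standing in for the bdi-specific ones:

/* Sketch of the create-publish-wake pattern used in the backing-dev.c hunk.
 * All names here are illustrative, not from the patch.
 */
#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/err.h>

struct my_dev {
	spinlock_t lock;
	struct task_struct *thread;
};

static int my_start_thread(struct my_dev *dev, int (*my_thread_fn)(void *))
{
	struct task_struct *task;

	/* Create the thread stopped, so it cannot run before it is published. */
	task = kthread_create(my_thread_fn, dev, "my-worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	/* Make the thread visible to everyone who takes dev->lock ... */
	spin_lock_bh(&dev->lock);
	dev->thread = task;
	spin_unlock_bh(&dev->lock);

	/* ... and only then let it start running. */
	wake_up_process(task);
	return 0;
}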
