 block/blk-core.c              |  26
 block/blk-merge.c             |  20
 block/blk-settings.c          |   2
 block/blk.h                   |   1
 block/elevator.c              |  16
 block/genhd.c                 |   3
 drivers/ide/ide-io.c          |   4
 drivers/s390/scsi/zfcp_def.h  |   2
 drivers/s390/scsi/zfcp_fsf.c  |  34
 drivers/s390/scsi/zfcp_fsf.h  |  12
 drivers/s390/scsi/zfcp_qdio.c |   1
 fs/block_dev.c                |   2
 fs/partitions/check.c         |  10
 include/linux/bio.h           |   7
 include/linux/blkdev.h        |   1
 include/linux/blktrace_api.h  |  32
 include/linux/interrupt.h     |  21
 include/linux/smp.h           |   4
 kernel/softirq.c              | 129
 lib/Kconfig.debug             |   5
 20 files changed, 305 insertions(+), 27 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 9e79a485e4f..c3df30cfb3f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -257,7 +257,6 @@ void __generic_unplug_device(struct request_queue *q)
 
         q->request_fn(q);
 }
-EXPORT_SYMBOL(__generic_unplug_device);
 
 /**
  * generic_unplug_device - fire a request queue
@@ -325,6 +324,9 @@ EXPORT_SYMBOL(blk_unplug);
 
 static void blk_invoke_request_fn(struct request_queue *q)
 {
+        if (unlikely(blk_queue_stopped(q)))
+                return;
+
         /*
          * one level of recursion is ok and is much faster than kicking
          * the unplug handling
@@ -399,8 +401,13 @@ void blk_sync_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_sync_queue);
 
 /**
- * blk_run_queue - run a single device queue
+ * __blk_run_queue - run a single device queue
  * @q: The queue to run
+ *
+ * Description:
+ *    See @blk_run_queue. This variant must be called with the queue lock
+ *    held and interrupts disabled.
+ *
  */
 void __blk_run_queue(struct request_queue *q)
 {
@@ -418,6 +425,12 @@ EXPORT_SYMBOL(__blk_run_queue);
 /**
  * blk_run_queue - run a single device queue
  * @q: The queue to run
+ *
+ * Description:
+ *    Invoke request handling on this queue, if it has pending work to do.
+ *    May be used to restart queueing when a request has completed. Also
+ *    See @blk_start_queueing.
+ *
  */
 void blk_run_queue(struct request_queue *q)
 {
@@ -501,6 +514,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         init_timer(&q->unplug_timer);
         setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
         INIT_LIST_HEAD(&q->timeout_list);
+        INIT_WORK(&q->unplug_work, blk_unplug_work);
 
         kobject_init(&q->kobj, &blk_queue_ktype);
 
@@ -884,7 +898,8 @@ EXPORT_SYMBOL(blk_get_request);
  *
  * This is basically a helper to remove the need to know whether a queue
  * is plugged or not if someone just wants to initiate dispatch of requests
- * for this queue.
+ * for this queue. Should be used to start queueing on a device outside
+ * of ->request_fn() context. Also see @blk_run_queue.
  *
  * The queue lock must be held with interrupts disabled.
  */
@@ -1003,8 +1018,9 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
 }
 
 /**
- * part_round_stats() - Round off the performance stats on a struct
- * disk_stats.
+ * part_round_stats() - Round off the performance stats on a struct disk_stats.
+ * @cpu: cpu number for stats access
+ * @part: target partition
  *
  * The average IO queue length and utilisation statistics are maintained
  * by observing the current state of the queue length and the amount of
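A note on the two run-queue variants documented above: blk_run_queue() takes the queue lock itself, while __blk_run_queue() must be entered with the lock already held and interrupts disabled. A minimal sketch of that calling convention follows; it is not part of this commit, and the mydrv_* names are invented for illustration.

static void mydrv_restart_unlocked(struct request_queue *q)
{
        /* Plain context: blk_run_queue() handles the locking itself. */
        blk_run_queue(q);
}

static void mydrv_restart_locked(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        /* Queue lock held, irqs off: the __-variant may be used directly. */
        __blk_run_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
}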
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 908d3e11ac5..8681cd6f991 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -77,12 +77,20 @@ void blk_recalc_rq_segments(struct request *rq)
                         continue;
                 }
 new_segment:
+                if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
+                        rq->bio->bi_seg_front_size = seg_size;
+
                 nr_phys_segs++;
                 bvprv = bv;
                 seg_size = bv->bv_len;
                 highprv = high;
         }
 
+        if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
+                rq->bio->bi_seg_front_size = seg_size;
+        if (seg_size > rq->biotail->bi_seg_back_size)
+                rq->biotail->bi_seg_back_size = seg_size;
+
         rq->nr_phys_segments = nr_phys_segs;
 }
 
@@ -106,7 +114,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
         if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
                 return 0;
 
-        if (bio->bi_size + nxt->bi_size > q->max_segment_size)
+        if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
+            q->max_segment_size)
                 return 0;
 
         if (!bio_has_data(bio))
@@ -309,6 +318,8 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
                                 struct request *next)
 {
         int total_phys_segments;
+        unsigned int seg_size =
+                req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
 
         /*
          * First check if the either of the requests are re-queued
@@ -324,8 +335,13 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
                 return 0;
 
         total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
-        if (blk_phys_contig_segment(q, req->biotail, next->bio))
+        if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
+                if (req->nr_phys_segments == 1)
+                        req->bio->bi_seg_front_size = seg_size;
+                if (next->nr_phys_segments == 1)
+                        next->biotail->bi_seg_back_size = seg_size;
                 total_phys_segments--;
+        }
 
         if (total_phys_segments > q->max_phys_segments)
                 return 0;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b21dcdb6415..41392fbe19f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -141,8 +141,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
         if (q->unplug_delay == 0)
                 q->unplug_delay = 1;
 
-        INIT_WORK(&q->unplug_work, blk_unplug_work);
-
         q->unplug_timer.function = blk_unplug_timeout;
         q->unplug_timer.data = (unsigned long)q;
 
diff --git a/block/blk.h b/block/blk.h
index e5c57976996..d2e49af90db 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -20,6 +20,7 @@ void blk_unplug_timeout(unsigned long data);
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_delete_timer(struct request *);
 void blk_add_timer(struct request *);
+void __generic_unplug_device(struct request_queue *);
 
 /*
  * Internal atomic flags for request handling
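The blk-merge.c change above replaces a comparison of whole-bio payload sizes (bi_size) with one of the adjoining segment sizes: when two bios are physically contiguous, only the last segment of the first and the first segment of the second fuse into one hardware segment, and it is that fused size which must respect max_segment_size. A sketch of the check in isolation; the helper name is invented, the fields are the ones added by this patch:

static bool mydrv_segments_fusable(struct request_queue *q,
                                   struct bio *prev, struct bio *next)
{
        unsigned int fused = prev->bi_seg_back_size + next->bi_seg_front_size;

        /* Only the adjoining segments merge, not the whole payloads. */
        return fused <= q->max_segment_size;
}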
diff --git a/block/elevator.c b/block/elevator.c
index 04518921db3..59173a69ebd 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -612,7 +612,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
                  *   processing.
                  */
                 blk_remove_plug(q);
-                q->request_fn(q);
+                blk_start_queueing(q);
                 break;
 
         case ELEVATOR_INSERT_SORT:
@@ -950,7 +950,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
                     blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
                     blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
                         blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
-                        q->request_fn(q);
+                        blk_start_queueing(q);
                 }
         }
 }
@@ -1109,8 +1109,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
         elv_drain_elevator(q);
 
         while (q->rq.elvpriv) {
-                blk_remove_plug(q);
-                q->request_fn(q);
+                blk_start_queueing(q);
                 spin_unlock_irq(q->queue_lock);
                 msleep(10);
                 spin_lock_irq(q->queue_lock);
@@ -1166,15 +1165,10 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
                           size_t count)
 {
         char elevator_name[ELV_NAME_MAX];
-        size_t len;
         struct elevator_type *e;
 
-        elevator_name[sizeof(elevator_name) - 1] = '\0';
-        strncpy(elevator_name, name, sizeof(elevator_name) - 1);
-        len = strlen(elevator_name);
-
-        if (len && elevator_name[len - 1] == '\n')
-                elevator_name[len - 1] = '\0';
+        strlcpy(elevator_name, name, sizeof(elevator_name));
+        strstrip(elevator_name);
 
         e = elevator_get(elevator_name);
         if (!e) {
diff --git a/block/genhd.c b/block/genhd.c
index 4cd3433c99a..646e1d2507c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -358,7 +358,6 @@ static int blk_mangle_minor(int minor)
 /**
  * blk_alloc_devt - allocate a dev_t for a partition
  * @part: partition to allocate dev_t for
- * @gfp_mask: memory allocation flag
  * @devt: out parameter for resulting dev_t
  *
  * Allocate a dev_t for block device.
@@ -535,7 +534,7 @@ void unlink_gendisk(struct gendisk *disk)
 /**
  * get_gendisk - get partitioning information for a given device
  * @devt: device to get partitioning information for
- * @part: returned partition index
+ * @partno: returned partition index
  *
  * This function gets the structure containing partitioning
  * information for the given device @devt.
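The elv_iosched_store() cleanup above is a general pattern for sysfs store handlers: strlcpy() is bounded and always NUL-terminates, and strstrip() drops the trailing newline that echo appends, replacing the hand-rolled strncpy/strlen dance. The same pattern in isolation, with an invented handler name and buffer size:

static ssize_t mydrv_name_store(const char *buf, size_t count)
{
        char name[16];  /* sized like ELV_NAME_MAX; value invented */

        strlcpy(name, buf, sizeof(name));       /* bounded, NUL-terminated */
        strstrip(name);                         /* strip trailing whitespace */

        if (!name[0])
                return -EINVAL;

        return count;
}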
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 77c6eaeacef..7162d67562a 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1493,8 +1493,8 @@ void ide_do_drive_cmd(ide_drive_t *drive, struct request *rq)
 
         spin_lock_irqsave(&ide_lock, flags);
         hwgroup->rq = NULL;
-        __elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 1);
-        __generic_unplug_device(drive->queue);
+        __elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
+        blk_start_queueing(drive->queue);
         spin_unlock_irqrestore(&ide_lock, flags);
 }
 
diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h
index 8a13071c444..9ce4c75bd19 100644
--- a/drivers/s390/scsi/zfcp_def.h
+++ b/drivers/s390/scsi/zfcp_def.h
@@ -583,6 +583,8 @@ struct zfcp_fsf_req {
         unsigned long long     issued;         /* request sent time (STCK) */
         struct zfcp_unit       *unit;
         void                   (*handler)(struct zfcp_fsf_req *);
+        u16                    qdio_outb_usage;/* usage of outbound queue */
+        u16                    qdio_inb_usage; /* usage of inbound queue */
 };
 
 /* driver data */
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 739356a5c12..5ae1d497e5e 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -6,6 +6,7 @@
  * Copyright IBM Corporation 2002, 2008
  */
 
+#include <linux/blktrace_api.h>
 #include "zfcp_ext.h"
 
 static void zfcp_fsf_request_timeout_handler(unsigned long data)
@@ -777,6 +778,7 @@ static int zfcp_fsf_req_send(struct zfcp_fsf_req *req)
         list_add_tail(&req->list, &adapter->req_list[idx]);
         spin_unlock(&adapter->req_list_lock);
 
+        req->qdio_outb_usage = atomic_read(&req_q->count);
         req->issued = get_clock();
         if (zfcp_qdio_send(req)) {
                 /* Queues are down..... */
@@ -2082,6 +2084,36 @@ static void zfcp_fsf_req_latency(struct zfcp_fsf_req *req)
         spin_unlock_irqrestore(&unit->latencies.lock, flags);
 }
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+static void zfcp_fsf_trace_latency(struct zfcp_fsf_req *fsf_req)
+{
+        struct fsf_qual_latency_info *lat_inf;
+        struct scsi_cmnd *scsi_cmnd = (struct scsi_cmnd *)fsf_req->data;
+        struct request *req = scsi_cmnd->request;
+        struct zfcp_blk_drv_data trace;
+        int ticks = fsf_req->adapter->timer_ticks;
+
+        trace.flags = 0;
+        trace.magic = ZFCP_BLK_DRV_DATA_MAGIC;
+        if (fsf_req->adapter->adapter_features & FSF_FEATURE_MEASUREMENT_DATA) {
+                trace.flags |= ZFCP_BLK_LAT_VALID;
+                lat_inf = &fsf_req->qtcb->prefix.prot_status_qual.latency_info;
+                trace.channel_lat = lat_inf->channel_lat * ticks;
+                trace.fabric_lat = lat_inf->fabric_lat * ticks;
+        }
+        if (fsf_req->status & ZFCP_STATUS_FSFREQ_ERROR)
+                trace.flags |= ZFCP_BLK_REQ_ERROR;
+        trace.inb_usage = fsf_req->qdio_inb_usage;
+        trace.outb_usage = fsf_req->qdio_outb_usage;
+
+        blk_add_driver_data(req->q, req, &trace, sizeof(trace));
+}
+#else
+static inline void zfcp_fsf_trace_latency(struct zfcp_fsf_req *fsf_req)
+{
+}
+#endif
+
 static void zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *req)
 {
         struct scsi_cmnd *scpnt = req->data;
@@ -2114,6 +2146,8 @@ static void zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *req)
         if (req->adapter->adapter_features & FSF_FEATURE_MEASUREMENT_DATA)
                 zfcp_fsf_req_latency(req);
 
+        zfcp_fsf_trace_latency(req);
+
         if (unlikely(fcp_rsp_iu->validity.bits.fcp_rsp_len_valid)) {
                 if (fcp_rsp_info[3] == RSP_CODE_GOOD)
                         set_host_byte(scpnt, DID_OK);
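The zfcp hook above shows the intended shape for any driver emitting per-request trace payloads: an #ifdef CONFIG_BLK_DEV_IO_TRACE implementation plus an empty inline stub, so callers need no conditional compilation of their own. Reduced to its skeleton for a hypothetical driver (mydrv_trace and struct mydrv_blk_data are made up; blk_add_driver_data() and the config guard are the real interfaces):

#ifdef CONFIG_BLK_DEV_IO_TRACE
static void mydrv_trace(struct request *req, struct mydrv_blk_data *d)
{
        /* Payload is copied into the trace stream; d may live on the stack. */
        blk_add_driver_data(req->q, req, d, sizeof(*d));
}
#else
static inline void mydrv_trace(struct request *req, struct mydrv_blk_data *d)
{
}
#endif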
diff --git a/drivers/s390/scsi/zfcp_fsf.h b/drivers/s390/scsi/zfcp_fsf.h
index fd3a88777ac..fa2a3178061 100644
--- a/drivers/s390/scsi/zfcp_fsf.h
+++ b/drivers/s390/scsi/zfcp_fsf.h
@@ -439,4 +439,16 @@ struct fsf_qtcb {
         u8 log[FSF_QTCB_LOG_SIZE];
 } __attribute__ ((packed));
 
+struct zfcp_blk_drv_data {
+#define ZFCP_BLK_DRV_DATA_MAGIC 0x1
+        u32 magic;
+#define ZFCP_BLK_LAT_VALID 0x1
+#define ZFCP_BLK_REQ_ERROR 0x2
+        u16 flags;
+        u8 inb_usage;
+        u8 outb_usage;
+        u64 channel_lat;
+        u64 fabric_lat;
+} __attribute__ ((packed));
+
 #endif /* FSF_H */
diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c
index 3e05080e62d..664752f90b2 100644
--- a/drivers/s390/scsi/zfcp_qdio.c
+++ b/drivers/s390/scsi/zfcp_qdio.c
@@ -115,6 +115,7 @@ static void zfcp_qdio_reqid_check(struct zfcp_adapter *adapter,
         spin_unlock_irqrestore(&adapter->req_list_lock, flags);
 
         fsf_req->sbal_response = sbal_idx;
+        fsf_req->qdio_inb_usage = atomic_read(&adapter->resp_q.count);
         zfcp_fsf_req_complete(fsf_req);
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d84f0469a01..218408eed1b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev - lookup a struct block_device by name
- * @pathname: special file representing the block device
+ * @path: special file representing the block device
  *
  * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index fbeb2f372a9..cfb0c80690a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
         return ERR_PTR(res);
 }
 
+static ssize_t part_partition_show(struct device *dev,
+                                   struct device_attribute *attr, char *buf)
+{
+        struct hd_struct *p = dev_to_part(dev);
+
+        return sprintf(buf, "%d\n", p->partno);
+}
+
 static ssize_t part_start_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
 {
@@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev,
 }
 #endif
 
+static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
@@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail =
 #endif
 
 static struct attribute *part_attrs[] = {
+        &dev_attr_partition.attr,
         &dev_attr_start.attr,
         &dev_attr_size.attr,
         &dev_attr_stat.attr,
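On the consuming end, a blkparse-style tool would see the zfcp_blk_drv_data payload inside BLK_TA_DRV_DATA events. A userspace decoding sketch, with the event-stream parsing omitted: the struct mirrors the packed layout above, and byte order is ignored for brevity (the data originates on big-endian s390, so a real tool would convert).

#include <stdint.h>
#include <stdio.h>

struct zfcp_blk_drv_data_user {
        uint32_t magic;
        uint16_t flags;
        uint8_t inb_usage;
        uint8_t outb_usage;
        uint64_t channel_lat;
        uint64_t fabric_lat;
} __attribute__ ((packed));

static void dump_zfcp_lat(const void *payload)
{
        const struct zfcp_blk_drv_data_user *d = payload;

        if (d->magic != 0x1)            /* ZFCP_BLK_DRV_DATA_MAGIC */
                return;
        if (d->flags & 0x1)             /* ZFCP_BLK_LAT_VALID */
                printf("channel %llu fabric %llu inb %u outb %u\n",
                       (unsigned long long)d->channel_lat,
                       (unsigned long long)d->fabric_lat,
                       d->inb_usage, d->outb_usage);
}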
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1beda208cbf..1c91a176b9a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -79,6 +79,13 @@ struct bio {
 
         unsigned int            bi_size;        /* residual I/O count */
 
+        /*
+         * To keep track of the max segment size, we account for the
+         * sizes of the first and last mergeable segments in this bio.
+         */
+        unsigned int            bi_seg_front_size;
+        unsigned int            bi_seg_back_size;
+
         unsigned int            bi_max_vecs;    /* max bvl_vecs we can hold */
 
         unsigned int            bi_comp_cpu;    /* completion CPU */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3491d22526..b4fe68fe3a5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -865,7 +865,6 @@ extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern void generic_unplug_device(struct request_queue *);
-extern void __generic_unplug_device(struct request_queue *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(struct request_queue *);
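A worked example of why these two fields exist: with a 64KB max_segment_size, two bios that end and begin with physically contiguous 40KB segments would fuse into an 80KB hardware segment on merge, 40KB + 40KB = 80KB > 64KB, even though each bio individually looks fine. The limit itself is advertised by the driver at queue-setup time; a sketch with an invented driver name, using the real block-layer API:

static int mydrv_init_queue(struct request_queue *q)
{
        /* Hardware can DMA at most 64KB per scatter-gather segment. */
        blk_queue_max_segment_size(q, 65536);
        return 0;
}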
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 3a31eb50616..bdf505d33e7 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -24,6 +24,7 @@ enum blktrace_cat {
         BLK_TC_AHEAD    = 1 << 11,      /* readahead */
         BLK_TC_META     = 1 << 12,      /* metadata */
         BLK_TC_DISCARD  = 1 << 13,      /* discard requests */
+        BLK_TC_DRV_DATA = 1 << 14,      /* binary per-driver data */
 
         BLK_TC_END      = 1 << 15,      /* only 16-bits, reminder */
 };
@@ -51,6 +52,7 @@ enum blktrace_act {
         __BLK_TA_BOUNCE,                /* bio was bounced */
         __BLK_TA_REMAP,                 /* bio was remapped */
         __BLK_TA_ABORT,                 /* request aborted */
+        __BLK_TA_DRV_DATA,              /* driver-specific binary data */
 };
 
 /*
@@ -82,6 +84,7 @@ enum blktrace_notify {
 #define BLK_TA_BOUNCE           (__BLK_TA_BOUNCE)
 #define BLK_TA_REMAP            (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
 #define BLK_TA_ABORT            (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_DRV_DATA         (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
 
 #define BLK_TN_PROCESS          (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP        (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
@@ -317,6 +320,34 @@ static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
         __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
 }
 
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:          queue the io is for
+ * @rq:         io request
+ * @data:       driver-specific data
+ * @len:        length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+static inline void blk_add_driver_data(struct request_queue *q,
+                                       struct request *rq,
+                                       void *data, size_t len)
+{
+        struct blk_trace *bt = q->blk_trace;
+
+        if (likely(!bt))
+                return;
+
+        if (blk_pc_request(rq))
+                __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+                                rq->errors, len, data);
+        else
+                __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+                                0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                            char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -330,6 +361,7 @@ extern int blk_trace_remove(struct request_queue *q);
 #define blk_add_trace_generic(q, rq, rw, what)          do { } while (0)
 #define blk_add_trace_pdu_int(q, what, bio, pdu)        do { } while (0)
 #define blk_add_trace_remap(q, bio, dev, f, t)          do {} while (0)
+#define blk_add_driver_data(q, rq, data, len)           do {} while (0)
 #define do_blk_trace_setup(q, name, dev, buts)          (-ENOTTY)
 #define blk_trace_setup(q, name, dev, arg)              (-ENOTTY)
 #define blk_trace_startstop(q, start)                   (-ENOTTY)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 54b3623434e..35a61dc60d5 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -11,6 +11,8 @@
 #include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/irqflags.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
@@ -273,6 +275,25 @@ extern void softirq_init(void);
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
+/* This is the worklist that queues up per-cpu softirq work.
+ *
+ * send_remote_sendirq() adds work to these lists, and
+ * the softirq handler itself dequeues from them. The queues
+ * are protected by disabling local cpu interrupts and they must
+ * only be accessed by the local cpu that they are for.
+ */
+DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+
+/* Try to send a softirq to a remote cpu. If this cannot be done, the
+ * work will be queued to the local cpu.
+ */
+extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq);
+
+/* Like send_remote_softirq(), but the caller must disable local cpu interrupts
+ * and compute the current cpu, passed in as 'this_cpu'.
+ */
+extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
+                                  int this_cpu, int softirq);
 
 /* Tasklets --- multithreaded analogue of BHs.
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 66484d4a845..2e4d58b26c0 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
 
@@ -16,7 +17,8 @@ struct call_single_data {
         struct list_head list;
         void (*func) (void *info);
         void *info;
-        unsigned int flags;
+        u16 flags;
+        u16 priv;
 };
 
 #ifdef CONFIG_SMP
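A sketch of the producer side of this interface, assuming a hypothetical driver that wants completion work handled on the CPU that submitted the I/O. The call_single_data must stay alive until the softirq has run, so embedding it in the per-request structure is the natural pattern; mydrv_request and mydrv_complete are invented, while send_remote_softirq(), struct call_single_data, and BLOCK_SOFTIRQ are the real interfaces.

struct mydrv_request {
        struct call_single_data csd;    /* must outlive the softirq run */
        int submit_cpu;                 /* recorded at submission time */
};

static void mydrv_complete(struct mydrv_request *mrq)
{
        /* Queue softirq work on the submitting cpu; falls back to the
         * local cpu if the remote one is offline or IPIs are unavailable. */
        send_remote_softirq(&mrq->csd, mrq->submit_cpu, BLOCK_SOFTIRQ);
}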
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 37d67aa2d56..83ba21a13bd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
  *      Distribute under GPLv2.
  *
  *      Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *
+ *      Remote softirq infrastructure is by Jens Axboe.
  */
 
 #include <linux/module.h>
@@ -474,17 +476,144 @@ void tasklet_kill(struct tasklet_struct *t)
 
 EXPORT_SYMBOL(tasklet_kill);
 
+DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+EXPORT_PER_CPU_SYMBOL(softirq_work_list);
+
+static void __local_trigger(struct call_single_data *cp, int softirq)
+{
+        struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+
+        list_add_tail(&cp->list, head);
+
+        /* Trigger the softirq only if the list was previously empty. */
+        if (head->next == &cp->list)
+                raise_softirq_irqoff(softirq);
+}
+
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+static void remote_softirq_receive(void *data)
+{
+        struct call_single_data *cp = data;
+        unsigned long flags;
+        int softirq;
+
+        softirq = cp->priv;
+
+        local_irq_save(flags);
+        __local_trigger(cp, softirq);
+        local_irq_restore(flags);
+}
+
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+        if (cpu_online(cpu)) {
+                cp->func = remote_softirq_receive;
+                cp->info = cp;
+                cp->flags = 0;
+                cp->priv = softirq;
+
+                __smp_call_function_single(cpu, cp);
+                return 0;
+        }
+        return 1;
+}
+#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+        return 1;
+}
+#endif
+
+/**
+ * __send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @this_cpu: the currently executing cpu
+ * @softirq: the softirq for the work
+ *
+ * Attempt to schedule softirq work on a remote cpu. If this cannot be
+ * done, the work is instead queued up on the local cpu.
+ *
+ * Interrupts must be disabled.
+ */
+void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
+{
+        if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
+                __local_trigger(cp, softirq);
+}
+EXPORT_SYMBOL(__send_remote_softirq);
+
+/**
+ * send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @softirq: the softirq for the work
+ *
+ * Like __send_remote_softirq except that disabling interrupts and
+ * computing the current cpu is done for the caller.
+ */
+void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+        unsigned long flags;
+        int this_cpu;
+
+        local_irq_save(flags);
+        this_cpu = smp_processor_id();
+        __send_remote_softirq(cp, cpu, this_cpu, softirq);
+        local_irq_restore(flags);
+}
+EXPORT_SYMBOL(send_remote_softirq);
+
+static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
+                                               unsigned long action, void *hcpu)
+{
+        /*
+         * If a CPU goes away, splice its entries to the current CPU
+         * and trigger a run of the softirq
+         */
+        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+                int cpu = (unsigned long) hcpu;
+                int i;
+
+                local_irq_disable();
+                for (i = 0; i < NR_SOFTIRQS; i++) {
+                        struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+                        struct list_head *local_head;
+
+                        if (list_empty(head))
+                                continue;
+
+                        local_head = &__get_cpu_var(softirq_work_list[i]);
+                        list_splice_init(head, local_head);
+                        raise_softirq_irqoff(i);
+                }
+                local_irq_enable();
+        }
+
+        return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
+        .notifier_call = remote_softirq_cpu_notify,
+};
+
 void __init softirq_init(void)
 {
         int cpu;
 
         for_each_possible_cpu(cpu) {
+                int i;
+
                 per_cpu(tasklet_vec, cpu).tail =
                         &per_cpu(tasklet_vec, cpu).head;
                 per_cpu(tasklet_hi_vec, cpu).tail =
                         &per_cpu(tasklet_hi_vec, cpu).head;
+                for (i = 0; i < NR_SOFTIRQS; i++)
+                        INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
         }
 
+        register_hotcpu_notifier(&remote_softirq_cpu_notifier);
+
         open_softirq(TASKLET_SOFTIRQ, tasklet_action);
         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
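The consuming side is left to each softirq's handler: __local_trigger() only enqueues onto softirq_work_list and raises the softirq, so the handler is expected to drain its per-cpu list. A sketch of such a handler, assuming BLOCK_SOFTIRQ as the example softirq number; the handler name is invented, and interrupts are disabled only around the list manipulation, matching the rule stated in interrupt.h above.

static void mydrv_softirq_action(struct softirq_action *h)
{
        struct list_head *head = &__get_cpu_var(softirq_work_list[BLOCK_SOFTIRQ]);

        local_irq_disable();
        while (!list_empty(head)) {
                struct call_single_data *cp;

                cp = list_entry(head->next, struct call_single_data, list);
                list_del_init(&cp->list);
                local_irq_enable();

                /* Process the unit of work carried by cp->info here. */

                local_irq_disable();
        }
        local_irq_enable();
}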
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 31d784dd80d..b0f239e443b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -652,6 +652,11 @@ config DEBUG_BLOCK_EXT_DEVT
         depends on BLOCK
         default n
         help
+          BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON
+          SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT
+          YOU ARE DOING. Distros, please enable this and fix whatever
+          is broken.
+
           Conventionally, block device numbers are allocated from
           predetermined contiguous area. However, extended block area
           may introduce non-contiguous block device numbers. This