From 370d6686683f44b6e869210eea2cf6a905b38a55 Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Sat, 12 Oct 2013 06:33:47 +0200
Subject: mg_disk: remove deprecated IRQF_DISABLED

This patch proposes to remove the use of the IRQF_DISABLED flag

It's a NOOP since 2.6.35 and it will be removed one day.

Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mg_disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 77a60bedd7a..7bc363f1ee8 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -936,7 +936,7 @@ static int mg_probe(struct platform_device *plat_dev)
 			goto probe_err_3b;
 		}
 		err = request_irq(host->irq, mg_irq,
-				IRQF_DISABLED | IRQF_TRIGGER_RISING,
+				IRQF_TRIGGER_RISING,
 				MG_DEV_NAME, host);
 		if (err) {
 			printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
-- 
cgit v1.2.3-18-g5258


From fbe363c476afe8ec992d3baf682670a4bd1b6ce6 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Mon, 12 Aug 2013 12:53:44 +0200
Subject: xen-blkfront: revoke foreign access for grants not mapped by the
 backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's no need to keep the foreign access in a grant if it is not
persistently mapped by the backend. This allows us to free grants that
are not mapped by the backend, thus preventing blkfront from hoarding
all grants.

The main effect of this is that blkfront will only persistently map
the same grants as the backend, and it will always try to use grants
that are already mapped by the backend. Also the number of persistent
grants in blkfront is the same as in blkback (and is controlled by the
value in blkback).

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Matt Wilson <msw@amazon.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkfront.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8d53ed29360..429d5264094 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1013,13 +1013,38 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 	}
 	/* Add the persistent grant into the list of free grants */
 	for (i = 0; i < nseg; i++) {
-		list_add(&s->grants_used[i]->node, &info->persistent_gnts);
-		info->persistent_gnts_c++;
+		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
+			/*
+			 * If the grant is still mapped by the backend (the
+			 * backend has chosen to make this grant persistent)
+			 * we add it at the head of the list, so it will be
+			 * reused first.
+			 */
+			list_add(&s->grants_used[i]->node, &info->persistent_gnts);
+			info->persistent_gnts_c++;
+		} else {
+			/*
+			 * If the grant is not mapped by the backend we end the
+			 * foreign access and add it to the tail of the list,
+			 * so it will not be picked again unless we run out of
+			 * persistent grants.
+			 */
+			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
+			s->grants_used[i]->gref = GRANT_INVALID_REF;
+			list_add_tail(&s->grants_used[i]->node, &info->persistent_gnts);
+		}
 	}
 	if (s->req.operation == BLKIF_OP_INDIRECT) {
 		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
-			list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
-			info->persistent_gnts_c++;
+			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
+				list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
+				info->persistent_gnts_c++;
+			} else {
+				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
+				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
+				list_add_tail(&s->indirect_grants[i]->node,
+				              &info->persistent_gnts);
+			}
 		}
 	}
 }
-- 
cgit v1.2.3-18-g5258


From c47206e25f28232ff979994c32758c82841d81cd Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Mon, 12 Aug 2013 12:53:43 +0200
Subject: xen-blkfront: improve aproximation of required grants per request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Improve the calculation of required grants to process a request by
using nr_phys_segments instead of always assuming a request is going
to use all posible segments.

nr_phys_segments contains the number of scatter-gather DMA addr+len
pairs, which is basically what we put at every granted page.
for_each_sg iterates over the DMA addr+len pairs and uses a grant
page for each of them.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkfront.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 429d5264094..5b8a15483a4 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -400,10 +400,13 @@ static int blkif_queue_request(struct request *req)
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	max_grefs = info->max_indirect_segments ?
-		    info->max_indirect_segments +
-		    INDIRECT_GREFS(info->max_indirect_segments) :
-		    BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	max_grefs = req->nr_phys_segments;
+	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		/*
+		 * If we are using indirect segments we need to account
+		 * for the indirect grefs used in the request.
+		 */
+		max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
 
 	/* Check if we have enough grants to allocate a requests */
 	if (info->persistent_gnts_c < max_grefs) {
-- 
cgit v1.2.3-18-g5258


From ea5ec76d76da9279d12027c1828544c5ccbe7932 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Thu, 5 Sep 2013 13:00:14 +0200
Subject: xen/blkback: fix reference counting

If the permission check fails, we drop a reference to the blkif without
having taken it in the first place. The bug was introduced in commit
604c499cbbcc3d5fe5fb8d53306aa0fae1990109 (xen/blkback: Check device
permissions before allowing OP_DISCARD).

Cc: stable@vger.kernel.org
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkback/blkback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index bf4b9d282c0..6620b73d049 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -887,6 +887,8 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	unsigned long secure;
 	struct phys_req preq;
 
+	xen_blkif_get(blkif);
+
 	preq.sector_number = req->u.discard.sector_number;
 	preq.nr_sects      = req->u.discard.nr_sectors;
 
@@ -899,7 +901,6 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	}
 	blkif->st_ds_req++;
 
-	xen_blkif_get(blkif);
 	secure = (blkif->vbd.discard_secure &&
 		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
 		 BLKDEV_DISCARD_SECURE : 0;
-- 
cgit v1.2.3-18-g5258


From ef7e7c82e02b602f29c2b87f42dcd6143a6777da Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 15 Oct 2013 14:14:38 -0600
Subject: loop: fix crash when using unassigned loop device

When the loop module is loaded, it creates 8 loop devices /dev/loop[0-7].
The devices have no request routine and thus, when they are used without
being assigned, a crash happens.

For example, these commands cause crash (assuming there are no used loop
devices):

Kernel Fault: Code=26 regs=000000007f420980 (Addr=0000000000000010)
CPU: 1 PID: 50 Comm: kworker/1:1 Not tainted 3.11.0 #1
Workqueue: ksnaphd do_metadata [dm_snapshot]
task: 000000007fcf4078 ti: 000000007f420000 task.ti: 000000007f420000
[  116.319988]
     YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
PSW: 00001000000001001111111100001111 Not tainted
r00-03  000000ff0804ff0f 00000000408bf5d0 00000000402d8204 000000007b7ff6c0
r04-07  00000000408a95d0 000000007f420950 000000007b7ff6c0 000000007d06c930
r08-11  000000007f4205c0 0000000000000001 000000007f4205c0 000000007f4204b8
r12-15  0000000000000010 0000000000000000 0000000000000000 0000000000000000
r16-19  000000001108dd48 000000004061cd7c 000000007d859800 000000000800000f
r20-23  0000000000000000 0000000000000008 0000000000000000 0000000000000000
r24-27  00000000ffffffff 000000007b7ff6c0 000000007d859800 00000000408a95d0
r28-31  0000000000000000 000000007f420950 000000007f420980 000000007f4208e8
sr00-03  0000000000000000 0000000000000000 0000000000000000 0000000000303000
sr04-07  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  117.549988]
IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000402d82fc 00000000402d8300
 IIR: 53820020    ISR: 0000000000000000  IOR: 0000000000000010
 CPU:        1   CR30: 000000007f420000 CR31: ffffffffffffffff
 ORIG_R28: 0000000000000001
 IAOQ[0]: generic_make_request+0x11c/0x1a0
 IAOQ[1]: generic_make_request+0x120/0x1a0
 RP(r2): generic_make_request+0x24/0x1a0
Backtrace:
 [<00000000402d83f0>] submit_bio+0x70/0x140
 [<0000000011087c4c>] dispatch_io+0x234/0x478 [dm_mod]
 [<0000000011087f44>] sync_io+0xb4/0x190 [dm_mod]
 [<00000000110883bc>] dm_io+0x2c4/0x310 [dm_mod]
 [<00000000110bfcd0>] do_metadata+0x28/0xb0 [dm_snapshot]
 [<00000000401591d8>] process_one_work+0x160/0x460
 [<0000000040159bc0>] worker_thread+0x300/0x478
 [<0000000040161a70>] kthread+0x118/0x128
 [<0000000040104020>] end_fault_vector+0x20/0x28
 [<0000000040177220>] task_tick_fair+0x420/0x4d0
 [<00000000401aa048>] invoke_rcu_core+0x50/0x60
 [<00000000401ad5b8>] rcu_check_callbacks+0x210/0x8d8
 [<000000004014aaa0>] update_process_times+0xa8/0xc0
 [<00000000401ab86c>] rcu_process_callbacks+0x4b4/0x598
 [<0000000040142408>] __do_softirq+0x250/0x2c0
 [<00000000401789d0>] find_busiest_group+0x3c0/0xc70
[  119.379988]
Kernel panic - not syncing: Kernel Fault
Rebooting in 1 seconds..

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index dbdb88a4976..c8dac730524 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -894,13 +894,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 
 	bio_list_init(&lo->lo_bio_list);
 
-	/*
-	 * set queue make_request_fn, and add limits based on lower level
-	 * device
-	 */
-	blk_queue_make_request(lo->lo_queue, loop_make_request);
-	lo->lo_queue->queuedata = lo;
-
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
@@ -1618,6 +1611,8 @@ static int loop_add(struct loop_device **l, int i)
 	if (!lo)
 		goto out;
 
+	lo->lo_state = Lo_unbound;
+
 	/* allocate id, if @id >= 0, we're requesting that specific id */
 	if (i >= 0) {
 		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
@@ -1635,6 +1630,12 @@ static int loop_add(struct loop_device **l, int i)
 	if (!lo->lo_queue)
 		goto out_free_idr;
 
+	/*
+	 * set queue make_request_fn
+	 */
+	blk_queue_make_request(lo->lo_queue, loop_make_request);
+	lo->lo_queue->queuedata = lo;
+
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
 	if (!disk)
 		goto out_free_queue;
-- 
cgit v1.2.3-18-g5258


From e5feab229f199dadee91073fbef5b507046086fd Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Wed, 4 Sep 2013 13:59:02 -0500
Subject: rsxx: Handling failed pci_map_page on PowerPC and double free.

The rsxx driver was not checking the correct value during a
pci_map_page failure. Fixing this also uncovered a
double free if the bio was returned before it was
broken up into indiviadual 4k dmas, that is also
fixed here.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/core.c      |  3 ++-
 drivers/block/rsxx/dma.c       | 47 ++++++++++++++++++++++++------------------
 drivers/block/rsxx/rsxx_priv.h |  9 +++++++-
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 6e85e21445e..e740a650d54 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -654,7 +654,8 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
 	for (i = 0; i < card->n_targets; i++) {
 		spin_lock_bh(&card->ctrl[i].queue_lock);
 		cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
-					     &card->ctrl[i].queue);
+					     &card->ctrl[i].queue,
+					     COMPLETE_DMA);
 		spin_unlock_bh(&card->ctrl[i].queue_lock);
 
 		cnt += rsxx_dma_cancel(&card->ctrl[i]);
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index bed32f16b08..71d1ca2a144 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -221,6 +221,19 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
 }
 
 /*----------------- RSXX DMA Handling -------------------*/
+static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma)
+{
+	if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+		pci_unmap_page(ctrl->card->dev, dma->dma_addr,
+			       get_dma_size(dma),
+			       dma->cmd == HW_CMD_BLK_WRITE ?
+					   PCI_DMA_TODEVICE :
+					   PCI_DMA_FROMDEVICE);
+	}
+
+	kmem_cache_free(rsxx_dma_pool, dma);
+}
+
 static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
 				  struct rsxx_dma *dma,
 				  unsigned int status)
@@ -232,21 +245,14 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
 	if (status & DMA_CANCELLED)
 		ctrl->stats.dma_cancelled++;
 
-	if (dma->dma_addr)
-		pci_unmap_page(ctrl->card->dev, dma->dma_addr,
-			       get_dma_size(dma),
-			       dma->cmd == HW_CMD_BLK_WRITE ?
-					   PCI_DMA_TODEVICE :
-					   PCI_DMA_FROMDEVICE);
-
 	if (dma->cb)
 		dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0);
 
-	kmem_cache_free(rsxx_dma_pool, dma);
+	rsxx_free_dma(ctrl, dma);
 }
 
 int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
-			   struct list_head *q)
+			   struct list_head *q, unsigned int done)
 {
 	struct rsxx_dma *dma;
 	struct rsxx_dma *tmp;
@@ -254,7 +260,10 @@ int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
 
 	list_for_each_entry_safe(dma, tmp, q, list) {
 		list_del(&dma->list);
-		rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+		if (done & COMPLETE_DMA)
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+		else
+			rsxx_free_dma(ctrl, dma);
 		cnt++;
 	}
 
@@ -370,7 +379,7 @@ static void dma_engine_stalled(unsigned long data)
 
 		/* Clean up the DMA queue */
 		spin_lock(&ctrl->queue_lock);
-		cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+		cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
 		spin_unlock(&ctrl->queue_lock);
 
 		cnt += rsxx_dma_cancel(ctrl);
@@ -623,7 +632,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
 	dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len,
 				     dir ? PCI_DMA_TODEVICE :
 				     PCI_DMA_FROMDEVICE);
-	if (!dma->dma_addr) {
+	if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
 		kmem_cache_free(rsxx_dma_pool, dma);
 		return -ENOMEM;
 	}
@@ -736,11 +745,9 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 	return 0;
 
 bvec_err:
-	for (i = 0; i < card->n_targets; i++) {
-		spin_lock_bh(&card->ctrl[i].queue_lock);
-		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]);
-		spin_unlock_bh(&card->ctrl[i].queue_lock);
-	}
+	for (i = 0; i < card->n_targets; i++)
+		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i],
+					FREE_DMA);
 
 	return st;
 }
@@ -990,7 +997,7 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
 
 		/* Clean up the DMA queue */
 		spin_lock_bh(&ctrl->queue_lock);
-		rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+		rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
 		spin_unlock_bh(&ctrl->queue_lock);
 
 		rsxx_dma_cancel(ctrl);
@@ -1045,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 		card->ctrl[i].e_cnt = 0;
 
 		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
-			if (dma->dma_addr)
+			if (!pci_dma_mapping_error(card->dev, dma->dma_addr))
 				pci_unmap_page(card->dev, dma->dma_addr,
 					       get_dma_size(dma),
 					       dma->cmd == HW_CMD_BLK_WRITE ?
@@ -1073,7 +1080,7 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
 					dma->cmd == HW_CMD_BLK_WRITE ?
 					PCI_DMA_TODEVICE :
 					PCI_DMA_FROMDEVICE);
-			if (!dma->dma_addr) {
+			if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
 				spin_unlock_bh(&card->ctrl[i].queue_lock);
 				kmem_cache_free(rsxx_dma_pool, dma);
 				return -ENOMEM;
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 5ad5055a410..82779058e8e 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -345,6 +345,11 @@ enum rsxx_creg_stat {
 	CREG_STAT_TAG_MASK	= 0x0000ff00,
 };
 
+enum rsxx_dma_finish {
+	FREE_DMA	= 0x0,
+	COMPLETE_DMA	= 0x1,
+};
+
 static inline unsigned int CREG_DATA(int N)
 {
 	return CREG_DATA0 + (N << 2);
@@ -379,7 +384,9 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
 int rsxx_dma_setup(struct rsxx_cardinfo *card);
 void rsxx_dma_destroy(struct rsxx_cardinfo *card);
 int rsxx_dma_init(void);
-int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q);
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+				struct list_head *q,
+				unsigned int done);
 int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
 void rsxx_dma_cleanup(void);
 void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
-- 
cgit v1.2.3-18-g5258


From 1b21f5b2ad6047995b19b15024353a9fa64810f1 Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Wed, 4 Sep 2013 13:59:35 -0500
Subject: rsxx: Moving pci_map_page to prevent overflow.

The pci_map_page function has been moved into our
issued workqueue to prevent an us running out of
mappable addresses on non-HWWD PCIe x8 slots. The
maximum amount that can possible be mapped at one
time now is: 255 dmas X 4 dma channels X 4096 Bytes.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/core.c |  5 ----
 drivers/block/rsxx/dma.c  | 70 +++++++++++++++++++----------------------------
 2 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index e740a650d54..a8de2eec6ff 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -749,10 +749,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
 
 	card->eeh_state = 0;
 
-	st = rsxx_eeh_remap_dmas(card);
-	if (st)
-		goto failed_remap_dmas;
-
 	spin_lock_irqsave(&card->irq_lock, flags);
 	if (card->n_targets & RSXX_MAX_TARGETS)
 		rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G);
@@ -779,7 +775,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
 	return PCI_ERS_RESULT_RECOVERED;
 
 failed_hw_buffers_init:
-failed_remap_dmas:
 	for (i = 0; i < card->n_targets; i++) {
 		if (card->ctrl[i].status.buf)
 			pci_free_consistent(card->dev,
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 71d1ca2a144..34fd1018c8e 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -397,6 +397,7 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 	int tag;
 	int cmds_pending = 0;
 	struct hw_cmd *hw_cmd_buf;
+	int dir;
 
 	hw_cmd_buf = ctrl->cmd.buf;
 
@@ -433,6 +434,28 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 			continue;
 		}
 
+		if (dma->cmd == HW_CMD_BLK_WRITE)
+			dir = PCI_DMA_TODEVICE;
+		else
+			dir = PCI_DMA_FROMDEVICE;
+
+		/*
+		 * The function pci_map_page is placed here because we can
+		 * only, by design, issue up to 255 commands to the hardware
+		 * at one time per DMA channel. So the maximum amount of mapped
+		 * memory would be 255 * 4 channels * 4096 Bytes which is less
+		 * than 2GB, the limit of a x8 Non-HWWD PCIe slot. This way the
+		 * pci_map_page function should never fail because of a
+		 * lack of mappable memory.
+		 */
+		dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
+				     dma->pg_off, dma->sub_page.cnt << 9, dir);
+		if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+			push_tracker(ctrl->trackers, tag);
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+			continue;
+		}
+
 		set_tracker_dma(ctrl->trackers, tag, dma);
 		hw_cmd_buf[ctrl->cmd.idx].command  = dma->cmd;
 		hw_cmd_buf[ctrl->cmd.idx].tag      = tag;
@@ -629,14 +652,6 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
 	if (!dma)
 		return -ENOMEM;
 
-	dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len,
-				     dir ? PCI_DMA_TODEVICE :
-				     PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
-		kmem_cache_free(rsxx_dma_pool, dma);
-		return -ENOMEM;
-	}
-
 	dma->cmd          = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ;
 	dma->laddr        = laddr;
 	dma->sub_page.off = (dma_off >> 9);
@@ -1039,6 +1054,11 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 			else
 				card->ctrl[i].stats.reads_issued--;
 
+			pci_unmap_page(card->dev, dma->dma_addr,
+				       get_dma_size(dma),
+				       dma->cmd == HW_CMD_BLK_WRITE ?
+				       PCI_DMA_TODEVICE :
+				       PCI_DMA_FROMDEVICE);
 			list_add_tail(&dma->list, &issued_dmas[i]);
 			push_tracker(card->ctrl[i].trackers, j);
 			cnt++;
@@ -1050,15 +1070,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 		atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
 		card->ctrl[i].stats.sw_q_depth += cnt;
 		card->ctrl[i].e_cnt = 0;
-
-		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
-			if (!pci_dma_mapping_error(card->dev, dma->dma_addr))
-				pci_unmap_page(card->dev, dma->dma_addr,
-					       get_dma_size(dma),
-					       dma->cmd == HW_CMD_BLK_WRITE ?
-					       PCI_DMA_TODEVICE :
-					       PCI_DMA_FROMDEVICE);
-		}
 		spin_unlock_bh(&card->ctrl[i].queue_lock);
 	}
 
@@ -1067,31 +1078,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 	return 0;
 }
 
-int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
-{
-	struct rsxx_dma *dma;
-	int i;
-
-	for (i = 0; i < card->n_targets; i++) {
-		spin_lock_bh(&card->ctrl[i].queue_lock);
-		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
-			dma->dma_addr = pci_map_page(card->dev, dma->page,
-					dma->pg_off, get_dma_size(dma),
-					dma->cmd == HW_CMD_BLK_WRITE ?
-					PCI_DMA_TODEVICE :
-					PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(card->dev, dma->dma_addr)) {
-				spin_unlock_bh(&card->ctrl[i].queue_lock);
-				kmem_cache_free(rsxx_dma_pool, dma);
-				return -ENOMEM;
-			}
-		}
-		spin_unlock_bh(&card->ctrl[i].queue_lock);
-	}
-
-	return 0;
-}
-
 int rsxx_dma_init(void)
 {
 	rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN);
-- 
cgit v1.2.3-18-g5258


From 8f8b899563f28ef26e381a6eb90d12dead77389f Mon Sep 17 00:00:00 2001
From: Asai Thambi S P <asamymuthupa@micron.com>
Date: Wed, 11 Sep 2013 13:14:42 -0600
Subject: mtip32xx: Add SRSI support

This patch add support for SRSI(Surprise Removal Surprise Insertion).

Approach:
---------
Surprise Removal:
-----------------
On surprise removal of the device, gendisk, request queue, device index, sysfs
entries, etc are retained as long as device is in use - mounted filesystem,
device opened by an application, etc. The service thread breaks out of the main
while loop, waits for pci remove to exit, and then waits for device to become
free. When there no holders of the device, service thread cleans up the block
and device related stuff and returns.

Surprise Insertion:
-------------------
No change, this scenario follows the normal pci probe() function flow.

Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 453 +++++++++++++++++++++++---------------
 drivers/block/mtip32xx/mtip32xx.h |  18 +-
 2 files changed, 289 insertions(+), 182 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 952dbfe2212..76f3bc4f0c2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -126,64 +126,30 @@ struct mtip_compat_ide_task_request_s {
 static bool mtip_check_surprise_removal(struct pci_dev *pdev)
 {
 	u16 vendor_id = 0;
+	struct driver_data *dd = pci_get_drvdata(pdev);
+
+	if (dd->sr)
+		return true;
 
        /* Read the vendorID from the configuration space */
 	pci_read_config_word(pdev, 0x00, &vendor_id);
-	if (vendor_id == 0xFFFF)
+	if (vendor_id == 0xFFFF) {
+		dd->sr = true;
+		if (dd->queue)
+			set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags);
+		else
+			dev_warn(&dd->pdev->dev,
+				"%s: dd->queue is NULL\n", __func__);
+		if (dd->port) {
+			set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags);
+			wake_up_interruptible(&dd->port->svc_wait);
+		} else
+			dev_warn(&dd->pdev->dev,
+				"%s: dd->port is NULL\n", __func__);
 		return true; /* device removed */
-
-	return false; /* device present */
-}
-
-/*
- * This function is called for clean the pending command in the
- * command slot during the surprise removal of device and return
- * error to the upper layer.
- *
- * @dd Pointer to the DRIVER_DATA structure.
- *
- * return value
- *	None
- */
-static void mtip_command_cleanup(struct driver_data *dd)
-{
-	int group = 0, commandslot = 0, commandindex = 0;
-	struct mtip_cmd *command;
-	struct mtip_port *port = dd->port;
-	static int in_progress;
-
-	if (in_progress)
-		return;
-
-	in_progress = 1;
-
-	for (group = 0; group < 4; group++) {
-		for (commandslot = 0; commandslot < 32; commandslot++) {
-			if (!(port->allocated[group] & (1 << commandslot)))
-				continue;
-
-			commandindex = group << 5 | commandslot;
-			command = &port->commands[commandindex];
-
-			if (atomic_read(&command->active)
-			    && (command->async_callback)) {
-				command->async_callback(command->async_data,
-					-ENODEV);
-				command->async_callback = NULL;
-				command->async_data = NULL;
-			}
-
-			dma_unmap_sg(&port->dd->pdev->dev,
-				command->sg,
-				command->scatter_ents,
-				command->direction);
-		}
 	}
 
-	up(&port->cmd_slot);
-
-	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
-	in_progress = 0;
+	return false; /* device present */
 }
 
 /*
@@ -222,10 +188,7 @@ static int get_slot(struct mtip_port *port)
 	}
 	dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
 
-	if (mtip_check_surprise_removal(port->dd->pdev)) {
-		/* Device not present, clean outstanding commands */
-		mtip_command_cleanup(port->dd);
-	}
+	mtip_check_surprise_removal(port->dd->pdev);
 	return -1;
 }
 
@@ -245,6 +208,107 @@ static inline void release_slot(struct mtip_port *port, int tag)
 	smp_mb__after_clear_bit();
 }
 
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port   Pointer to the port data structure.
+ * @tag    Tag of the command.
+ * @data   Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ *	None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+				int tag,
+				void *data,
+				int status)
+{
+	struct mtip_cmd *command;
+	struct driver_data *dd = data;
+	int cb_status = status ? -EIO : 0;
+
+	if (unlikely(!dd) || unlikely(!port))
+		return;
+
+	command = &port->commands[tag];
+
+	if (unlikely(status == PORT_IRQ_TF_ERR)) {
+		dev_warn(&port->dd->pdev->dev,
+			"Command tag %d failed due to TFE\n", tag);
+	}
+
+	/* Upper layer callback */
+	if (likely(command->async_callback))
+		command->async_callback(command->async_data, cb_status);
+
+	command->async_callback = NULL;
+	command->comp_func = NULL;
+
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev,
+		command->sg,
+		command->scatter_ents,
+		command->direction);
+
+	/* Clear the allocated and active bits for the command */
+	atomic_set(&port->commands[tag].active, 0);
+	release_slot(port, tag);
+
+	up(&port->cmd_slot);
+}
+
+/*
+ * This function is called for clean the pending command in the
+ * command slot during the surprise removal of device and return
+ * error to the upper layer.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_command_cleanup(struct driver_data *dd)
+{
+	int tag = 0;
+	struct mtip_cmd *cmd;
+	struct mtip_port *port = dd->port;
+	unsigned int num_cmd_slots = dd->slot_groups * 32;
+
+	if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
+		return;
+
+	if (!port)
+		return;
+
+	cmd = &port->commands[MTIP_TAG_INTERNAL];
+	if (atomic_read(&cmd->active))
+		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) &
+					(1 << MTIP_TAG_INTERNAL))
+			if (cmd->comp_func)
+				cmd->comp_func(port, MTIP_TAG_INTERNAL,
+					 cmd->comp_data, -ENODEV);
+
+	while (1) {
+		tag = find_next_bit(port->allocated, num_cmd_slots, tag);
+		if (tag >= num_cmd_slots)
+			break;
+
+		cmd = &port->commands[tag];
+		if (atomic_read(&cmd->active))
+			mtip_async_complete(port, tag, dd, -ENODEV);
+	}
+
+	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+}
+
 /*
  * Reset the HBA (without sleeping)
  *
@@ -584,6 +648,9 @@ static void mtip_timeout_function(unsigned long int data)
 	if (unlikely(!port))
 		return;
 
+	if (unlikely(port->dd->sr))
+		return;
+
 	if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
 		mod_timer(&port->cmd_timer,
 			jiffies + msecs_to_jiffies(30000));
@@ -674,66 +741,6 @@ static void mtip_timeout_function(unsigned long int data)
 		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
 }
 
-/*
- * IO completion function.
- *
- * This completion function is called by the driver ISR when a
- * command that was issued by the kernel completes. It first calls the
- * asynchronous completion function which normally calls back into the block
- * layer passing the asynchronous callback data, then unmaps the
- * scatter list associated with the completed command, and finally
- * clears the allocated bit associated with the completed command.
- *
- * @port   Pointer to the port data structure.
- * @tag    Tag of the command.
- * @data   Pointer to driver_data.
- * @status Completion status.
- *
- * return value
- *	None
- */
-static void mtip_async_complete(struct mtip_port *port,
-				int tag,
-				void *data,
-				int status)
-{
-	struct mtip_cmd *command;
-	struct driver_data *dd = data;
-	int cb_status = status ? -EIO : 0;
-
-	if (unlikely(!dd) || unlikely(!port))
-		return;
-
-	command = &port->commands[tag];
-
-	if (unlikely(status == PORT_IRQ_TF_ERR)) {
-		dev_warn(&port->dd->pdev->dev,
-			"Command tag %d failed due to TFE\n", tag);
-	}
-
-	/* Upper layer callback */
-	if (likely(command->async_callback))
-		command->async_callback(command->async_data, cb_status);
-
-	command->async_callback = NULL;
-	command->comp_func = NULL;
-
-	/* Unmap the DMA scatter list entries */
-	dma_unmap_sg(&dd->pdev->dev,
-		command->sg,
-		command->scatter_ents,
-		command->direction);
-
-	/* Clear the allocated and active bits for the command */
-	atomic_set(&port->commands[tag].active, 0);
-	release_slot(port, tag);
-
-	if (unlikely(command->unaligned))
-		up(&port->cmd_slot_unal);
-	else
-		up(&port->cmd_slot);
-}
-
 /*
  * Internal command completion callback function.
  *
@@ -854,7 +861,6 @@ static void mtip_handle_tfe(struct driver_data *dd)
 					"Missing completion func for tag %d",
 					tag);
 				if (mtip_check_surprise_removal(dd->pdev)) {
-					mtip_command_cleanup(dd);
 					/* don't proceed further */
 					return;
 				}
@@ -1018,14 +1024,12 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
 					command->comp_data,
 					0);
 			} else {
-				dev_warn(&dd->pdev->dev,
-					"Null completion "
-					"for tag %d",
+				dev_dbg(&dd->pdev->dev,
+					"Null completion for tag %d",
 					tag);
 
 				if (mtip_check_surprise_removal(
 					dd->pdev)) {
-					mtip_command_cleanup(dd);
 					return;
 				}
 			}
@@ -1145,7 +1149,6 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
 
 		if (unlikely(port_stat & PORT_IRQ_ERR)) {
 			if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
-				mtip_command_cleanup(dd);
 				/* don't proceed further */
 				return IRQ_HANDLED;
 			}
@@ -3006,6 +3009,46 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd)
 		debugfs_remove_recursive(dd->dfs_node);
 }
 
+static int mtip_free_orphan(struct driver_data *dd)
+{
+	struct kobject *kobj;
+
+	if (dd->bdev) {
+		if (dd->bdev->bd_holders >= 1)
+			return -2;
+
+		bdput(dd->bdev);
+		dd->bdev = NULL;
+	}
+
+	mtip_hw_debugfs_exit(dd);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+
+	if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) &&
+			test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
+		put_disk(dd->disk);
+	} else {
+		if (dd->disk) {
+			kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+			if (kobj) {
+				mtip_hw_sysfs_exit(dd, kobj);
+				kobject_put(kobj);
+			}
+			del_gendisk(dd->disk);
+			dd->disk = NULL;
+		}
+		if (dd->queue) {
+			dd->queue->queuedata = NULL;
+			blk_cleanup_queue(dd->queue);
+			dd->queue = NULL;
+		}
+	}
+	kfree(dd);
+	return 0;
+}
 
 /*
  * Perform any init/resume time hardware setup
@@ -3154,6 +3197,7 @@ static int mtip_service_thread(void *data)
 	unsigned long slot, slot_start, slot_wrap;
 	unsigned int num_cmd_slots = dd->slot_groups * 32;
 	struct mtip_port *port = dd->port;
+	int ret;
 
 	while (1) {
 		/*
@@ -3164,13 +3208,18 @@ static int mtip_service_thread(void *data)
 			!(port->flags & MTIP_PF_PAUSE_IO));
 
 		if (kthread_should_stop())
+			goto st_out;
+
+		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		/* If I am an orphan, start self cleanup */
+		if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
 			break;
 
 		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
 				&dd->dd_flag)))
-			break;
+			goto st_out;
 
-		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
 		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
 			slot = 1;
 			/* used to restrict the loop to one iteration */
@@ -3201,7 +3250,7 @@ static int mtip_service_thread(void *data)
 
 			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
 		} else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
-			if (!mtip_ftl_rebuild_poll(dd))
+			if (mtip_ftl_rebuild_poll(dd) < 0)
 				set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
 							&dd->dd_flag);
 			clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
@@ -3209,8 +3258,30 @@ static int mtip_service_thread(void *data)
 		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
 
 		if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+	}
+
+	/* wait for pci remove to exit */
+	while (1) {
+		if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag))
 			break;
+		msleep_interruptible(1000);
+		if (kthread_should_stop())
+			goto st_out;
 	}
+
+	while (1) {
+		ret = mtip_free_orphan(dd);
+		if (!ret) {
+			/* NOTE: All data structures are invalid, do not
+			 * access any here */
+			return 0;
+		}
+		msleep_interruptible(1000);
+		if (kthread_should_stop())
+			goto st_out;
+	}
+st_out:
 	return 0;
 }
 
@@ -3437,13 +3508,13 @@ static int mtip_hw_init(struct driver_data *dd)
 		rv = -EFAULT;
 		goto out3;
 	}
+	mtip_dump_identify(dd->port);
 
 	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
 		MTIP_FTL_REBUILD_MAGIC) {
 		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
 		return MTIP_FTL_REBUILD_MAGIC;
 	}
-	mtip_dump_identify(dd->port);
 
 	/* check write protect, over temp and rebuild statuses */
 	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
@@ -3467,8 +3538,8 @@ static int mtip_hw_init(struct driver_data *dd)
 		}
 		if (buf[288] == 0xBF) {
 			dev_info(&dd->pdev->dev,
-				"Drive indicates rebuild has failed.\n");
-			/* TODO */
+				"Drive is in security locked state.\n");
+			set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
 		}
 	}
 
@@ -3523,9 +3594,8 @@ static int mtip_hw_exit(struct driver_data *dd)
 	 * Send standby immediate (E0h) to the drive so that it
 	 * saves its state.
 	 */
-	if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
-
-		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
+	if (!dd->sr) {
+		if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
 			if (mtip_standby_immediate(dd->port))
 				dev_warn(&dd->pdev->dev,
 					"STANDBY IMMEDIATE failed\n");
@@ -3551,6 +3621,7 @@ static int mtip_hw_exit(struct driver_data *dd)
 			dd->port->command_list_dma);
 	/* Free the memory allocated for the for structure. */
 	kfree(dd->port);
+	dd->port = NULL;
 
 	return 0;
 }
@@ -3572,7 +3643,8 @@ static int mtip_hw_shutdown(struct driver_data *dd)
 	 * Send standby immediate (E0h) to the drive so that it
 	 * saves its state.
 	 */
-	mtip_standby_immediate(dd->port);
+	if (!dd->sr && dd->port)
+		mtip_standby_immediate(dd->port);
 
 	return 0;
 }
@@ -3887,6 +3959,10 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 			bio_endio(bio, -ENODATA);
 			return;
 		}
+		if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
+			bio_endio(bio, -ENXIO);
+			return;
+		}
 	}
 
 	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
@@ -4010,6 +4086,8 @@ static int mtip_block_initialize(struct driver_data *dd)
 	dd->disk->private_data	= dd;
 	dd->index		= index;
 
+	mtip_hw_debugfs_init(dd);
+
 	/*
 	 * if rebuild pending, start the service thread, and delay the block
 	 * queue creation and add_disk()
@@ -4068,6 +4146,7 @@ skip_create_disk:
 	/* Enable the block device and add it to /dev */
 	add_disk(dd->disk);
 
+	dd->bdev = bdget_disk(dd->disk, 0);
 	/*
 	 * Now that the disk is active, initialize any sysfs attributes
 	 * managed by the protocol layer.
@@ -4077,7 +4156,6 @@ skip_create_disk:
 		mtip_hw_sysfs_init(dd, kobj);
 		kobject_put(kobj);
 	}
-	mtip_hw_debugfs_init(dd);
 
 	if (dd->mtip_svc_handler) {
 		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
@@ -4103,7 +4181,8 @@ start_service_thread:
 	return rv;
 
 kthread_run_error:
-	mtip_hw_debugfs_exit(dd);
+	bdput(dd->bdev);
+	dd->bdev = NULL;
 
 	/* Delete our gendisk. This also removes the device from /dev */
 	del_gendisk(dd->disk);
@@ -4112,6 +4191,7 @@ read_capacity_error:
 	blk_cleanup_queue(dd->queue);
 
 block_queue_alloc_init_error:
+	mtip_hw_debugfs_exit(dd);
 disk_index_error:
 	spin_lock(&rssd_index_lock);
 	ida_remove(&rssd_index_ida, index);
@@ -4141,40 +4221,48 @@ static int mtip_block_remove(struct driver_data *dd)
 {
 	struct kobject *kobj;
 
-	if (dd->mtip_svc_handler) {
-		set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
-		wake_up_interruptible(&dd->port->svc_wait);
-		kthread_stop(dd->mtip_svc_handler);
-	}
+	if (!dd->sr) {
+		mtip_hw_debugfs_exit(dd);
 
-	/* Clean up the sysfs attributes, if created */
-	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
-		kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
-		if (kobj) {
-			mtip_hw_sysfs_exit(dd, kobj);
-			kobject_put(kobj);
+		if (dd->mtip_svc_handler) {
+			set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+			wake_up_interruptible(&dd->port->svc_wait);
+			kthread_stop(dd->mtip_svc_handler);
 		}
-	}
-	mtip_hw_debugfs_exit(dd);
 
-	/*
-	 * Delete our gendisk structure. This also removes the device
-	 * from /dev
-	 */
-	if (dd->disk) {
-		if (dd->disk->queue)
-			del_gendisk(dd->disk);
-		else
-			put_disk(dd->disk);
-	}
-
-	spin_lock(&rssd_index_lock);
-	ida_remove(&rssd_index_ida, dd->index);
-	spin_unlock(&rssd_index_lock);
+		/* Clean up the sysfs attributes, if created */
+		if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+			kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+			if (kobj) {
+				mtip_hw_sysfs_exit(dd, kobj);
+				kobject_put(kobj);
+			}
+		}
+		/*
+		 * Delete our gendisk structure. This also removes the device
+		 * from /dev
+		 */
+		if (dd->bdev) {
+			bdput(dd->bdev);
+			dd->bdev = NULL;
+		}
+		if (dd->disk) {
+			if (dd->disk->queue) {
+				del_gendisk(dd->disk);
+				blk_cleanup_queue(dd->queue);
+				dd->queue = NULL;
+			} else
+				put_disk(dd->disk);
+		}
+		dd->disk  = NULL;
 
-	blk_cleanup_queue(dd->queue);
-	dd->disk  = NULL;
-	dd->queue = NULL;
+		spin_lock(&rssd_index_lock);
+		ida_remove(&rssd_index_ida, dd->index);
+		spin_unlock(&rssd_index_lock);
+	} else {
+		dev_info(&dd->pdev->dev, "device %s surprise removal\n",
+						dd->disk->disk_name);
+	}
 
 	/* De-initialize the protocol layer. */
 	mtip_hw_exit(dd);
@@ -4490,8 +4578,7 @@ done:
 static void mtip_pci_remove(struct pci_dev *pdev)
 {
 	struct driver_data *dd = pci_get_drvdata(pdev);
-	int counter = 0;
-	unsigned long flags;
+	unsigned long flags, to;
 
 	set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
 
@@ -4500,17 +4587,22 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 	list_add(&dd->remove_list, &removing_list);
 	spin_unlock_irqrestore(&dev_lock, flags);
 
-	if (mtip_check_surprise_removal(pdev)) {
-		while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
-			counter++;
-			msleep(20);
-			if (counter == 10) {
-				/* Cleanup the outstanding commands */
-				mtip_command_cleanup(dd);
-				break;
-			}
-		}
+	mtip_check_surprise_removal(pdev);
+	synchronize_irq(dd->pdev->irq);
+
+	/* Spin until workers are done */
+	to = jiffies + msecs_to_jiffies(4000);
+	do {
+		msleep(20);
+	} while (atomic_read(&dd->irq_workers_active) != 0 &&
+		time_before(jiffies, to));
+
+	if (atomic_read(&dd->irq_workers_active) != 0) {
+		dev_warn(&dd->pdev->dev,
+			"Completion workers still active!\n");
 	}
+	/* Cleanup the outstanding commands */
+	mtip_command_cleanup(dd);
 
 	/* Clean up the block layer. */
 	mtip_block_remove(dd);
@@ -4529,8 +4621,15 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 	list_del_init(&dd->remove_list);
 	spin_unlock_irqrestore(&dev_lock, flags);
 
-	kfree(dd);
+	if (!dd->sr)
+		kfree(dd);
+	else
+		set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag);
+
 	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+	pci_set_drvdata(pdev, NULL);
+	pci_dev_put(pdev);
+
 }
 
 /*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 3bb8a295fbe..9be7a1582ad 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -140,6 +140,7 @@ enum {
 	MTIP_PF_SVC_THD_ACTIVE_BIT  = 4,
 	MTIP_PF_ISSUE_CMDS_BIT      = 5,
 	MTIP_PF_REBUILD_BIT         = 6,
+	MTIP_PF_SR_CLEANUP_BIT      = 7,
 	MTIP_PF_SVC_THD_STOP_BIT    = 8,
 
 	/* below are bit numbers in 'dd_flag' defined in driver_data */
@@ -147,15 +148,18 @@ enum {
 	MTIP_DDF_REMOVE_PENDING_BIT = 1,
 	MTIP_DDF_OVER_TEMP_BIT      = 2,
 	MTIP_DDF_WRITE_PROTECT_BIT  = 3,
-	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
-				(1 << MTIP_DDF_SEC_LOCK_BIT) |
-				(1 << MTIP_DDF_OVER_TEMP_BIT) |
-				(1 << MTIP_DDF_WRITE_PROTECT_BIT)),
-
+	MTIP_DDF_REMOVE_DONE_BIT    = 4,
 	MTIP_DDF_CLEANUP_BIT        = 5,
 	MTIP_DDF_RESUME_BIT         = 6,
 	MTIP_DDF_INIT_DONE_BIT      = 7,
 	MTIP_DDF_REBUILD_FAILED_BIT = 8,
+
+	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
+				(1 << MTIP_DDF_SEC_LOCK_BIT) |
+				(1 << MTIP_DDF_OVER_TEMP_BIT) |
+				(1 << MTIP_DDF_WRITE_PROTECT_BIT) |
+				(1 << MTIP_DDF_REBUILD_FAILED_BIT)),
+
 };
 
 struct smart_attr {
@@ -499,6 +503,8 @@ struct driver_data {
 
 	bool trim_supp; /* flag indicating trim support */
 
+	bool sr;
+
 	int numa_node; /* NUMA support */
 
 	char workq_name[32];
@@ -511,6 +517,8 @@ struct driver_data {
 
 	int isr_binding;
 
+	struct block_device *bdev;
+
 	int unal_qdepth; /* qdepth of unaligned IO queue */
 
 	struct list_head online_list; /* linkage for online list */
-- 
cgit v1.2.3-18-g5258


From c8afd0dcbd14e2352258f2e2d359b36d0edd459f Mon Sep 17 00:00:00 2001
From: David Milburn <dmilburn@redhat.com>
Date: Thu, 23 May 2013 16:23:45 -0500
Subject: mtip32xx: dynamically allocate buffer in debugfs functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dynamically allocate buf to prevent warnings:

drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_hw_read_device_status’:
drivers/block/mtip32xx/mtip32xx.c:2823: warning: the frame size of 1056 bytes is larger than 1024 bytes
drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_hw_read_registers’:
drivers/block/mtip32xx/mtip32xx.c:2894: warning: the frame size of 1056 bytes is larger than 1024 bytes
drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_hw_read_flags’:
drivers/block/mtip32xx/mtip32xx.c:2917: warning: the frame size of 1056 bytes is larger than 1024 bytes

Signed-off-by: David Milburn <dmilburn@redhat.com>
Acked-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 47 ++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 76f3bc4f0c2..050c71267f1 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2809,34 +2809,51 @@ static ssize_t show_device_status(struct device_driver *drv, char *buf)
 static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf,
 						size_t len, loff_t *offset)
 {
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
 	int size = *offset;
-	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	char *buf;
+	int rv = 0;
 
 	if (!len || *offset)
 		return 0;
 
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: status buffer\n");
+		return -ENOMEM;
+	}
+
 	size += show_device_status(NULL, buf);
 
 	*offset = size <= len ? size : len;
 	size = copy_to_user(ubuf, buf, *offset);
 	if (size)
-		return -EFAULT;
+		rv = -EFAULT;
 
-	return *offset;
+	kfree(buf);
+	return rv ? rv : *offset;
 }
 
 static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
 				  size_t len, loff_t *offset)
 {
 	struct driver_data *dd =  (struct driver_data *)f->private_data;
-	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	char *buf;
 	u32 group_allocated;
 	int size = *offset;
-	int n;
+	int n, rv = 0;
 
 	if (!len || size)
 		return 0;
 
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: register buffer\n");
+		return -ENOMEM;
+	}
+
 	size += sprintf(&buf[size], "H/ S ACTive      : [ 0x");
 
 	for (n = dd->slot_groups-1; n >= 0; n--)
@@ -2891,21 +2908,30 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
 	*offset = size <= len ? size : len;
 	size = copy_to_user(ubuf, buf, *offset);
 	if (size)
-		return -EFAULT;
+		rv = -EFAULT;
 
-	return *offset;
+	kfree(buf);
+	return rv ? rv : *offset;
 }
 
 static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
 				  size_t len, loff_t *offset)
 {
 	struct driver_data *dd =  (struct driver_data *)f->private_data;
-	char buf[MTIP_DFS_MAX_BUF_SIZE];
+	char *buf;
 	int size = *offset;
+	int rv = 0;
 
 	if (!len || size)
 		return 0;
 
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf) {
+		dev_err(&dd->pdev->dev,
+			"Memory allocation: flag buffer\n");
+		return -ENOMEM;
+	}
+
 	size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
 							dd->port->flags);
 	size += sprintf(&buf[size], "Flag-dd   : [ %08lX ]\n",
@@ -2914,9 +2940,10 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
 	*offset = size <= len ? size : len;
 	size = copy_to_user(ubuf, buf, *offset);
 	if (size)
-		return -EFAULT;
+		rv = -EFAULT;
 
-	return *offset;
+	kfree(buf);
+	return rv ? rv : *offset;
 }
 
 static const struct file_operations mtip_device_status_fops = {
-- 
cgit v1.2.3-18-g5258


From 0317cd6de852a70e0374e7eb40a013072274386f Mon Sep 17 00:00:00 2001
From: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Date: Fri, 27 Sep 2013 20:42:50 -0600
Subject: rsxx: Kernel Panic caused by mapping Discards

This fixes a kernel panic injected by commit id
8d26750143341831bc312f61c5ed141eeb75b8d0 where discards
are getting mapped through the pci_map_page function call.

The driver will now start verifying that a dma is not a
discard before issuing a the pci_map_page function call.

Also, we are updating the driver version.

Signed-off-by: Philip J Kelleher <pjk1939@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/dma.c       | 43 ++++++++++++++++++++++--------------------
 drivers/block/rsxx/rsxx_priv.h |  2 +-
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 34fd1018c8e..4103601ae67 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -434,26 +434,29 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 			continue;
 		}
 
-		if (dma->cmd == HW_CMD_BLK_WRITE)
-			dir = PCI_DMA_TODEVICE;
-		else
-			dir = PCI_DMA_FROMDEVICE;
-
-		/*
-		 * The function pci_map_page is placed here because we can
-		 * only, by design, issue up to 255 commands to the hardware
-		 * at one time per DMA channel. So the maximum amount of mapped
-		 * memory would be 255 * 4 channels * 4096 Bytes which is less
-		 * than 2GB, the limit of a x8 Non-HWWD PCIe slot. This way the
-		 * pci_map_page function should never fail because of a
-		 * lack of mappable memory.
-		 */
-		dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
-				     dma->pg_off, dma->sub_page.cnt << 9, dir);
-		if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
-			push_tracker(ctrl->trackers, tag);
-			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
-			continue;
+		if (dma->cmd != HW_CMD_BLK_DISCARD) {
+			if (dma->cmd == HW_CMD_BLK_WRITE)
+				dir = PCI_DMA_TODEVICE;
+			else
+				dir = PCI_DMA_FROMDEVICE;
+
+			/*
+			 * The function pci_map_page is placed here because we
+			 * can only, by design, issue up to 255 commands to the
+			 * hardware at one time per DMA channel. So the maximum
+			 * amount of mapped memory would be 255 * 4 channels *
+			 * 4096 Bytes which is less than 2GB, the limit of a x8
+			 * Non-HWWD PCIe slot. This way the pci_map_page
+			 * function should never fail because of a lack of
+			 * mappable memory.
+			 */
+			dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
+					dma->pg_off, dma->sub_page.cnt << 9, dir);
+			if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+				push_tracker(ctrl->trackers, tag);
+				rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+				continue;
+			}
 		}
 
 		set_tracker_dma(ctrl->trackers, tag, dma);
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 82779058e8e..913740e53d3 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -52,7 +52,7 @@ struct proc_cmd;
 #define RS70_PCI_REV_SUPPORTED	4
 
 #define DRIVER_NAME "rsxx"
-#define DRIVER_VERSION "4.0"
+#define DRIVER_VERSION "4.0.1.2498"
 
 /* Block size is 4096 */
 #define RSXX_HW_BLK_SHIFT		12
-- 
cgit v1.2.3-18-g5258


From e67f86b31ae5be8a88bec27b5ecb18dc2ffc9c56 Mon Sep 17 00:00:00 2001
From: Akhil Bhansali <abhansali@stec-inc.com>
Date: Tue, 15 Oct 2013 14:19:07 -0600
Subject: Add support for sTec's pci-e flash card Kronos

Signed-off-by: Akhil Bhansali <abhansali@stec-inc.com>
Signed-off-by: Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>

Folded patch, contributions to clean up this driver from:

Jens Axboe
Dan Carpenter
Andrew Morton

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig     |   10 +
 drivers/block/Makefile    |    2 +
 drivers/block/skd_main.c  | 5817 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/block/skd_s1120.h |  354 +++
 4 files changed, 6183 insertions(+)
 create mode 100644 drivers/block/skd_main.c
 create mode 100644 drivers/block/skd_s1120.h

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e07a5fd58ad..555aed0b50d 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -316,6 +316,16 @@ config BLK_DEV_NVME
 	  To compile this driver as a module, choose M here: the
 	  module will be called nvme.
 
+config BLK_DEV_SKD
+	tristate "STEC S1120 Block Driver"
+	depends on PCI
+	depends on 64BIT
+	---help---
+	Saying Y or M here will enable support for the
+	STEC, Inc. S1120 PCIe SSD.
+
+	Use device /dev/skd$N amd /dev/skd$Np$M.
+
 config BLK_DEV_OSD
 	tristate "OSD object-as-blkdev support"
 	depends on SCSI_OSD_ULD
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index ca07399a8d9..f33b3669428 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 obj-$(CONFIG_BLK_DEV_NVME)	+= nvme.o
+obj-$(CONFIG_BLK_DEV_SKD)	+= skd.o
 obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
@@ -43,4 +44,5 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 
 nvme-y		:= nvme-core.o nvme-scsi.o
+skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
new file mode 100644
index 00000000000..3110f68eced
--- /dev/null
+++ b/drivers/block/skd_main.c
@@ -0,0 +1,5817 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ * Gordoni Waidhofer <gwaidhofer@stec-inc.com>
+ * Initial Driver Design!
+ * Thomas Swann <tswann@stec-inc.com>
+ * Interrupt handling.
+ * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
+ * biomode implementation.
+ * Akhil Bhansali <abhansali@stec-inc.com>
+ * Added support for DISCARD / FLUSH and FUA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/hdreg.h>
+#include <linux/dma-mapping.h>
+#include <linux/completion.h>
+#include <linux/scatterlist.h>
+#include <linux/version.h>
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/aer.h>
+#include <linux/ctype.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/sg.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <asm-generic/unaligned.h>
+
+#include "skd_s1120.h"
+
+static int skd_dbg_level;
+static int skd_isr_comp_limit = 4;
+
+enum {
+	STEC_LINK_2_5GTS = 0,
+	STEC_LINK_5GTS = 1,
+	STEC_LINK_8GTS = 2,
+	STEC_LINK_UNKNOWN = 0xFF
+};
+
+enum {
+	SKD_FLUSH_INITIALIZER,
+	SKD_FLUSH_ZERO_SIZE_FIRST,
+	SKD_FLUSH_DATA_SECOND,
+};
+
+#define DPRINTK(skdev, fmt, args ...) \
+	do { \
+		if (unlikely((skdev)->dbg_level > 0)) {	\
+			pr_err("%s:%s:%d " fmt, (skdev)->name,	\
+			       __func__, __LINE__, ## args); \
+		} \
+	} while (0)
+
+#define SKD_ASSERT(expr) \
+	do { \
+		if (unlikely(!(expr))) { \
+			pr_err("Assertion failed! %s,%s,%s,line=%d\n",	\
+			       # expr, __FILE__, __func__, __LINE__); \
+		} \
+	} while (0)
+
+#define VPRINTK(skdev, fmt, args ...) \
+	do { \
+		if (unlikely((skdev)->dbg_level > 1)) {	\
+			pr_err("%s:%s:%d " fmt, (skdev)->name,	\
+			       __func__, __LINE__, ## args); \
+		} \
+	} while (0)
+
+
+#define DRV_NAME "skd"
+#define DRV_VERSION "2.2.1"
+#define DRV_BUILD_ID "0260"
+#define PFX DRV_NAME ": "
+#define DRV_BIN_VERSION 0x100
+#define DRV_VER_COMPL   "2.2.1." DRV_BUILD_ID
+
+MODULE_AUTHOR("bug-reports: support@stec-inc.com");
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("STEC s1120 PCIe SSD block/BIO driver (b" DRV_BUILD_ID ")");
+MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID);
+
+#define PCI_VENDOR_ID_STEC      0x1B39
+#define PCI_DEVICE_ID_S1120     0x0001
+
+#define SKD_FUA_NV		(1 << 1)
+#define SKD_MINORS_PER_DEVICE   16
+
+#define SKD_MAX_QUEUE_DEPTH     200u
+
+#define SKD_PAUSE_TIMEOUT       (5 * 1000)
+
+#define SKD_N_FITMSG_BYTES      (512u)
+
+#define SKD_N_SPECIAL_CONTEXT   32u
+#define SKD_N_SPECIAL_FITMSG_BYTES      (128u)
+
+/* SG elements are 32 bytes, so we can make this 4096 and still be under the
+ * 128KB limit.  That allows 4096*4K = 16M xfer size
+ */
+#define SKD_N_SG_PER_REQ_DEFAULT 256u
+#define SKD_N_SG_PER_SPECIAL    256u
+
+#define SKD_N_COMPLETION_ENTRY  256u
+#define SKD_N_READ_CAP_BYTES    (8u)
+
+#define SKD_N_INTERNAL_BYTES    (512u)
+
+/* 5 bits of uniqifier, 0xF800 */
+#define SKD_ID_INCR             (0x400)
+#define SKD_ID_TABLE_MASK       (3u << 8u)
+#define  SKD_ID_RW_REQUEST      (0u << 8u)
+#define  SKD_ID_INTERNAL        (1u << 8u)
+#define  SKD_ID_SPECIAL_REQUEST (2u << 8u)
+#define  SKD_ID_FIT_MSG         (3u << 8u)
+#define SKD_ID_SLOT_MASK        0x00FFu
+#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu
+
+#define SKD_N_TIMEOUT_SLOT      4u
+#define SKD_TIMEOUT_SLOT_MASK   3u
+
+#define SKD_N_MAX_SECTORS 2048u
+
+#define SKD_MAX_RETRIES 2u
+
+#define SKD_TIMER_SECONDS(seconds) (seconds)
+#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60))
+
+#define INQ_STD_NBYTES 36
+#define SKD_DISCARD_CDB_LENGTH	24
+
+enum skd_drvr_state {
+	SKD_DRVR_STATE_LOAD,
+	SKD_DRVR_STATE_IDLE,
+	SKD_DRVR_STATE_BUSY,
+	SKD_DRVR_STATE_STARTING,
+	SKD_DRVR_STATE_ONLINE,
+	SKD_DRVR_STATE_PAUSING,
+	SKD_DRVR_STATE_PAUSED,
+	SKD_DRVR_STATE_DRAINING_TIMEOUT,
+	SKD_DRVR_STATE_RESTARTING,
+	SKD_DRVR_STATE_RESUMING,
+	SKD_DRVR_STATE_STOPPING,
+	SKD_DRVR_STATE_FAULT,
+	SKD_DRVR_STATE_DISAPPEARED,
+	SKD_DRVR_STATE_PROTOCOL_MISMATCH,
+	SKD_DRVR_STATE_BUSY_ERASE,
+	SKD_DRVR_STATE_BUSY_SANITIZE,
+	SKD_DRVR_STATE_BUSY_IMMINENT,
+	SKD_DRVR_STATE_WAIT_BOOT,
+	SKD_DRVR_STATE_SYNCING,
+};
+
+#define SKD_WAIT_BOOT_TIMO      SKD_TIMER_SECONDS(90u)
+#define SKD_STARTING_TIMO       SKD_TIMER_SECONDS(8u)
+#define SKD_RESTARTING_TIMO     SKD_TIMER_MINUTES(4u)
+#define SKD_DRAINING_TIMO       SKD_TIMER_SECONDS(6u)
+#define SKD_BUSY_TIMO           SKD_TIMER_MINUTES(20u)
+#define SKD_STARTED_BUSY_TIMO   SKD_TIMER_SECONDS(60u)
+#define SKD_START_WAIT_SECONDS  90u
+
+enum skd_req_state {
+	SKD_REQ_STATE_IDLE,
+	SKD_REQ_STATE_SETUP,
+	SKD_REQ_STATE_BUSY,
+	SKD_REQ_STATE_COMPLETED,
+	SKD_REQ_STATE_TIMEOUT,
+	SKD_REQ_STATE_ABORTED,
+};
+
+enum skd_fit_msg_state {
+	SKD_MSG_STATE_IDLE,
+	SKD_MSG_STATE_BUSY,
+};
+
+enum skd_check_status_action {
+	SKD_CHECK_STATUS_REPORT_GOOD,
+	SKD_CHECK_STATUS_REPORT_SMART_ALERT,
+	SKD_CHECK_STATUS_REQUEUE_REQUEST,
+	SKD_CHECK_STATUS_REPORT_ERROR,
+	SKD_CHECK_STATUS_BUSY_IMMINENT,
+};
+
+struct skd_fitmsg_context {
+	enum skd_fit_msg_state state;
+
+	struct skd_fitmsg_context *next;
+
+	u32 id;
+	u16 outstanding;
+
+	u32 length;
+	u32 offset;
+
+	u8 *msg_buf;
+	dma_addr_t mb_dma_address;
+};
+
+struct skd_request_context {
+	enum skd_req_state state;
+
+	struct skd_request_context *next;
+
+	u16 id;
+	u32 fitmsg_id;
+
+	struct request *req;
+	struct bio *bio;
+	unsigned long start_time;
+	u8 flush_cmd;
+	u8 discard_page;
+
+	u32 timeout_stamp;
+	u8 sg_data_dir;
+	struct scatterlist *sg;
+	u32 n_sg;
+	u32 sg_byte_count;
+
+	struct fit_sg_descriptor *sksg_list;
+	dma_addr_t sksg_dma_address;
+
+	struct fit_completion_entry_v1 completion;
+
+	struct fit_comp_error_info err_info;
+
+};
+#define SKD_DATA_DIR_HOST_TO_CARD       1
+#define SKD_DATA_DIR_CARD_TO_HOST       2
+#define SKD_DATA_DIR_NONE		3	/* especially for DISCARD requests. */
+
+struct skd_special_context {
+	struct skd_request_context req;
+
+	u8 orphaned;
+
+	void *data_buf;
+	dma_addr_t db_dma_address;
+
+	u8 *msg_buf;
+	dma_addr_t mb_dma_address;
+};
+
+struct skd_sg_io {
+	fmode_t mode;
+	void __user *argp;
+
+	struct sg_io_hdr sg;
+
+	u8 cdb[16];
+
+	u32 dxfer_len;
+	u32 iovcnt;
+	struct sg_iovec *iov;
+	struct sg_iovec no_iov_iov;
+
+	struct skd_special_context *skspcl;
+};
+
+typedef enum skd_irq_type {
+	SKD_IRQ_LEGACY,
+	SKD_IRQ_MSI,
+	SKD_IRQ_MSIX
+} skd_irq_type_t;
+
+#define SKD_MAX_BARS                    2
+
+struct skd_device {
+	volatile void __iomem *mem_map[SKD_MAX_BARS];
+	resource_size_t mem_phys[SKD_MAX_BARS];
+	u32 mem_size[SKD_MAX_BARS];
+
+	skd_irq_type_t irq_type;
+	u32 msix_count;
+	struct skd_msix_entry *msix_entries;
+
+	struct pci_dev *pdev;
+	int pcie_error_reporting_is_enabled;
+
+	spinlock_t lock;
+	struct gendisk *disk;
+	struct request_queue *queue;
+	struct device *class_dev;
+	int gendisk_on;
+	int sync_done;
+
+	atomic_t device_count;
+	u32 devno;
+	u32 major;
+	char name[32];
+	char isr_name[30];
+
+	enum skd_drvr_state state;
+	u32 drive_state;
+
+	u32 in_flight;
+	u32 cur_max_queue_depth;
+	u32 queue_low_water_mark;
+	u32 dev_max_queue_depth;
+
+	u32 num_fitmsg_context;
+	u32 num_req_context;
+
+	u32 timeout_slot[SKD_N_TIMEOUT_SLOT];
+	u32 timeout_stamp;
+	struct skd_fitmsg_context *skmsg_free_list;
+	struct skd_fitmsg_context *skmsg_table;
+
+	struct skd_request_context *skreq_free_list;
+	struct skd_request_context *skreq_table;
+
+	struct skd_special_context *skspcl_free_list;
+	struct skd_special_context *skspcl_table;
+
+	struct skd_special_context internal_skspcl;
+	u32 read_cap_blocksize;
+	u32 read_cap_last_lba;
+	int read_cap_is_valid;
+	int inquiry_is_valid;
+	u8 inq_serial_num[13];  /*12 chars plus null term */
+	u8 id_str[80];          /* holds a composite name (pci + sernum) */
+
+	u8 skcomp_cycle;
+	u32 skcomp_ix;
+	struct fit_completion_entry_v1 *skcomp_table;
+	struct fit_comp_error_info *skerr_table;
+	dma_addr_t cq_dma_address;
+
+	wait_queue_head_t waitq;
+
+	struct timer_list timer;
+	u32 timer_countdown;
+	u32 timer_substate;
+
+	int n_special;
+	int sgs_per_request;
+	u32 last_mtd;
+
+	u32 proto_ver;
+
+	int dbg_level;
+	u32 connect_time_stamp;
+	int connect_retries;
+#define SKD_MAX_CONNECT_RETRIES 16
+	u32 drive_jiffies;
+
+	u32 timo_slot;
+
+
+	struct work_struct completion_worker;
+
+	struct bio_list bio_queue;
+	int queue_stopped;
+
+	struct list_head flush_list;
+};
+
+#define SKD_FLUSH_JOB   "skd-flush-jobs"
+struct kmem_cache *skd_flush_slab;
+
+/*
+ * These commands hold "nonzero size FLUSH bios",
+ * which are enqueud in skdev->flush_list during
+ * completion of "zero size FLUSH commands".
+ * It will be active in biomode.
+ */
+struct skd_flush_cmd {
+	void *cmd;
+	struct list_head flist;
+};
+
+#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
+#define SKD_READL(DEV, OFF)      skd_reg_read32(DEV, OFF)
+#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
+
+static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
+{
+	u32 val;
+
+	if (likely(skdev->dbg_level < 2))
+		return readl(skdev->mem_map[1] + offset);
+	else {
+		barrier();
+		val = readl(skdev->mem_map[1] + offset);
+		barrier();
+		VPRINTK(skdev, "offset %x = %x\n", offset, val);
+		return val;
+	}
+
+}
+
+static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
+				   u32 offset)
+{
+	if (likely(skdev->dbg_level < 2)) {
+		writel(val, skdev->mem_map[1] + offset);
+		barrier();
+		readl(skdev->mem_map[1] + offset);
+		barrier();
+	} else {
+		barrier();
+		writel(val, skdev->mem_map[1] + offset);
+		barrier();
+		readl(skdev->mem_map[1] + offset);
+		barrier();
+		VPRINTK(skdev, "offset %x = %x\n", offset, val);
+	}
+}
+
+static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
+				   u32 offset)
+{
+	if (likely(skdev->dbg_level < 2)) {
+		writeq(val, skdev->mem_map[1] + offset);
+		barrier();
+		readq(skdev->mem_map[1] + offset);
+		barrier();
+	} else {
+		barrier();
+		writeq(val, skdev->mem_map[1] + offset);
+		barrier();
+		readq(skdev->mem_map[1] + offset);
+		barrier();
+		VPRINTK(skdev, "offset %x = %016llx\n", offset, val);
+	}
+}
+
+
+#define SKD_IRQ_DEFAULT