diff options
-rw-r--r-- | Documentation/ioctl/ioctl-number.txt | 1 | ||||
-rw-r--r-- | drivers/block/Kconfig | 11 | ||||
-rw-r--r-- | drivers/block/Makefile | 1 | ||||
-rw-r--r-- | drivers/block/nvme.c | 1745 | ||||
-rw-r--r-- | drivers/xen/biomerge.c | 1 | ||||
-rw-r--r-- | include/linux/nvme.h | 434 |
6 files changed, 2193 insertions, 0 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 54078ed96b3..4840334ea97 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -149,6 +149,7 @@ Code Seq#(hex) Include File Comments 'M' 01-03 drivers/scsi/megaraid/megaraid_sas.h 'M' 00-0F drivers/video/fsl-diu-fb.h conflict! 'N' 00-1F drivers/usb/scanner.h +'N' 40-7F drivers/block/nvme.c 'O' 00-06 mtd/ubi-user.h UBI 'P' all linux/soundcard.h conflict! 'P' 60-6F sound/sscape_ioctl.h conflict! diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a30aa103f95..4e4c8a4a5fd 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -317,6 +317,17 @@ config BLK_DEV_NBD If unsure, say N. +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + config BLK_DEV_OSD tristate "OSD object-as-blkdev support" depends on SCSI_OSD_ULD diff --git a/drivers/block/Makefile b/drivers/block/Makefile index ad7b74a44ef..5b795059f8f 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c new file mode 100644 index 00000000000..f4996b0e4b1 --- /dev/null +++ b/drivers/block/nvme.c @@ -0,0 +1,1745 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/nvme.h> +#include <linux/bio.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kdev_t.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/pci.h> +#include <linux/poison.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/version.h> + +#define NVME_Q_DEPTH 1024 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define NVME_MINORS 64 +#define NVME_IO_TIMEOUT (5 * HZ) +#define ADMIN_TIMEOUT (60 * HZ) + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + +static DEFINE_SPINLOCK(dev_list_lock); +static LIST_HEAD(dev_list); +static struct task_struct *nvme_thread; + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + int instance; + int queue_count; + int db_stride; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + struct nvme_dev *dev; + spinlock_t q_lock; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + wait_queue_head_t sq_full; + wait_queue_t sq_cong_wait; + struct bio_list sq_cong; + u32 __iomem *q_db; + u16 q_depth; + u16 cq_vector; + u16 sq_head; + u16 sq_tail; + u16 cq_head; + u16 cq_phase; + unsigned long cmdid_data[]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); +} + +typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, + struct nvme_completion *); + +struct nvme_cmd_info { + nvme_completion_fn fn; + void *ctx; + unsigned long timeout; +}; + +static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +{ + return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; +} + +/** + * alloc_cmdid() - Allocate a Command ID + * @nvmeq: The queue that will be used for this command + * @ctx: A pointer that will be passed to the handler + * @handler: The function to call on completion + * + * Allocate a Command ID for a queue. The data passed in will + * be passed to the completion handler. This is implemented by using + * the bottom two bits of the ctx pointer to store the handler ID. + * Passing in a pointer that's not 4-byte aligned will cause a BUG. + * We can change this if it becomes a problem. + * + * May be called with local interrupts disabled and the q_lock held, + * or with interrupts enabled and no locks held. + */ +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + int cmdid; + + do { + cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); + if (cmdid >= depth) + return -EBUSY; + } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + + info[cmdid].fn = handler; + info[cmdid].ctx = ctx; + info[cmdid].timeout = jiffies + timeout; + return cmdid; +} + +static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) +{ + int cmdid; + wait_event_killable(nvmeq->sq_full, + (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); + return (cmdid < 0) ? -EINTR : cmdid; +} + +/* Special values must be less than 0x1000 */ +#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) +#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) +#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) + +static void special_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + if (ctx == CMD_CTX_CANCELLED) + return; + if (ctx == CMD_CTX_FLUSH) + return; + if (ctx == CMD_CTX_COMPLETED) { + dev_warn(&dev->pci_dev->dev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + if (ctx == CMD_CTX_INVALID) { + dev_warn(&dev->pci_dev->dev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + + dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) +{ + void *ctx; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + + if (cmdid >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; + info[cmdid].ctx = CMD_CTX_COMPLETED; + clear_bit(cmdid, nvmeq->cmdid_data); + wake_up(&nvmeq->sq_full); + return ctx; +} + +static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) +{ + void *ctx; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + if (fn) + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; + info[cmdid].ctx = CMD_CTX_CANCELLED; + return ctx; +} + +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +{ + return dev->queues[get_cpu() + 1]; +} + +static void put_nvmeq(struct nvme_queue *nvmeq) +{ + put_cpu(); +} + +/** + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + u16 tail; + spin_lock_irqsave(&nvmeq->q_lock, flags); + tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + if (++tail == nvmeq->q_depth) + tail = 0; + writel(tail, nvmeq->q_db); + nvmeq->sq_tail = tail; + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; +} + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist sg[0]; +}; + +static __le64 **iod_list(struct nvme_iod *iod) +{ + return ((void *)iod) + iod->offset; +} + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size) +{ + unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static struct nvme_iod * +nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +{ + struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + + sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(struct scatterlist) * nseg, gfp); + + if (iod) { + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + } + + return iod; +} + +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +{ + const int last_prp = PAGE_SIZE / 8 - 1; + int i; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma = iod->first_dma; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); + prp_dma = next_prp_dma; + } + kfree(iod); +} + +static void requeue_bio(struct nvme_dev *dev, struct bio *bio) +{ + struct nvme_queue *nvmeq = get_nvmeq(dev); + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + put_nvmeq(nvmeq); + wake_up_process(nvme_thread); +} + +static void bio_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + struct nvme_iod *iod = ctx; + struct bio *bio = iod->private; + u16 status = le16_to_cpup(&cqe->status) >> 1; + + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + nvme_free_iod(dev, iod); + if (status) { + bio_endio(bio, -EIO); + } else if (bio->bi_vcnt > bio->bi_idx) { + requeue_bio(dev, bio); + } else { + bio_endio(bio, 0); + } +} + +/* length is in bytes. gfp flags indicates whether we may sleep. */ +static int nvme_setup_prps(struct nvme_dev *dev, + struct nvme_common_command *cmd, struct nvme_iod *iod, + int total_len, gfp_t gfp) +{ + struct dma_pool *pool; + int length = total_len; + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = offset_in_page(dma_addr); + __le64 *prp_list; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma; + int nprps, i; + + cmd->prp1 = cpu_to_le64(dma_addr); + length -= (PAGE_SIZE - offset); + if (length <= 0) + return total_len; + + dma_len -= (PAGE_SIZE - offset); + if (dma_len) { + dma_addr += (PAGE_SIZE - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= PAGE_SIZE) { + cmd->prp2 = cpu_to_le64(dma_addr); + return total_len; + } + + nprps = DIV_ROUND_UP(length, PAGE_SIZE); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + cmd->prp2 = cpu_to_le64(dma_addr); + iod->npages = -1; + return (total_len - length) + PAGE_SIZE; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + cmd->prp2 = cpu_to_le64(prp_dma); + i = 0; + for (;;) { + if (i == PAGE_SIZE / 8) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) + return total_len - length; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= PAGE_SIZE; + dma_addr += PAGE_SIZE; + length -= PAGE_SIZE; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return total_len; +} + +/* NVMe scatterlists require no holes in the virtual address */ +#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ + (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) + +static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, + struct bio *bio, enum dma_data_direction dma_dir, int psegs) +{ + struct bio_vec *bvec, *bvprv = NULL; + struct scatterlist *sg = NULL; + int i, old_idx, length = 0, nsegs = 0; + + sg_init_table(iod->sg, psegs); + old_idx = bio->bi_idx; + bio_for_each_segment(bvec, bio, i) { + if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { + sg->length += bvec->bv_len; + } else { + if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) + break; + sg = sg ? sg + 1 : iod->sg; + sg_set_page(sg, bvec->bv_page, bvec->bv_len, + bvec->bv_offset); + nsegs++; + } + length += bvec->bv_len; + bvprv = bvec; + } + bio->bi_idx = i; + iod->nents = nsegs; + sg_mark_end(sg); + if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { + bio->bi_idx = old_idx; + return -ENOMEM; + } + return length; +} + +static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, + int cmdid) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.command_id = cmdid; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +{ + int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, + special_completion, NVME_IO_TIMEOUT); + if (unlikely(cmdid < 0)) + return cmdid; + + return nvme_submit_flush(nvmeq, ns, cmdid); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_command *cmnd; + struct nvme_iod *iod; + enum dma_data_direction dma_dir; + int cmdid, length, result = -ENOMEM; + u16 control; + u32 dsmgmt; + int psegs = bio_phys_segments(ns->queue, bio); + + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + + iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); + if (!iod) + goto nomem; + iod->private = bio; + + result = -EBUSY; + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); + if (unlikely(cmdid < 0)) + goto free_iod; + + if ((bio->bi_rw & REQ_FLUSH) && !psegs) + return nvme_submit_flush(nvmeq, ns, cmdid); + + control = 0; + if (bio->bi_rw & REQ_FUA) + control |= NVME_RW_FUA; + if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + dsmgmt = 0; + if (bio->bi_rw & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + if (bio_data_dir(bio)) { + cmnd->rw.opcode = nvme_cmd_write; + dma_dir = DMA_TO_DEVICE; + } else { + cmnd->rw.opcode = nvme_cmd_read; + dma_dir = DMA_FROM_DEVICE; + } + + result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); + if (result < 0) + goto free_iod; + length = result; + + cmnd->rw.command_id = cmdid; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, + GFP_ATOMIC); + cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + + bio->bi_sector += length >> 9; + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; + + free_iod: + nvme_free_iod(nvmeq->dev, iod); + nomem: + return result; +} + +/* + * NB: return value of non-zero would mean that we were a stacking driver. + * make_request must always succeed. + */ +static int nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + int result = -EBUSY; + + spin_lock_irq(&nvmeq->q_lock); + if (bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + } + + spin_unlock_irq(&nvmeq->q_lock); + put_nvmeq(nvmeq); + + return 0; +} + +static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +{ + u16 head, phase; + + head = nvmeq->cq_head; + phase = nvmeq->cq_phase; + + for (;;) { + void *ctx; + nvme_completion_fn fn; + struct nvme_completion cqe = nvmeq->cqes[head]; + if ((le16_to_cpu(cqe.status) & 1) != phase) + break; + nvmeq->sq_head = le16_to_cpu(cqe.sq_head); + if (++head == nvmeq->q_depth) { + head = 0; + phase = !phase; + } + + ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + fn(nvmeq->dev, ctx, &cqe); + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + return IRQ_NONE; + + writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); + nvmeq->cq_head = head; + nvmeq->cq_phase = phase; + + return IRQ_HANDLED; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + result = nvme_process_cq(nvmeq); + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; + if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) + return IRQ_NONE; + return IRQ_WAKE_THREAD; +} + +static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +{ + spin_lock_irq(&nvmeq->q_lock); + cancel_cmdid(nvmeq, cmdid, NULL); + spin_unlock_irq(&nvmeq->q_lock); +} + +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd, u32 *result, unsigned timeout) +{ + int cmdid; + struct sync_cmd_info cmdinfo; + + cmdinfo.task = current; + cmdinfo.status = -EINTR; + + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, + timeout); + if (cmdid < 0) + return cmdid; + cmd->common.command_id = cmdid; + + set_current_state(TASK_KILLABLE); + nvme_submit_cmd(nvmeq, cmd); + schedule(); + + if (cmdinfo.status == -EINTR) { + nvme_abort_command(nvmeq, cmdid); + return -EINTR; + } + + if (result) + *result = cmdinfo.result; + + return cmdinfo.status; +} + +static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + int status; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, + dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_set_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +static void nvme_free_queue(struct nvme_dev *dev, int qid) +{ + struct nvme_queue *nvmeq = dev->queues[qid]; + int vector = dev->entry[nvmeq->cq_vector].vector; + + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); + + /* Don't tell the adapter to delete the admin queue */ + if (qid) { + adapter_delete_sq(dev, qid); + adapter_delete_cq(dev, qid); + } + + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int vector) +{ + struct device *dmadev = &dev->pci_dev->dev; + unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info)); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); + + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->q_dmadev = dmadev; + nvmeq->dev = dev; + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + init_waitqueue_head(&nvmeq->sq_full); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); + bio_list_init(&nvmeq->sq_cong); + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; + nvmeq->q_depth = depth; + nvmeq->cq_vector = vector; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, + name, nvmeq); + return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, name, nvmeq); +} + +static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, + int qid, int cq_size, int vector) +{ + int result; + struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + + if (!nvmeq) + return ERR_PTR(-ENOMEM); + + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + goto free_nvmeq; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = queue_request_irq(dev, nvmeq, "nvme"); + if (result < 0) + goto release_sq; + + return nvmeq; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + free_nvmeq: + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); + return ERR_PTR(result); +} + +static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + u64 cap; + unsigned long timeout; + struct nvme_queue *nvmeq; + + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + + writel(0, &dev->bar->cc); + writel(aqa, &dev->bar->aqa); + writeq(nvmeq->sq_dma_addr, &dev->bar->asq); + writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(dev->ctrl_config, &dev->bar->cc); + + cap = readq(&dev->bar->cap); + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + dev->db_stride = NVME_CAP_STRIDE(cap); + + while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + return -ENODEV; + } + } + + result = queue_request_irq(dev, nvmeq, "nvme admin"); + dev->queues[0] = nvmeq; + return result; +} + +static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length) +{ + int i, err, count, nents, offset; + struct scatterlist *sg; + struct page **pages; + struct nvme_iod *iod; + + if (addr & 3) + return ERR_PTR(-EINVAL); + if (!length) + return ERR_PTR(-EINVAL); + + offset = offset_in_page(addr); + count = DIV_ROUND_UP(offset + length, PAGE_SIZE); + pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); + + err = get_user_pages_fast(addr, count, 1, pages); + if (err < count) { + count = err; + err = -EFAULT; + goto put_pages; + } + + iod = nvme_alloc_iod(count, length, GFP_KERNEL); + sg = iod->sg; + sg_init_table(sg, count); + for (i = 0; i < count; i++) { + sg_set_page(&sg[i], pages[i], + min_t(int, length, PAGE_SIZE - offset), offset); + length -= (PAGE_SIZE - offset); + offset = 0; + } + sg_mark_end(&sg[i - 1]); + iod->nents = count; + + err = -ENOMEM; + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (!nents) + goto free_iod; + + kfree(pages); + return iod; + + free_iod: + kfree(iod); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); + kfree(pages); + return ERR_PTR(err); +} + +static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + struct nvme_iod *iod) +{ + int i; + + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + for (i = 0; i < iod->nents; i++) + put_page(sg_page(&iod->sg[i])); +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_queue *nvmeq; + struct nvme_user_io io; + struct nvme_command c; + unsigned length; + int status; + struct nvme_iod *iod; |