diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-09 16:35:00 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-05-09 16:35:00 -0700 |
commit | 2d4fe27850420606155fb1f7d18ab2b40153e67b (patch) | |
tree | 56b0d465e1189babf4ac668a5b048a747bfb9682 /drivers/block | |
parent | 2e99f3a12b20ab3afad0e042cc0bdd0ee855dca0 (diff) | |
parent | 94f370cab6e5ac514b658c6b2b3aa308cefc5c7a (diff) |
Merge git://git.infradead.org/users/willy/linux-nvme
Pull NVMe driver update from Matthew Wilcox:
"Lots of exciting new features in the NVM Express driver this time,
including support for emulating SCSI commands, discard support and the
ability to submit per-sector metadata with I/Os.
It's still mostly bugfixes though!"
* git://git.infradead.org/users/willy/linux-nvme: (27 commits)
NVMe: Use user defined admin ioctl timeout
NVMe: Simplify Firmware Activate code slightly
NVMe: Only clear the enable bit when disabling controller
NVMe: Wait for device to acknowledge shutdown
NVMe: Schedule timeout for sync commands
NVMe: Meta-data support in NVME_IOCTL_SUBMIT_IO
NVMe: Device specific stripe size handling
NVMe: Split non-mergeable bio requests
NVMe: Remove dead code in nvme_dev_add
NVMe: Check for NULL memory in nvme_dev_add
NVMe: Fix error clean-up on nvme_alloc_queue
NVMe: Free admin queue on request_irq error
NVMe: Add scsi unmap to SG_IO
NVMe: queue usage fixes in nvme-scsi
NVMe: Set TASK_INTERRUPTIBLE before processing queues
NVMe: Add a character device for each nvme device
NVMe: Fix endian-related problems in user I/O submission path
NVMe: Fix I/O cancellation status on big-endian machines
NVMe: Fix sparse warnings in scsi emulation
NVMe: Don't fail initialisation unnecessarily
...
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/Makefile | 1 | ||||
-rw-r--r-- | drivers/block/nvme-core.c (renamed from drivers/block/nvme.c) | 594 | ||||
-rw-r--r-- | drivers/block/nvme-scsi.c | 3053 |
3 files changed, 3484 insertions, 164 deletions
diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a3b40232c6a..ca07399a8d9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -42,4 +42,5 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ +nvme-y := nvme-core.o nvme-scsi.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/nvme.c b/drivers/block/nvme-core.c index 9dcefe40380..8efdfaa44a5 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme-core.c @@ -39,14 +39,13 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/types.h> - +#include <scsi/sg.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> #define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 -#define NVME_IO_TIMEOUT (5 * HZ) #define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; @@ -60,43 +59,6 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; /* - * Represents an NVM Express device. Each nvme_dev is a PCI function. - */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; - u32 __iomem *dbs; - struct pci_dev *pci_dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - int instance; - int queue_count; - int db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; - struct list_head namespaces; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u32 max_hw_sectors; -}; - -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ -struct nvme_ns { - struct list_head list; - - struct nvme_dev *dev; - struct request_queue *queue; - struct gendisk *disk; - - int ns_id; - int lba_shift; -}; - -/* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). */ @@ -131,6 +93,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); @@ -261,12 +224,12 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { return dev->queues[get_cpu() + 1]; } -static void put_nvmeq(struct nvme_queue *nvmeq) +void put_nvmeq(struct nvme_queue *nvmeq) { put_cpu(); } @@ -294,22 +257,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist sg[0]; -}; - static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -343,7 +290,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) return iod; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = PAGE_SIZE / 8 - 1; int i; @@ -361,16 +308,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void requeue_bio(struct nvme_dev *dev, struct bio *bio) -{ - struct nvme_queue *nvmeq = get_nvmeq(dev); - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - put_nvmeq(nvmeq); - wake_up_process(nvme_thread); -} - static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { @@ -382,19 +319,15 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(dev, iod); - if (status) { + if (status) bio_endio(bio, -EIO); - } else if (bio->bi_vcnt > bio->bi_idx) { - requeue_bio(dev, bio); - } else { + else bio_endio(bio, 0); - } } /* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, struct nvme_iod *iod, - int total_len, gfp_t gfp) +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, + struct nvme_iod *iod, int total_len, gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -473,43 +406,193 @@ static int nvme_setup_prps(struct nvme_dev *dev, return total_len; } +struct nvme_bio_pair { + struct bio b1, b2, *parent; + struct bio_vec *bv1, *bv2; + int err; + atomic_t cnt; +}; + +static void nvme_bio_pair_endio(struct bio *bio, int err) +{ + struct nvme_bio_pair *bp = bio->bi_private; + + if (err) + bp->err = err; + + if (atomic_dec_and_test(&bp->cnt)) { + bio_endio(bp->parent, bp->err); + if (bp->bv1) + kfree(bp->bv1); + if (bp->bv2) + kfree(bp->bv2); + kfree(bp); + } +} + +static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, + int len, int offset) +{ + struct nvme_bio_pair *bp; + + BUG_ON(len > bio->bi_size); + BUG_ON(idx > bio->bi_vcnt); + + bp = kmalloc(sizeof(*bp), GFP_ATOMIC); + if (!bp) + return NULL; + bp->err = 0; + + bp->b1 = *bio; + bp->b2 = *bio; + + bp->b1.bi_size = len; + bp->b2.bi_size -= len; + bp->b1.bi_vcnt = idx; + bp->b2.bi_idx = idx; + bp->b2.bi_sector += len >> 9; + + if (offset) { + bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), + GFP_ATOMIC); + if (!bp->bv1) + goto split_fail_1; + + bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), + GFP_ATOMIC); + if (!bp->bv2) + goto split_fail_2; + + memcpy(bp->bv1, bio->bi_io_vec, + bio->bi_max_vecs * sizeof(struct bio_vec)); + memcpy(bp->bv2, bio->bi_io_vec, + bio->bi_max_vecs * sizeof(struct bio_vec)); + + bp->b1.bi_io_vec = bp->bv1; + bp->b2.bi_io_vec = bp->bv2; + bp->b2.bi_io_vec[idx].bv_offset += offset; + bp->b2.bi_io_vec[idx].bv_len -= offset; + bp->b1.bi_io_vec[idx].bv_len = offset; + bp->b1.bi_vcnt++; + } else + bp->bv1 = bp->bv2 = NULL; + + bp->b1.bi_private = bp; + bp->b2.bi_private = bp; + + bp->b1.bi_end_io = nvme_bio_pair_endio; + bp->b2.bi_end_io = nvme_bio_pair_endio; + + bp->parent = bio; + atomic_set(&bp->cnt, 2); + + return bp; + + split_fail_2: + kfree(bp->bv1); + split_fail_1: + kfree(bp); + return NULL; +} + +static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, + int idx, int len, int offset) +{ + struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset); + if (!bp) + return -ENOMEM; + + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, &bp->b1); + bio_list_add(&nvmeq->sq_cong, &bp->b2); + + return 0; +} + /* NVMe scatterlists require no holes in the virtual address */ #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) -static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, +static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; - int i, old_idx, length = 0, nsegs = 0; + int i, length = 0, nsegs = 0, split_len = bio->bi_size; + + if (nvmeq->dev->stripe_size) + split_len = nvmeq->dev->stripe_size - + ((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1)); sg_init_table(iod->sg, psegs); - old_idx = bio->bi_idx; bio_for_each_segment(bvec, bio, i) { if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { sg->length += bvec->bv_len; } else { if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) - break; + return nvme_split_and_submit(bio, nvmeq, i, + length, 0); + sg = sg ? sg + 1 : iod->sg; sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); nsegs++; } + + if (split_len - length < bvec->bv_len) + return nvme_split_and_submit(bio, nvmeq, i, split_len, + split_len - length); length += bvec->bv_len; bvprv = bvec; } - bio->bi_idx = i; iod->nents = nsegs; sg_mark_end(sg); - if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { - bio->bi_idx = old_idx; + if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) return -ENOMEM; - } + + BUG_ON(length != bio->bi_size); return length; } +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio, struct nvme_iod *iod, int cmdid) +{ + struct nvme_dsm_range *range; + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return -ENOMEM; + + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.command_id = cmdid; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { @@ -527,7 +610,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, special_completion, NVME_IO_TIMEOUT); @@ -567,6 +650,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (unlikely(cmdid < 0)) goto free_iod; + if (bio->bi_rw & REQ_DISCARD) { + result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); + if (result) + goto free_cmdid; + return result; + } if ((bio->bi_rw & REQ_FLUSH) && !psegs) return nvme_submit_flush(nvmeq, ns, cmdid); @@ -591,8 +680,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); - if (result < 0) + result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); + if (result <= 0) goto free_cmdid; length = result; @@ -600,13 +689,11 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.nsid = cpu_to_le32(ns->ns_id); length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, GFP_ATOMIC); - cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - bio->bi_sector += length >> 9; - if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); @@ -724,8 +811,8 @@ static void sync_completion(struct nvme_dev *dev, void *ctx, * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result, unsigned timeout) +int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + u32 *result, unsigned timeout) { int cmdid; struct sync_cmd_info cmdinfo; @@ -741,7 +828,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, set_current_state(TASK_KILLABLE); nvme_submit_cmd(nvmeq, cmd); - schedule(); + schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { nvme_abort_command(nvmeq, cmdid); @@ -754,7 +841,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, return cmdinfo.status; } -static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); @@ -827,7 +914,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, +int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, dma_addr_t dma_addr) { struct nvme_command c; @@ -841,7 +928,7 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, return nvme_submit_admin_cmd(dev, &c, NULL); } -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -855,8 +942,8 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, return nvme_submit_admin_cmd(dev, &c, result); } -static int nvme_set_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr, u32 *result) +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -885,7 +972,7 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) void *ctx; nvme_completion_fn fn; static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, + .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), }; if (timeout && !time_after(now, info[cmdid].timeout)) @@ -966,7 +1053,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return nvmeq; free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: kfree(nvmeq); @@ -1021,15 +1108,60 @@ static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, return ERR_PTR(result); } +static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) +{ + unsigned long timeout; + u32 bit = enabled ? NVME_CSTS_RDY : 0; + + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + + while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + return -ENODEV; + } + } + + return 0; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) +{ + u32 cc = readl(&dev->bar->cc); + + if (cc & NVME_CC_ENABLE) + writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); + return nvme_wait_ready(dev, cap, false); +} + +static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) +{ + return nvme_wait_ready(dev, cap, true); +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { - int result = 0; + int result; u32 aqa; - u64 cap; - unsigned long timeout; + u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->db_stride = NVME_CAP_STRIDE(cap); + + result = nvme_disable_ctrl(dev, cap); + if (result < 0) + return result; nvmeq = nvme_alloc_queue(dev, 0, 64, 0); if (!nvmeq) @@ -1043,38 +1175,28 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - writel(0, &dev->bar->cc); writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); writel(dev->ctrl_config, &dev->bar->cc); - cap = readq(&dev->bar->cap); - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - dev->db_stride = NVME_CAP_STRIDE(cap); - - while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { - msleep(100); - if (fatal_signal_pending(current)) - result = -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, - "Device not ready; aborting initialisation\n"); - result = -ENODEV; - } - } - - if (result) { - nvme_free_queue_mem(nvmeq); - return result; - } + result = nvme_enable_ctrl(dev, cap); + if (result) + goto free_q; result = queue_request_irq(dev, nvmeq, "nvme admin"); + if (result) + goto free_q; + dev->queues[0] = nvmeq; return result; + + free_q: + nvme_free_queue_mem(nvmeq); + return result; } -static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length) { int i, err, count, nents, offset; @@ -1130,7 +1252,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, return ERR_PTR(err); } -static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod) { int i; @@ -1148,13 +1270,19 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_queue *nvmeq; struct nvme_user_io io; struct nvme_command c; - unsigned length; - int status; - struct nvme_iod *iod; + unsigned length, meta_len; + int status, i; + struct nvme_iod *iod, *meta_iod = NULL; + dma_addr_t meta_dma_addr; + void *meta, *uninitialized_var(meta_mem); if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + + if (meta_len && ((io.metadata & 3) || !io.metadata)) + return -EINVAL; switch (io.opcode) { case nvme_cmd_write: @@ -1176,11 +1304,42 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.slba = cpu_to_le64(io.slba); c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); - c.rw.reftag = io.reftag; - c.rw.apptag = io.apptag; - c.rw.appmask = io.appmask; - /* XXX: metadata */ + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + if (meta_len) { + meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len); + if (IS_ERR(meta_iod)) { + status = PTR_ERR(meta_iod); + meta_iod = NULL; + goto unmap; + } + + meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, + &meta_dma_addr, GFP_KERNEL); + if (!meta_mem) { + status = -ENOMEM; + goto unmap; + } + + if (io.opcode & 1) { + int meta_offset = 0; + + for (i = 0; i < meta_iod->nents; i++) { + meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + + meta_iod->sg[i].offset; + memcpy(meta_mem + meta_offset, meta, + meta_iod->sg[i].length); + kunmap_atomic(meta); + meta_offset += meta_iod->sg[i].length; + } + } + + c.rw.metadata = cpu_to_le64(meta_dma_addr); + } + length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); nvmeq = get_nvmeq(dev); @@ -1196,8 +1355,33 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + if (meta_len) { + if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { + int meta_offset = 0; + + for (i = 0; i < meta_iod->nents; i++) { + meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + + meta_iod->sg[i].offset; + memcpy(meta, meta_mem + meta_offset, + meta_iod->sg[i].length); + kunmap_atomic(meta); + meta_offset += meta_iod->sg[i].length; + } + } + + dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem, + meta_dma_addr); + } + + unmap: nvme_unmap_user_pages(dev, io.opcode & 1, iod); nvme_free_iod(dev, iod); + + if (meta_iod) { + nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod); + nvme_free_iod(dev, meta_iod); + } + return status; } @@ -1208,6 +1392,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); + unsigned timeout; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1237,10 +1422,13 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, GFP_KERNEL); } + timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : + ADMIN_TIMEOUT; if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_admin_cmd(dev, &c, &cmd.result); + status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result, + timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1266,6 +1454,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_user_admin_cmd(ns->dev, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); default: return -ENOTTY; } @@ -1282,13 +1474,17 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + + if (bio_list_empty(&nvmeq->sq_cong)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); bio_list_add_head(&nvmeq->sq_cong, bio); break; } - if (bio_list_empty(&nvmeq->sq_cong)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); } } @@ -1297,7 +1493,7 @@ static int nvme_kthread(void *data) struct nvme_dev *dev; while (!kthread_should_stop()) { - __set_current_state(TASK_RUNNING); + set_current_state(TASK_INTERRUPTIBLE); spin_lock(&dev_list_lock); list_for_each_entry(dev, &dev_list, node) { int i; @@ -1314,8 +1510,7 @@ static int nvme_kthread(void *data) } } spin_unlock(&dev_list_lock); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(round_jiffies_relative(HZ)); } return 0; } @@ -1347,6 +1542,16 @@ static void nvme_put_ns_idx(int index) spin_unlock(&dev_list_lock); } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { @@ -1366,7 +1571,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); -/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ blk_queue_make_request(ns->queue, nvme_make_request); ns->dev = dev; ns->queue->queuedata = ns; @@ -1378,6 +1582,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->disk = disk; lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); @@ -1392,6 +1597,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + return ns; out_free_queue: @@ -1496,14 +1704,21 @@ static void nvme_free_queues(struct nvme_dev *dev) nvme_free_queue(dev, i); } +/* + * Return: error value if an error occurred setting up the queues or calling + * Identify Device. 0 if these succeeded, even if adding some of the + * namespaces failed. At the moment, these failures are silent. TBD which + * failures should be reported. + */ static int nvme_dev_add(struct nvme_dev *dev) { int res, nn, i; - struct nvme_ns *ns, *next; + struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; struct nvme_id_ns *id_ns; void *mem; dma_addr_t dma_addr; + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; res = nvme_setup_io_queues(dev); if (res) @@ -1511,22 +1726,26 @@ static int nvme_dev_add(struct nvme_dev *dev) mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, GFP_KERNEL); + if (!mem) + return -ENOMEM; res = nvme_identify(dev, 0, 1, dma_addr); if (res) { res = -EIO; - goto out_free; + goto out; } ctrl = mem; nn = le32_to_cpup(&ctrl->nn); + dev->oncs = le16_to_cpup(&ctrl->oncs); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) { - int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - } + if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) && + (dev->pci_dev->device == 0x0953) && ctrl->vs[3]) + dev->stripe_size = 1 << (ctrl->vs[3] + shift); id_ns = mem; for (i = 1; i <= nn; i++) { @@ -1548,14 +1767,7 @@ static int nvme_dev_add(struct nvme_dev *dev) } list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); - - goto out; - - out_free: - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - nvme_ns_free(ns); - } + res = 0; out: dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); @@ -1634,6 +1846,56 @@ static void nvme_release_instance(struct nvme_dev *dev) spin_unlock(&dev_list_lock); } +static void nvme_free_dev(struct kref *kref) +{ + struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + nvme_dev_remove(dev); + pci_disable_msix(dev->pci_dev); + iounmap(dev->bar); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); + pci_disable_device(dev->pci_dev); + pci_release_regions(dev->pci_dev); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +static int nvme_dev_open(struct inode *inode, struct file *f) +{ + struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev, + miscdev); + kref_get(&dev->kref); + f->private_data = dev; + return 0; +} + +static int nvme_dev_release(struct inode *inode, struct file *f) +{ + struct nvme_dev *dev = f->private_data; + kref_put(&dev->kref, nvme_free_dev); + return 0; +} + +static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct nvme_dev *dev = f->private_data; + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_admin_cmd(dev, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int bars, result = -ENOMEM; @@ -1692,8 +1954,20 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto delete; + scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); + dev->miscdev.minor = MISC_DYNAMIC_MINOR; + dev->miscdev.parent = &pdev->dev; + dev->miscdev.name = dev->name; + dev->miscdev.fops = &nvme_dev_fops; + result = misc_register(&dev->miscdev); + if (result) + goto remove; + + kref_init(&dev->kref); return 0; + remove: + nvme_dev_remove(dev); delete: spin_lock(&dev_list_lock); list_del(&dev->node); @@ -1719,16 +1993,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_remove(dev); - pci_disable_msix(pdev); - iounmap(dev->bar); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); - pci_disable_device(pdev); - pci_release_regions(pdev); - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); + misc_deregister(&dev->miscdev); + kref_put(&dev->kref, nvme_free_dev); } /* These functions are yet to be implemented */ diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c new file mode 100644 index 00000000000..fed54b03989 --- /dev/null +++ b/drivers/block/nvme-scsi.c @@ -0,0 +1,3053 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * Refer to the SCSI-NVMe Translation spec for details on how + * each command is translated. + */ + +#include <linux/nvme.h> +#include <linux/bio.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kdev_t.h> + |