From 33dd2afe36586725a9456d2d00f6757b11415b63 Mon Sep 17 00:00:00 2001 From: Rajalaxmi Angadi Date: Thu, 23 Feb 2017 13:44:26 -0800 Subject: [PATCH] add files for nvme Signed-off-by: Rajalaxmi Angadi Signed-off-by: Jay Sternberg --- drivers/nvme/Kconfig | 2 + drivers/nvme/Makefile | 3 + drivers/nvme/host/Kconfig | 45 + drivers/nvme/host/Makefile | 14 + drivers/nvme/host/core.c | 2116 ++++++++++++++++++++++++ drivers/nvme/host/fabrics.c | 961 +++++++++++ drivers/nvme/host/fabrics.h | 132 ++ drivers/nvme/host/lightnvm.c | 629 +++++++ drivers/nvme/host/nvme.h | 329 ++++ drivers/nvme/host/pci.c | 2166 ++++++++++++++++++++++++ drivers/nvme/host/rdma.c | 2033 +++++++++++++++++++++++ drivers/nvme/host/scsi.c | 2574 +++++++++++++++++++++++++++++ drivers/nvme/target/Kconfig | 36 + drivers/nvme/target/Makefile | 9 + drivers/nvme/target/admin-cmd.c | 461 ++++++ drivers/nvme/target/configfs.c | 917 ++++++++++ drivers/nvme/target/core.c | 968 +++++++++++ drivers/nvme/target/discovery.c | 221 +++ drivers/nvme/target/fabrics-cmd.c | 240 +++ drivers/nvme/target/io-cmd.c | 215 +++ drivers/nvme/target/loop.c | 752 +++++++++ drivers/nvme/target/nvmet.h | 332 ++++ drivers/nvme/target/rdma.c | 1497 +++++++++++++++++ include/linux/nvme-rdma.h | 71 + include/linux/nvme.h | 965 +++++++++++ include/uapi/linux/nvme_ioctl.h | 66 + 26 files changed, 17754 insertions(+) create mode 100644 drivers/nvme/Kconfig create mode 100644 drivers/nvme/Makefile create mode 100644 drivers/nvme/host/Kconfig create mode 100644 drivers/nvme/host/Makefile create mode 100644 drivers/nvme/host/core.c create mode 100644 drivers/nvme/host/fabrics.c create mode 100644 drivers/nvme/host/fabrics.h create mode 100644 drivers/nvme/host/lightnvm.c create mode 100644 drivers/nvme/host/nvme.h create mode 100644 drivers/nvme/host/pci.c create mode 100644 drivers/nvme/host/rdma.c create mode 100644 drivers/nvme/host/scsi.c create mode 100644 drivers/nvme/target/Kconfig create mode 100644 drivers/nvme/target/Makefile create mode 100644 drivers/nvme/target/admin-cmd.c create mode 100644 drivers/nvme/target/configfs.c create mode 100644 drivers/nvme/target/core.c create mode 100644 drivers/nvme/target/discovery.c create mode 100644 drivers/nvme/target/fabrics-cmd.c create mode 100644 drivers/nvme/target/io-cmd.c create mode 100644 drivers/nvme/target/loop.c create mode 100644 drivers/nvme/target/nvmet.h create mode 100644 drivers/nvme/target/rdma.c create mode 100644 include/linux/nvme-rdma.h create mode 100644 include/linux/nvme.h create mode 100644 include/uapi/linux/nvme_ioctl.h diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig new file mode 100644 index 0000000..b7c78a5 --- /dev/null +++ b/drivers/nvme/Kconfig @@ -0,0 +1,2 @@ +source "drivers/nvme/host/Kconfig" +source "drivers/nvme/target/Kconfig" diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile new file mode 100644 index 0000000..0096a7f --- /dev/null +++ b/drivers/nvme/Makefile @@ -0,0 +1,3 @@ + +obj-y += host/ +obj-y += target/ diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig new file mode 100644 index 0000000..f7d37a6 --- /dev/null +++ b/drivers/nvme/host/Kconfig @@ -0,0 +1,45 @@ +config NVME_CORE + tristate + +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI && BLOCK + select NVME_CORE + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + +config BLK_DEV_NVME_SCSI + bool "SCSI emulation for NVMe device nodes" + depends on NVME_CORE + ---help--- + This adds support for the SG_IO ioctl on the NVMe character + and block devices nodes, as well as a translation for a small + number of selected SCSI commands to NVMe commands to the NVMe + driver. If you don't know what this means you probably want + to say N here, unless you run a distro that abuses the SCSI + emulation to provide stable device names for mount by id, like + some OpenSuSE and SLES versions. + +config NVME_FABRICS + tristate + +config NVME_RDMA + tristate "NVM Express over Fabrics RDMA host driver" + depends on INFINIBAND && BLOCK + select NVME_CORE + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the RDMA (Infiniband, RoCE, iWarp) transport. This allows you + to use remote block devices exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile new file mode 100644 index 0000000..47abcec --- /dev/null +++ b/drivers/nvme/host/Makefile @@ -0,0 +1,14 @@ +obj-$(CONFIG_NVME_CORE) += nvme-core.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o +obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o + +nvme-core-y := core.o +nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o +nvme-core-$(CONFIG_NVM) += lightnvm.o + +nvme-y += pci.o + +nvme-fabrics-y += fabrics.o + +nvme-rdma-y += rdma.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 0000000..2feacc7 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,2116 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + +#define NVME_MINORS (1U << MINORBITS) + +unsigned char admin_timeout = 60; +module_param(admin_timeout, byte, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); +EXPORT_SYMBOL_GPL(admin_timeout); + +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +EXPORT_SYMBOL_GPL(nvme_io_timeout); + +unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + +unsigned int nvme_max_retries = 5; +module_param_named(max_retries, nvme_max_retries, uint, 0644); +MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); +EXPORT_SYMBOL_GPL(nvme_max_retries); + +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + +static LIST_HEAD(nvme_ctrl_list); +static DEFINE_SPINLOCK(dev_list_lock); + +static struct class *nvme_class; + +void nvme_cancel_request(struct request *req, void *data, bool reserved) +{ + int status; + + if (!blk_mq_request_started(req)) + return; + + dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, + "Cancelling I/O %d", req->tag); + + status = NVME_SC_ABORT_REQ; + if (blk_queue_dying(req->q)) + status |= NVME_SC_DNR; + blk_mq_complete_request(req, status); +} +EXPORT_SYMBOL_GPL(nvme_cancel_request); + +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state) +{ + enum nvme_ctrl_state old_state; + bool changed = false; + + spin_lock_irq(&ctrl->lock); + + old_state = ctrl->state; + switch (new_state) { + case NVME_CTRL_LIVE: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RESETTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RECONNECTING: + switch (old_state) { + case NVME_CTRL_LIVE: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DELETING: + switch (old_state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DEAD: + switch (old_state) { + case NVME_CTRL_DELETING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + default: + break; + } + + if (changed) + ctrl->state = new_state; + + spin_unlock_irq(&ctrl->lock); + + return changed; +} +EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + if (ns->type == NVME_NS_LIGHTNVM) + nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + put_disk(ns->disk); + ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ctrl(ns->ctrl); + kfree(ns); +} + +static void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} + +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) +{ + struct nvme_ns *ns; + + spin_lock(&dev_list_lock); + ns = disk->private_data; + if (ns) { + if (!kref_get_unless_zero(&ns->kref)) + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + } + spin_unlock(&dev_list_lock); + + return ns; + +fail_put_ns: + kref_put(&ns->kref, nvme_free_ns); +fail: + spin_unlock(&dev_list_lock); + return NULL; +} + +void nvme_requeue_req(struct request *req) +{ + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(nvme_requeue_req); + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags, int qid) +{ + struct request *req; + + if (qid == NVME_QID_ANY) { + req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags); + } else { + req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags, + qid ? qid - 1 : 0); + } + if (IS_ERR(req)) + return req; + + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + + return req; +} +EXPORT_SYMBOL_GPL(nvme_alloc_request); + +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + struct nvme_dsm_range *range; + struct page *page; + int offset; + unsigned int nr_bytes = blk_rq_bytes(req); + + range = kmalloc(sizeof(*range), GFP_ATOMIC); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + req->completion_data = range; + page = virt_to_page(range); + offset = offset_in_page(range); + blk_add_request_payload(req, page, offset, sizeof(*range)); + + /* + * we set __data_len back to the size of the area to be discarded + * on disk. This allows us to report completion on the full amount + * of blocks described by the request. + */ + req->__data_len = nr_bytes; + + return 0; +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + +int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd) +{ + int ret = 0; + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(cmd, req->cmd, sizeof(*cmd)); + else if (req_op(req) == REQ_OP_FLUSH) + nvme_setup_flush(ns, cmd); + else if (req_op(req) == REQ_OP_DISCARD) + ret = nvme_setup_discard(ns, req, cmd); + else + nvme_setup_rw(ns, req, cmd); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_setup_cmd); + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + struct nvme_completion *cqe, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, int flags) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, flags, qid); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + req->special = cqe; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } + + blk_execute_rq(req->q, NULL, req, at_head); + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} +EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ANY, 0, 0); +} +EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); + +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout) +{ + bool write = nvme_is_write(cmd); + struct nvme_completion cqe; + struct nvme_ns *ns = q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + req->special = &cqe; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + + if (!disk) + goto submit; + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto out_unmap; + } + + if (meta_buffer && meta_len) { + struct bio_integrity_payload *bip; + + meta = kmalloc(meta_len, GFP_KERNEL); + if (!meta) { + ret = -ENOMEM; + goto out_unmap; + } + + if (write) { + if (copy_from_user(meta, meta_buffer, + meta_len)) { + ret = -EFAULT; + goto out_free_meta; + } + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = meta_len; + bip->bip_iter.bi_sector = meta_seed; + + ret = bio_integrity_add_page(bio, virt_to_page(meta), + meta_len, offset_in_page(meta)); + if (ret != meta_len) { + ret = -ENOMEM; + goto out_free_meta; + } + } + } + submit: + blk_execute_rq(req->q, disk, req, 0); + ret = req->errors; + if (result) + *result = le32_to_cpu(cqe.result); + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + out_free_meta: + kfree(meta); + out_unmap: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout) +{ + return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, + result, timeout); +} + +static void nvme_keep_alive_end_io(struct request *rq, int error) +{ + struct nvme_ctrl *ctrl = rq->end_io_data; + + blk_mq_free_request(rq); + + if (error) { + dev_err(ctrl->device, + "failed nvme_keep_alive_end_io error=%d\n", error); + return; + } + + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static int nvme_keep_alive(struct nvme_ctrl *ctrl) +{ + struct nvme_command c; + struct request *rq; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_keep_alive; + + rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED, + NVME_QID_ANY); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + rq->timeout = ctrl->kato * HZ; + rq->end_io_data = ctrl; + + blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); + + return 0; +} + +static void nvme_keep_alive_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, ka_work); + + if (nvme_keep_alive(ctrl)) { + /* allocation failure, reset the controller */ + dev_err(ctrl->device, "keep-alive failed\n"); + ctrl->ops->reset_ctrl(ctrl); + return; + } +} + +void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} +EXPORT_SYMBOL_GPL(nvme_start_keep_alive); + +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + cancel_delayed_work_sync(&ctrl->ka_work); +} +EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); + +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(1); + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(2); + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); +} + +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; +} + +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + struct nvme_completion cqe; + int ret; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.dptr.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (ret >= 0 && result) + *result = le32_to_cpu(cqe.result); + return ret; +} + +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + struct nvme_completion cqe; + int ret; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.dptr.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (ret >= 0 && result) + *result = le32_to_cpu(cqe.result); + return ret; +} + +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; +} + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status < 0) + return status; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be only way to fix them up. + */ + if (status > 0) { + dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); + *count = 0; + } else { + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_set_queue_count); + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + if (io.flags) + return -EINVAL; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(uintptr_t)io.metadata; + + if (ns->ext) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return __nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); +#ifdef CONFIG_BLK_DEV_NVME_SCSI + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); +#endif + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SG_IO: + return -ENOIOCTLCMD; + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + struct nvme_ns *ns = disk->private_data; + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); +} + +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + memset(&integrity, 0, sizeof(integrity)); + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + integrity.tag_size = sizeof(u16) + sizeof(u32); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + integrity.tag_size = sizeof(u16); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} +#else +static void nvme_init_integrity(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_config_discard(struct nvme_ns *ns) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + u32 logical_block_size = queue_logical_block_size(ns->queue); + + if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES) + ns->queue->limits.discard_zeroes_data = 1; + else + ns->queue->limits.discard_zeroes_data = 0; + + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + blk_queue_max_discard_sectors(ns->queue, UINT_MAX); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } + if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", + __func__); + return -ENODEV; + } + if (id->ncap == 0) { + kfree(id); + return -ENODEV; + } + + if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { + if (nvme_nvm_register(ns->queue, disk->disk_name)) { + dev_warn(disk_to_dev(ns->disk), + "%s: LightNVM init failure\n", __func__); + kfree(id); + return -ENODEV; + } + ns->type = NVME_NS_LIGHTNVM; + } + + if (ns->ctrl->vs >= NVME_VS(1, 1)) + memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + if (ns->ctrl->vs >= NVME_VS(1, 2)) + memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? + id->dps & NVME_NS_DPS_PI_MASK : 0; + + blk_mq_freeze_queue(disk->queue); + if (blk_get_integrity(disk) && (ns->pi_type != pi_type || + ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + nvme_init_integrity(ns); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + blk_mq_unfreeze_queue(disk->queue); + + kfree(id); + return 0; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_command c; + u8 data[16] = { 0, }; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + memset(&c, 0, sizeof(c)); + c.common.opcode = op; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw10[0] = cpu_to_le32(cdw10); + + return nvme_submit_sync_cmd(ns->queue, &c, data, 16); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +static const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, + .pr_ops = &nvme_pr_ops, +}; + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_RDY) == bit) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->device, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + /* Checking for ctrl->tagset is a trick to avoid sleeping on module + * load, since we only need the quirk on reset_controller. Notice + * that the HGST device needs this delay only in firmware activation + * procedure; unfortunately we have no (easy) way to verify this. + */ + if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset) + msleep(NVME_QUIRK_DELAY_AMOUNT); + + return nvme_wait_ready(ctrl, cap, false); +} +EXPORT_SYMBOL_GPL(nvme_disable_ctrl); + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->device, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} +EXPORT_SYMBOL_GPL(nvme_enable_ctrl); + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->device, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); + +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + bool vwc = false; + + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); + blk_queue_virt_boundary(q, ctrl->page_size - 1); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + u32 max_hw_sectors; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); + return ret; + } + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + if (ctrl->vs >= NVME_VS(1, 1)) + ctrl->subsystem = NVME_CAP_NSSRC(cap); + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + ctrl->vid = le16_to_cpu(id->vid); + ctrl->oncs = le16_to_cpup(&id->oncs); + atomic_set(&ctrl->abort_limit, id->acl + 1); + ctrl->vwc = id->vwc; + ctrl->cntlid = le16_to_cpup(&id->cntlid); + memcpy(ctrl->serial, id->sn, sizeof(id->sn)); + memcpy(ctrl->model, id->mn, sizeof(id->mn)); + memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); + if (id->mdts) + max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + max_hw_sectors = UINT_MAX; + ctrl->max_hw_sectors = + min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { + unsigned int max_hw_sectors; + + ctrl->stripe_size = 1 << (id->vs[3] + page_shift); + max_hw_sectors = ctrl->stripe_size >> (page_shift - 9); + if (ctrl->max_hw_sectors) { + ctrl->max_hw_sectors = min(max_hw_sectors, + ctrl->max_hw_sectors); + } else { + ctrl->max_hw_sectors = max_hw_sectors; + } + } + + nvme_set_queue_limits(ctrl, ctrl->admin_q); + ctrl->sgls = le32_to_cpu(id->sgls); + ctrl->kas = le16_to_cpu(id->kas); + + if (ctrl->ops->is_fabrics) { + ctrl->icdoff = le16_to_cpu(id->icdoff); + ctrl->ioccsz = le32_to_cpu(id->ioccsz); + ctrl->iorcsz = le32_to_cpu(id->iorcsz); + ctrl->maxcmd = le16_to_cpu(id->maxcmd); + + /* + * In fabrics we need to verify the cntlid matches the + * admin connect + */ + if (ctrl->cntlid != le16_to_cpu(id->cntlid)) + ret = -EINVAL; + + if (!ctrl->opts->discovery_nqn && !ctrl->kas) { + dev_err(ctrl->dev, + "keep-alive support is mandatory for fabrics\n"); + ret = -EINVAL; + } + } else { + ctrl->cntlid = le16_to_cpu(id->cntlid); + } + + kfree(id); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_init_identify); + +static int nvme_dev_open(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(ctrl, &nvme_ctrl_list, node) { + if (ctrl->instance != instance) + continue; + + if (!ctrl->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&ctrl->kref)) + break; + file->private_data = ctrl; + ret = 0; + break; + } + spin_unlock(&dev_list_lock); + + return ret; +} + +static int nvme_dev_release(struct inode *inode, struct file *file) +{ + nvme_put_ctrl(file->private_data); + return 0; +} + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + mutex_lock(&ctrl->namespaces_mutex); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->device, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->device, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + mutex_unlock(&ctrl->namespaces_mutex); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + mutex_unlock(&ctrl->namespaces_mutex); + return ret; +} + +static long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->device, "resetting controller\n"); + return ctrl->ops->reset_ctrl(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; + default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = ctrl->ops->reset_ctrl(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t nvme_sysfs_rescan(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + nvme_queue_scan(ctrl); + return count; +} +static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); + +static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + int serial_len = sizeof(ctrl->serial); + int model_len = sizeof(ctrl->model); + + if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return sprintf(buf, "eui.%16phN\n", ns->uuid); + + if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return sprintf(buf, "eui.%8phN\n", ns->eui); + + while (ctrl->serial[serial_len - 1] == ' ') + serial_len--; + while (ctrl->model[model_len - 1] == ' ') + model_len--; + + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, + serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); +} +static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%pU\n", ns->uuid); +} +static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%8phd\n", ns->eui); +} +static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%d\n", ns->ns_id); +} +static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); + +static struct attribute *nvme_ns_attrs[] = { + &dev_attr_wwid.attr, + &dev_attr_uuid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + NULL, +}; + +static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + + if (a == &dev_attr_uuid.attr) { + if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return 0; + } + return a->mode; +} + +static const struct attribute_group nvme_ns_attr_group = { + .attrs = nvme_ns_attrs, + .is_visible = nvme_ns_attrs_are_visible, +}; + +#define nvme_show_str_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +#define nvme_show_int_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%d\n", ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); +nvme_show_int_function(cntlid); + +static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (device_remove_file_self(dev, attr)) + ctrl->ops->delete_ctrl(ctrl); + return count; +} +static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + +static ssize_t nvme_sysfs_show_transport(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); +} +static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); + +static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", + ctrl->ops->get_subsysnqn(ctrl)); +} +static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); + +static ssize_t nvme_sysfs_show_address(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); +} +static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_rescan_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + &dev_attr_cntlid.attr, + &dev_attr_delete_controller.attr, + &dev_attr_transport.attr, + &dev_attr_subsysnqn.attr, + &dev_attr_address.attr, + NULL +}; + +#define CHECK_ATTR(ctrl, a, name) \ + if ((a) == &dev_attr_##name.attr && \ + !(ctrl)->ops->get_##name) \ + return 0 + +static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_delete_controller.attr) { + if (!ctrl->ops->delete_ctrl) + return 0; + } + + CHECK_ATTR(ctrl, a, subsysnqn); + CHECK_ATTR(ctrl, a, address); + + return a->mode; +} + +static struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, + .is_visible = nvme_dev_attrs_are_visible, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns, *ret = NULL; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->ns_id == nsid) { + kref_get(&ns->kref); + ret = ns; + break; + } + if (ns->ns_id > nsid) + break; + } + mutex_unlock(&ctrl->namespaces_mutex); + return ret; +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int node = dev_to_node(ctrl->dev); + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + return; + + ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); + if (ns->instance < 0) + goto out_free_ns; + + ns->queue = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ns->queue)) + goto out_release_instance; + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + ns->queue->queuedata = ns; + ns->ctrl = ctrl; + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_queue; + + kref_init(&ns->kref); + ns->ns_id = nsid; + ns->disk = disk; + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + nvme_set_queue_limits(ctrl, ns->queue); + + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); + + if (nvme_revalidate_disk(ns->disk)) + goto out_free_disk; + + mutex_lock(&ctrl->namespaces_mutex); + list_add_tail(&ns->list, &ctrl->namespaces); + mutex_unlock(&ctrl->namespaces_mutex); + + kref_get(&ctrl->kref); + if (ns->type == NVME_NS_LIGHTNVM) + return; + + device_add_disk(ctrl->device, ns->disk); + if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + ns->disk->disk_name); + return; + out_free_disk: + kfree(disk); + out_free_queue: + blk_cleanup_queue(ns->queue); + out_release_instance: + ida_simple_remove(&ctrl->ns_ida, ns->instance); + out_free_ns: + kfree(ns); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; + + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group); + del_gendisk(ns->disk); + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } + + mutex_lock(&ns->ctrl->namespaces_mutex); + list_del_init(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_mutex); + + nvme_put_ns(ns); +} + +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + ns = nvme_find_get_ns(ctrl, nsid); + if (ns) { + if (revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + nvme_put_ns(ns); + } else + nvme_alloc_ns(ctrl, nsid); +} + +static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, + unsigned nsid) +{ + struct nvme_ns *ns, *next; + + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->ns_id > nsid) + nvme_ns_remove(ns); + } +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns; + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(0x1000, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto free; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_validate_ns(ctrl, nsid); + + while (++prev < nsid) { + ns = nvme_find_get_ns(ctrl, prev); + if (ns) { + nvme_ns_remove(ns); + nvme_put_ns(ns); + } + } + } + nn -= j; + } + out: + nvme_remove_invalid_namespaces(ctrl, prev); + free: + kfree(ns_list); + return ret; +} + +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) +{ + unsigned i; + + for (i = 1; i <= nn; i++) + nvme_validate_ns(ctrl, i); + + nvme_remove_invalid_namespaces(ctrl, nn); +} + +static void nvme_scan_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, scan_work); + struct nvme_id_ctrl *id; + unsigned nn; + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto done; + } + nvme_scan_ns_sequential(ctrl, nn); + done: + mutex_lock(&ctrl->namespaces_mutex); + list_sort(NULL, &ctrl->namespaces, ns_cmp); + mutex_unlock(&ctrl->namespaces_mutex); + kfree(id); + + if (ctrl->ops->post_scan) + ctrl->ops->post_scan(ctrl); +} + +void nvme_queue_scan(struct nvme_ctrl *ctrl) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (ctrl->state == NVME_CTRL_LIVE) + schedule_work(&ctrl->scan_work); +} +EXPORT_SYMBOL_GPL(nvme_queue_scan); + +/* + * This function iterates the namespace list unlocked to allow recovery from + * controller failure. It is up to the caller to ensure the namespace list is + * not modified by scan work while this function is executing. + */ +void nvme_remove_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + + /* + * The dead states indicates the controller was not gracefully + * disconnected. In that case, we won't be able to flush any data while + * removing the namespaces' disks; fail all the queues now to avoid + * potentially having to clean up the failed sync later. + */ + if (ctrl->state == NVME_CTRL_DEAD) + nvme_kill_queues(ctrl); + + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) + nvme_ns_remove(ns); +} +EXPORT_SYMBOL_GPL(nvme_remove_namespaces); + +static void nvme_async_event_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, async_event_work); + + spin_lock_irq(&ctrl->lock); + while (ctrl->event_limit > 0) { + int aer_idx = --ctrl->event_limit; + + spin_unlock_irq(&ctrl->lock); + ctrl->ops->submit_async_event(ctrl, aer_idx); + spin_lock_irq(&ctrl->lock); + } + spin_unlock_irq(&ctrl->lock); +} + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, + struct nvme_completion *cqe) +{ + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { + ++ctrl->event_limit; + schedule_work(&ctrl->async_event_work); + } + + if (status != NVME_SC_SUCCESS) + return; + + switch (result & 0xff07) { + case NVME_AER_NOTICE_NS_CHANGED: + dev_info(ctrl->device, "rescanning\n"); + nvme_queue_scan(ctrl); + break; + default: + dev_warn(ctrl->device, "async event result %08x\n", result); + } +} +EXPORT_SYMBOL_GPL(nvme_complete_async_event); + +void nvme_queue_async_events(struct nvme_ctrl *ctrl) +{ + ctrl->event_limit = NVME_NR_AERS; + schedule_work(&ctrl->async_event_work); +} +EXPORT_SYMBOL_GPL(nvme_queue_async_events); + +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_ctrl *ctrl) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + ctrl->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_ctrl *ctrl) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, ctrl->instance); + spin_unlock(&dev_list_lock); +} + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +{ + flush_work(&ctrl->async_event_work); + flush_work(&ctrl->scan_work); + nvme_remove_namespaces(ctrl); + + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + + spin_lock(&dev_list_lock); + list_del(&ctrl->node); + spin_unlock(&dev_list_lock); +} +EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + + put_device(ctrl->device); + nvme_release_instance(ctrl); + ida_destroy(&ctrl->ns_ida); + + ctrl->ops->free_ctrl(ctrl); +} + +void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + kref_put(&ctrl->kref, nvme_free_ctrl); +} +EXPORT_SYMBOL_GPL(nvme_put_ctrl); + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. + */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + ctrl->state = NVME_CTRL_NEW; + spin_lock_init(&ctrl->lock); + INIT_LIST_HEAD(&ctrl->namespaces); + mutex_init(&ctrl->namespaces_mutex); + kref_init(&ctrl->kref); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + INIT_WORK(&ctrl->scan_work, nvme_scan_work); + INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); + + ret = nvme_set_instance(ctrl); + if (ret) + goto out; + + ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, + MKDEV(nvme_char_major, ctrl->instance), + ctrl, nvme_dev_attr_groups, + "nvme%d", ctrl->instance); + if (IS_ERR(ctrl->device)) { + ret = PTR_ERR(ctrl->device); + goto out_release_instance; + } + get_device(ctrl->device); + ida_init(&ctrl->ns_ida); + + spin_lock(&dev_list_lock); + list_add_tail(&ctrl->node, &nvme_ctrl_list); + spin_unlock(&dev_list_lock); + + return 0; +out_release_instance: + nvme_release_instance(ctrl); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nvme_init_ctrl); + +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + /* + * Revalidating a dead namespace sets capacity to 0. This will + * end buffered writers dirtying pages that can't be synced. + */ + if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + revalidate_disk(ns->disk); + + blk_set_queue_dying(ns->queue); + blk_mq_abort_requeue_list(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + } + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_kill_queues); + +void nvme_stop_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + spin_lock_irq(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock_irq(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_stop_queues); + +void nvme_start_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_start_queues); + +int __init nvme_core_init(void) +{ + int result; + + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + return result; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + + return 0; + + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + return result; +} + +void nvme_core_exit(void) +{ + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +} + +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_core_init); +module_exit(nvme_core_exit); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c new file mode 100644 index 0000000..4eff491 --- /dev/null +++ b/drivers/nvme/host/fabrics.c @@ -0,0 +1,961 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include "nvme.h" +#include "fabrics.h" + +static LIST_HEAD(nvmf_transports); +static DEFINE_MUTEX(nvmf_transports_mutex); + +static LIST_HEAD(nvmf_hosts); +static DEFINE_MUTEX(nvmf_hosts_mutex); + +static struct nvmf_host *nvmf_default_host; + +static struct nvmf_host *__nvmf_host_find(const char *hostnqn) +{ + struct nvmf_host *host; + + list_for_each_entry(host, &nvmf_hosts, list) { + if (!strcmp(host->nqn, hostnqn)) + return host; + } + + return NULL; +} + +static struct nvmf_host *nvmf_host_add(const char *hostnqn) +{ + struct nvmf_host *host; + + mutex_lock(&nvmf_hosts_mutex); + host = __nvmf_host_find(hostnqn); + if (host) { + kref_get(&host->ref); + goto out_unlock; + } + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + goto out_unlock; + + kref_init(&host->ref); + memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + uuid_be_gen(&host->id); + + list_add_tail(&host->list, &nvmf_hosts); +out_unlock: + mutex_unlock(&nvmf_hosts_mutex); + return host; +} + +static struct nvmf_host *nvmf_host_default(void) +{ + struct nvmf_host *host; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return NULL; + + kref_init(&host->ref); + uuid_be_gen(&host->id); + snprintf(host->nqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); + + mutex_lock(&nvmf_hosts_mutex); + list_add_tail(&host->list, &nvmf_hosts); + mutex_unlock(&nvmf_hosts_mutex); + + return host; +} + +static void nvmf_host_destroy(struct kref *ref) +{ + struct nvmf_host *host = container_of(ref, struct nvmf_host, ref); + + mutex_lock(&nvmf_hosts_mutex); + list_del(&host->list); + mutex_unlock(&nvmf_hosts_mutex); + + kfree(host); +} + +static void nvmf_host_put(struct nvmf_host *host) +{ + if (host) + kref_put(&host->ref, nvmf_host_destroy); +} + +/** + * nvmf_get_address() - Get address/port + * @ctrl: Host NVMe controller instance which we got the address + * @buf: OUTPUT parameter that will contain the address/port + * @size: buffer size + */ +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", + ctrl->opts->traddr, ctrl->opts->trsvcid); +} +EXPORT_SYMBOL_GPL(nvmf_get_address); + +/** + * nvmf_get_subsysnqn() - Get subsystem NQN + * @ctrl: Host NVMe controller instance which we got the NQN + */ +const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->subsysnqn; +} +EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn); + +/** + * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 32-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(cqe.result64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read32); + +/** + * nvmf_reg_read64() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 64-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.attrib = 1; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(cqe.result64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read64); + +/** + * nvmf_reg_write32() - NVMe Fabrics "Property Write" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: Input parameter that contains the value to be + * written to the property. + * + * Used by the NVMe host system to write a 32-bit capsule property value + * to an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful write + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + struct nvme_command cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_set.opcode = nvme_fabrics_command; + cmd.prop_set.fctype = nvme_fabrics_type_property_set; + cmd.prop_set.attrib = 0; + cmd.prop_set.offset = cpu_to_le32(off); + cmd.prop_set.value = cpu_to_le64(val); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (unlikely(ret)) + dev_err(ctrl->device, + "Property Set error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_write32); + +/** + * nvmf_log_connect_error() - Error-parsing-diagnostic print + * out function for connect() errors. + * + * @ctrl: the specific /dev/nvmeX device that had the error. + * + * @errval: Error code to be decoded in a more human-friendly + * printout. + * + * @offset: For use with the NVMe error code NVME_SC_CONNECT_INVALID_PARAM. + * + * @cmd: This is the SQE portion of a submission capsule. + * + * @data: This is the "Data" portion of a submission capsule. + */ +static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, + int errval, int offset, struct nvme_command *cmd, + struct nvmf_connect_data *data) +{ + int err_sctype = errval & (~NVME_SC_DNR); + + switch (err_sctype) { + + case (NVME_SC_CONNECT_INVALID_PARAM): + if (offset >> 16) { + char *inv_data = "Connect Invalid Data Parameter"; + + switch (offset & 0xffff) { + case (offsetof(struct nvmf_connect_data, cntlid)): + dev_err(ctrl->device, + "%s, cntlid: %d\n", + inv_data, data->cntlid); + break; + case (offsetof(struct nvmf_connect_data, hostnqn)): + dev_err(ctrl->device, + "%s, hostnqn \"%s\"\n", + inv_data, data->hostnqn); + break; + case (offsetof(struct nvmf_connect_data, subsysnqn)): + dev_err(ctrl->device, + "%s, subsysnqn \"%s\"\n", + inv_data, data->subsysnqn); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_data, offset & 0xffff); + break; + } + } else { + char *inv_sqe = "Connect Invalid SQE Parameter"; + + switch (offset) { + case (offsetof(struct nvmf_connect_command, qid)): + dev_err(ctrl->device, + "%s, qid %d\n", + inv_sqe, cmd->connect.qid); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_sqe, offset); + } + } + break; + default: + dev_err(ctrl->device, + "Connect command failed, error wo/DNR bit: %d\n", + err_sctype); + break; + } /* switch (err_sctype) */ +} + +/** + * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to request + * a new NVMe controller allocation on the target + * system and establish an NVMe Admin connection to + * that controller. + * + * This function enables an NVMe host device to request a new allocation of + * an NVMe controller resource on a target system as well establish a + * fabrics-protocol connection of the NVMe Admin queue between the + * host system device and the allocated NVMe controller on the + * target system via a NVMe Fabrics "Connect" command. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + * + */ +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + struct nvmf_connect_data *data; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = 0; + + /* + * fabrics spec sets a minimum of depth 32 for admin queue, + * so set the queue with this depth always until + * justification otherwise. + */ + cmd.connect.sqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + + /* + * Set keep-alive timeout in seconds granularity (ms * 1000) + * and add a grace period for controller kato enforcement + */ + cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : + cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be)); + data->cntlid = cpu_to_le16(0xffff); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, + data, sizeof(*data), 0, NVME_QID_ANY, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result), + &cmd, data); + goto out_free_data; + } + + ctrl->cntlid = le16_to_cpu(cqe.result16); + +out_free_data: + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); + +/** + * nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to establish an + * NVMe I/O queue connection to the already allocated NVMe + * controller on the target system. + * @qid: NVMe I/O queue number for the new I/O connection between + * host and target (note qid == 0 is illegal as this is + * the Admin queue, per NVMe standard). + * + * This function issues a fabrics-protocol connection + * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) + * between the host system device and the allocated NVMe controller + * on the target system. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_command cmd; + struct nvmf_connect_data *data; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = cpu_to_le16(qid); + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be)); + data->cntlid = cpu_to_le16(ctrl->cntlid); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &cqe, + data, sizeof(*data), 0, qid, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result), + &cmd, data); + } + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); + +/** + * nvmf_register_transport() - NVMe Fabrics Library registration function. + * @ops: Transport ops instance to be registered to the + * common fabrics library. + * + * API function that registers the type of specific transport fabric + * being implemented to the common NVMe fabrics library. Part of + * the overall init sequence of starting up a fabrics driver. + */ +void nvmf_register_transport(struct nvmf_transport_ops *ops) +{ + mutex_lock(&nvmf_transports_mutex); + list_add_tail(&ops->entry, &nvmf_transports); + mutex_unlock(&nvmf_transports_mutex); +} +EXPORT_SYMBOL_GPL(nvmf_register_transport); + +/** + * nvmf_unregister_transport() - NVMe Fabrics Library unregistration function. + * @ops: Transport ops instance to be unregistered from the + * common fabrics library. + * + * Fabrics API function that unregisters the type of specific transport + * fabric being implemented from the common NVMe fabrics library. + * Part of the overall exit sequence of unloading the implemented driver. + */ +void nvmf_unregister_transport(struct nvmf_transport_ops *ops) +{ + mutex_lock(&nvmf_transports_mutex); + list_del(&ops->entry); + mutex_unlock(&nvmf_transports_mutex); +} +EXPORT_SYMBOL_GPL(nvmf_unregister_transport); + +static struct nvmf_transport_ops *nvmf_lookup_transport( + struct nvmf_ctrl_options *opts) +{ + struct nvmf_transport_ops *ops; + + lockdep_assert_held(&nvmf_transports_mutex); + + list_for_each_entry(ops, &nvmf_transports, entry) { + if (strcmp(ops->name, opts->transport) == 0) + return ops; + } + + return NULL; +} + +static const match_table_t opt_tokens = { + { NVMF_OPT_TRANSPORT, "transport=%s" }, + { NVMF_OPT_TRADDR, "traddr=%s" }, + { NVMF_OPT_TRSVCID, "trsvcid=%s" }, + { NVMF_OPT_NQN, "nqn=%s" }, + { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" }, + { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" }, + { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, + { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, + { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_ERR, NULL } +}; + +static int nvmf_parse_options(struct nvmf_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + size_t nqnlen = 0; + + /* Set defaults */ + opts->queue_size = NVMF_DEF_QUEUE_SIZE; + opts->nr_io_queues = num_online_cpus(); + opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_TRANSPORT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->transport = p; + break; + case NVMF_OPT_NQN: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->subsysnqn = p; + nqnlen = strlen(opts->subsysnqn); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + opts->subsysnqn, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->discovery_nqn = + !(strcmp(opts->subsysnqn, + NVME_DISC_SUBSYS_NAME)); + if (opts->discovery_nqn) + opts->nr_io_queues = 0; + break; + case NVMF_OPT_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->traddr = p; + break; + case NVMF_OPT_TRSVCID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->trsvcid = p; + break; + case NVMF_OPT_QUEUE_SIZE: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < NVMF_MIN_QUEUE_SIZE || + token > NVMF_MAX_QUEUE_SIZE) { + pr_err("Invalid queue_size %d\n", token); + ret = -EINVAL; + goto out; + } + opts->queue_size = token; + break; + case NVMF_OPT_NR_IO_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid number of IOQs %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_io_queues = min_t(unsigned int, + num_online_cpus(), token); + break; + case NVMF_OPT_KATO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (opts->discovery_nqn) { + pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n"); + ret = -EINVAL; + goto out; + } + + if (token < 0) { + pr_err("Invalid keep_alive_tmo %d\n", token); + ret = -EINVAL; + goto out; + } else if (token == 0) { + /* Allowed for debug */ + pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); + } + opts->kato = token; + break; + case NVMF_OPT_HOSTNQN: + if (opts->host) { + pr_err("hostnqn already user-assigned: %s\n", + opts->host->nqn); + ret = -EADDRINUSE; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + nqnlen = strlen(p); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + p, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->host = nvmf_host_add(p); + if (!opts->host) { + ret = -ENOMEM; + goto out; + } + break; + case NVMF_OPT_RECONNECT_DELAY: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid reconnect_delay %d\n", token); + ret = -EINVAL; + goto out; + } + opts->reconnect_delay = token; + break; + default: + pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", + p); + ret = -EINVAL; + goto out; + } + } + + if (!opts->host) { + kref_get(&nvmf_default_host->ref); + opts->host = nvmf_default_host; + } + +out: + if (!opts->discovery_nqn && !opts->kato) + opts->kato = NVME_DEFAULT_KATO; + kfree(options); + return ret; +} + +static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, + unsigned int required_opts) +{ + if ((opts->mask & required_opts) != required_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & required_opts) && + !(opt_tokens[i].token & opts->mask)) { + pr_warn("missing parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, + unsigned int allowed_opts) +{ + if (opts->mask & ~allowed_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if (opt_tokens[i].token & ~allowed_opts) { + pr_warn("invalid parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +void nvmf_free_options(struct nvmf_ctrl_options *opts) +{ + nvmf_host_put(opts->host); + kfree(opts->transport); + kfree(opts->traddr); + kfree(opts->trsvcid); + kfree(opts->subsysnqn); + kfree(opts); +} +EXPORT_SYMBOL_GPL(nvmf_free_options); + +#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) +#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN) + +static struct nvme_ctrl * +nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) +{ + struct nvmf_ctrl_options *opts; + struct nvmf_transport_ops *ops; + struct nvme_ctrl *ctrl; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return ERR_PTR(-ENOMEM); + + ret = nvmf_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* + * Check the generic options first as we need a valid transport for + * the lookup below. Then clear the generic flags so that transport + * drivers don't have to care about them. + */ + ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS); + if (ret) + goto out_free_opts; + opts->mask &= ~NVMF_REQUIRED_OPTS; + + mutex_lock(&nvmf_transports_mutex); + ops = nvmf_lookup_transport(opts); + if (!ops) { + pr_info("no handler found for transport %s.\n", + opts->transport); + ret = -EINVAL; + goto out_unlock; + } + + ret = nvmf_check_required_opts(opts, ops->required_opts); + if (ret) + goto out_unlock; + ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | + ops->allowed_opts | ops->required_opts); + if (ret) + goto out_unlock; + + ctrl = ops->create_ctrl(dev, opts); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + mutex_unlock(&nvmf_transports_mutex); + return ctrl; + +out_unlock: + mutex_unlock(&nvmf_transports_mutex); +out_free_opts: + nvmf_host_put(opts->host); + kfree(opts); + return ERR_PTR(ret); +} + +static struct class *nvmf_class; +static struct device *nvmf_device; +static DEFINE_MUTEX(nvmf_dev_mutex); + +static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *pos) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl; + const char *buf; + int ret = 0; + + if (count > PAGE_SIZE) + return -ENOMEM; + + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + mutex_lock(&nvmf_dev_mutex); + if (seq_file->private) { + ret = -EINVAL; + goto out_unlock; + } + + ctrl = nvmf_create_ctrl(nvmf_device, buf, count); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + seq_file->private = ctrl; + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + kfree(buf); + return ret ? ret : count; +} + +static int nvmf_dev_show(struct seq_file *seq_file, void *private) +{ + struct nvme_ctrl *ctrl; + int ret = 0; + + mutex_lock(&nvmf_dev_mutex); + ctrl = seq_file->private; + if (!ctrl) { + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(seq_file, "instance=%d,cntlid=%d\n", + ctrl->instance, ctrl->cntlid); + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + return ret; +} + +static int nvmf_dev_open(struct inode *inode, struct file *file) +{ + /* + * The miscdevice code initializes file->private_data, but doesn't + * make use of it later. + */ + file->private_data = NULL; + return single_open(file, nvmf_dev_show, NULL); +} + +static int nvmf_dev_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl = seq_file->private; + + if (ctrl) + nvme_put_ctrl(ctrl); + return single_release(inode, file); +} + +static const struct file_operations nvmf_dev_fops = { + .owner = THIS_MODULE, + .write = nvmf_dev_write, + .read = seq_read, + .open = nvmf_dev_open, + .release = nvmf_dev_release, +}; + +static struct miscdevice nvmf_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "nvme-fabrics", + .fops = &nvmf_dev_fops, +}; + +static int __init nvmf_init(void) +{ + int ret; + + nvmf_default_host = nvmf_host_default(); + if (!nvmf_default_host) + return -ENOMEM; + + nvmf_class = class_create(THIS_MODULE, "nvme-fabrics"); + if (IS_ERR(nvmf_class)) { + pr_err("couldn't register class nvme-fabrics\n"); + ret = PTR_ERR(nvmf_class); + goto out_free_host; + } + + nvmf_device = + device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(nvmf_device)) { + pr_err("couldn't create nvme-fabris device!\n"); + ret = PTR_ERR(nvmf_device); + goto out_destroy_class; + } + + ret = misc_register(&nvmf_misc); + if (ret) { + pr_err("couldn't register misc device: %d\n", ret); + goto out_destroy_device; + } + + return 0; + +out_destroy_device: + device_destroy(nvmf_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(nvmf_class); +out_free_host: + nvmf_host_put(nvmf_default_host); + return ret; +} + +static void __exit nvmf_exit(void) +{ + misc_deregister(&nvmf_misc); + device_destroy(nvmf_class, MKDEV(0, 0)); + class_destroy(nvmf_class); + nvmf_host_put(nvmf_default_host); + + BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024); +} + +MODULE_LICENSE("GPL v2"); + +module_init(nvmf_init); +module_exit(nvmf_exit); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h new file mode 100644 index 0000000..46e460a --- /dev/null +++ b/drivers/nvme/host/fabrics.h @@ -0,0 +1,132 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _NVME_FABRICS_H +#define _NVME_FABRICS_H 1 + +#include +#include + +#define NVMF_MIN_QUEUE_SIZE 16 +#define NVMF_MAX_QUEUE_SIZE 1024 +#define NVMF_DEF_QUEUE_SIZE 128 +#define NVMF_DEF_RECONNECT_DELAY 10 + +/* + * Define a host as seen by the target. We allocate one at boot, but also + * allow the override it when creating controllers. This is both to provide + * persistence of the Host NQN over multiple boots, and to allow using + * multiple ones, for example in a container scenario. Because we must not + * use different Host NQNs with the same Host ID we generate a Host ID and + * use this structure to keep track of the relation between the two. + */ +struct nvmf_host { + struct kref ref; + struct list_head list; + char nqn[NVMF_NQN_SIZE]; + uuid_be id; +}; + +/** + * enum nvmf_parsing_opts - used to define the sysfs parsing options used. + */ +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_TRANSPORT = 1 << 0, + NVMF_OPT_NQN = 1 << 1, + NVMF_OPT_TRADDR = 1 << 2, + NVMF_OPT_TRSVCID = 1 << 3, + NVMF_OPT_QUEUE_SIZE = 1 << 4, + NVMF_OPT_NR_IO_QUEUES = 1 << 5, + NVMF_OPT_TL_RETRY_COUNT = 1 << 6, + NVMF_OPT_KATO = 1 << 7, + NVMF_OPT_HOSTNQN = 1 << 8, + NVMF_OPT_RECONNECT_DELAY = 1 << 9, +}; + +/** + * struct nvmf_ctrl_options - Used to hold the options specified + * with the parsing opts enum. + * @mask: Used by the fabrics library to parse through sysfs options + * on adding a NVMe controller. + * @transport: Holds the fabric transport "technology name" (for a lack of + * better description) that will be used by an NVMe controller + * being added. + * @subsysnqn: Hold the fully qualified NQN subystem name (format defined + * in the NVMe specification, "NVMe Qualified Names"). + * @traddr: network address that will be used by the host to communicate + * to the added NVMe controller. + * @trsvcid: network port used for host-controller communication. + * @queue_size: Number of IO queue elements. + * @nr_io_queues: Number of controller IO queues that will be established. + * @reconnect_delay: Time between two consecutive reconnect attempts. + * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. + * @kato: Keep-alive timeout. + * @host: Virtual NVMe host, contains the NQN and Host ID. + */ +struct nvmf_ctrl_options { + unsigned mask; + char *transport; + char *subsysnqn; + char *traddr; + char *trsvcid; + size_t queue_size; + unsigned int nr_io_queues; + unsigned int reconnect_delay; + bool discovery_nqn; + unsigned int kato; + struct nvmf_host *host; +}; + +/* + * struct nvmf_transport_ops - used to register a specific + * fabric implementation of NVMe fabrics. + * @entry: Used by the fabrics library to add the new + * registration entry to its linked-list internal tree. + * @name: Name of the NVMe fabric driver implementation. + * @required_opts: sysfs command-line options that must be specified + * when adding a new NVMe controller. + * @allowed_opts: sysfs command-line options that can be specified + * when adding a new NVMe controller. + * @create_ctrl(): function pointer that points to a non-NVMe + * implementation-specific fabric technology + * that would go into starting up that fabric + * for the purpose of conneciton to an NVMe controller + * using that fabric technology. + * + * Notes: + * 1. At minimum, 'required_opts' and 'allowed_opts' should + * be set to the same enum parsing options defined earlier. + * 2. create_ctrl() must be defined (even if it does nothing) + */ +struct nvmf_transport_ops { + struct list_head entry; + const char *name; + int required_opts; + int allowed_opts; + struct nvme_ctrl *(*create_ctrl)(struct device *dev, + struct nvmf_ctrl_options *opts); +}; + +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +void nvmf_register_transport(struct nvmf_transport_ops *ops); +void nvmf_unregister_transport(struct nvmf_transport_ops *ops); +void nvmf_free_options(struct nvmf_ctrl_options *opts); +const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); + +#endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c new file mode 100644 index 0000000..63f483d --- /dev/null +++ b/drivers/nvme/host/lightnvm.c @@ -0,0 +1,629 @@ +/* + * nvme-lightnvm.c - LightNVM NVMe device + * + * Copyright (C) 2014-2015 IT University of Copenhagen + * Initial release: Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include "nvme.h" + +#include +#include +#include +#include + +enum nvme_nvm_admin_opcode { + nvme_nvm_admin_identity = 0xe2, + nvme_nvm_admin_get_l2p_tbl = 0xea, + nvme_nvm_admin_get_bb_tbl = 0xf2, + nvme_nvm_admin_set_bb_tbl = 0xf1, +}; + +struct nvme_nvm_hb_rw { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le64 slba; +}; + +struct nvme_nvm_ph_rw { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le64 resv; +}; + +struct nvme_nvm_identity { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le32 chnl_off; + __u32 rsvd11[5]; +}; + +struct nvme_nvm_l2ptbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[4]; + __le64 prp1; + __le64 prp2; + __le64 slba; + __le32 nlb; + __le16 cdw14[6]; +}; + +struct nvme_nvm_getbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __u32 rsvd4[4]; +}; + +struct nvme_nvm_setbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 nlb; + __u8 value; + __u8 rsvd3; + __u32 rsvd4[3]; +}; + +struct nvme_nvm_erase_blk { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le64 resv; +}; + +struct nvme_nvm_command { + union { + struct nvme_common_command common; + struct nvme_nvm_identity identity; + struct nvme_nvm_hb_rw hb_rw; + struct nvme_nvm_ph_rw ph_rw; + struct nvme_nvm_l2ptbl l2p; + struct nvme_nvm_getbbtbl get_bb; + struct nvme_nvm_setbbtbl set_bb; + struct nvme_nvm_erase_blk erase; + }; +}; + +struct nvme_nvm_completion { + __le64 result; /* Used by LightNVM to return ppa completions */ + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +#define NVME_NVM_LP_MLC_PAIRS 886 +struct nvme_nvm_lp_mlc { + __le16 num_pairs; + __u8 pairs[NVME_NVM_LP_MLC_PAIRS]; +}; + +struct nvme_nvm_lp_tbl { + __u8 id[8]; + struct nvme_nvm_lp_mlc mlc; +}; + +struct nvme_nvm_id_group { + __u8 mtype; + __u8 fmtype; + __le16 res16; + __u8 num_ch; + __u8 num_lun; + __u8 num_pln; + __u8 rsvd1; + __le16 num_blk; + __le16 num_pg; + __le16 fpg_sz; + __le16 csecs; + __le16 sos; + __le16 rsvd2; + __le32 trdt; + __le32 trdm; + __le32 tprt; + __le32 tprm; + __le32 tbet; + __le32 tbem; + __le32 mpos; + __le32 mccap; + __le16 cpar; + __u8 reserved[10]; + struct nvme_nvm_lp_tbl lptbl; +} __packed; + +struct nvme_nvm_addr_format { + __u8 ch_offset; + __u8 ch_len; + __u8 lun_offset; + __u8 lun_len; + __u8 pln_offset; + __u8 pln_len; + __u8 blk_offset; + __u8 blk_len; + __u8 pg_offset; + __u8 pg_len; + __u8 sect_offset; + __u8 sect_len; + __u8 res[4]; +} __packed; + +struct nvme_nvm_id { + __u8 ver_id; + __u8 vmnt; + __u8 cgrps; + __u8 res; + __le32 cap; + __le32 dom; + struct nvme_nvm_addr_format ppaf; + __u8 resv[228]; + struct nvme_nvm_id_group groups[4]; +} __packed; + +struct nvme_nvm_bb_tbl { + __u8 tblid[4]; + __le16 verid; + __le16 revid; + __le32 rvsd1; + __le32 tblks; + __le32 tfact; + __le32 tgrown; + __le32 tdresv; + __le32 thresv; + __le32 rsvd2[8]; + __u8 blk[0]; +}; + +/* + * Check we didn't inadvertently grow the command struct + */ +static inline void _nvme_nvm_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); + BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512); +} + +static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) +{ + struct nvme_nvm_id_group *src; + struct nvm_id_group *dst; + int i, end; + + end = min_t(u32, 4, nvm_id->cgrps); + + for (i = 0; i < end; i++) { + src = &nvme_nvm_id->groups[i]; + dst = &nvm_id->groups[i]; + + dst->mtype = src->mtype; + dst->fmtype = src->fmtype; + dst->num_ch = src->num_ch; + dst->num_lun = src->num_lun; + dst->num_pln = src->num_pln; + + dst->num_pg = le16_to_cpu(src->num_pg); + dst->num_blk = le16_to_cpu(src->num_blk); + dst->fpg_sz = le16_to_cpu(src->fpg_sz); + dst->csecs = le16_to_cpu(src->csecs); + dst->sos = le16_to_cpu(src->sos); + + dst->trdt = le32_to_cpu(src->trdt); + dst->trdm = le32_to_cpu(src->trdm); + dst->tprt = le32_to_cpu(src->tprt); + dst->tprm = le32_to_cpu(src->tprm); + dst->tbet = le32_to_cpu(src->tbet); + dst->tbem = le32_to_cpu(src->tbem); + dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); + + dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + + if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { + pr_err("nvm: number of MLC pairs not supported\n"); + return -EINVAL; + } + + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs); + } + } + + return 0; +} + +static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + struct nvme_nvm_id *nvme_nvm_id; + struct nvme_nvm_command c = {}; + int ret; + + c.identity.opcode = nvme_nvm_admin_identity; + c.identity.nsid = cpu_to_le32(ns->ns_id); + c.identity.chnl_off = 0; + + nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); + if (!nvme_nvm_id) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, + nvme_nvm_id, sizeof(struct nvme_nvm_id)); + if (ret) { + ret = -EIO; + goto out; + } + + nvm_id->ver_id = nvme_nvm_id->ver_id; + nvm_id->vmnt = nvme_nvm_id->vmnt; + nvm_id->cgrps = nvme_nvm_id->cgrps; + nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap); + nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom); + memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf, + sizeof(struct nvme_nvm_addr_format)); + + ret = init_grps(nvm_id, nvme_nvm_id); +out: + kfree(nvme_nvm_id); + return ret; +} + +static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, + nvm_l2p_update_fn *update_l2p, void *priv) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + struct nvme_nvm_command c = {}; + u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; + u32 nlb_pr_rq = len / sizeof(u64); + u64 cmd_slba = slba; + void *entries; + int ret = 0; + + c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; + c.l2p.nsid = cpu_to_le32(ns->ns_id); + entries = kmalloc(len, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + while (nlb) { + u32 cmd_nlb = min(nlb_pr_rq, nlb); + + c.l2p.slba = cpu_to_le64(cmd_slba); + c.l2p.nlb = cpu_to_le32(cmd_nlb); + + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, + (struct nvme_command *)&c, entries, len); + if (ret) { + dev_err(ns->ctrl->device, + "L2P table transfer failed (%d)\n", ret); + ret = -EIO; + goto out; + } + + if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { + ret = -EINTR; + goto out; + } + + cmd_slba += cmd_nlb; + nlb -= cmd_nlb; + } + +out: + kfree(entries); + return ret; +} + +static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, + u8 *blks) +{ + struct request_queue *q = nvmdev->q; + struct nvme_ns *ns = q->queuedata; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_nvm_command c = {}; + struct nvme_nvm_bb_tbl *bb_tbl; + int nr_blks = nvmdev->blks_per_lun * nvmdev->plane_mode; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; + int ret = 0; + + c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; + c.get_bb.nsid = cpu_to_le32(ns->ns_id); + c.get_bb.spba = cpu_to_le64(ppa.ppa); + + bb_tbl = kzalloc(tblsz, GFP_KERNEL); + if (!bb_tbl) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, + bb_tbl, tblsz); + if (ret) { + dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); + ret = -EIO; + goto out; + } + + if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || + bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { + dev_err(ctrl->device, "bbt format mismatch\n"); + ret = -EINVAL; + goto out; + } + + if (le16_to_cpu(bb_tbl->verid) != 1) { + ret = -EINVAL; + dev_err(ctrl->device, "bbt version not supported\n"); + goto out; + } + + if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { + ret = -EINVAL; + dev_err(ctrl->device, + "bbt unsuspected blocks returned (%u!=%u)", + le32_to_cpu(bb_tbl->tblks), nr_blks); + goto out; + } + + memcpy(blks, bb_tbl->blk, nvmdev->blks_per_lun * nvmdev->plane_mode); +out: + kfree(bb_tbl); + return ret; +} + +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, + int nr_ppas, int type) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + struct nvme_nvm_command c = {}; + int ret = 0; + + c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; + c.set_bb.nsid = cpu_to_le32(ns->ns_id); + c.set_bb.spba = cpu_to_le64(ppas->ppa); + c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); + c.set_bb.value = type; + + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, + NULL, 0); + if (ret) + dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", + ret); + return ret; +} + +static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, + struct nvme_ns *ns, struct nvme_nvm_command *c) +{ + c->ph_rw.opcode = rqd->opcode; + c->ph_rw.nsid = cpu_to_le32(ns->ns_id); + c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); + c->ph_rw.control = cpu_to_le16(rqd->flags); + c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); + + if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) + c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, + rqd->bio->bi_iter.bi_sector)); +} + +static void nvme_nvm_end_io(struct request *rq, int error) +{ + struct nvm_rq *rqd = rq->end_io_data; + struct nvme_nvm_completion *cqe = rq->special; + + if (cqe) + rqd->ppa_status = le64_to_cpu(cqe->result); + + nvm_end_io(rqd, error); + + kfree(rq->cmd); + blk_mq_free_request(rq); +} + +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_ns *ns = q->queuedata; + struct request *rq; + struct bio *bio = rqd->bio; + struct nvme_nvm_command *cmd; + + rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0); + if (IS_ERR(rq)) + return -ENOMEM; + + cmd = kzalloc(sizeof(struct nvme_nvm_command) + + sizeof(struct nvme_nvm_completion), GFP_KERNEL); + if (!cmd) { + blk_mq_free_request(rq); + return -ENOMEM; + } + + rq->cmd_type = REQ_TYPE_DRV_PRIV; + rq->ioprio = bio_prio(bio); + + if (bio_has_data(bio)) + rq->nr_phys_segments = bio_phys_segments(q, bio); + + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + + nvme_nvm_rqtocmd(rq, rqd, ns, cmd); + + rq->cmd = (unsigned char *)cmd; + rq->cmd_len = sizeof(struct nvme_nvm_command); + rq->special = cmd + 1; + + rq->end_io_data = rqd; + + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); + + return 0; +} + +static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_ns *ns = q->queuedata; + struct nvme_nvm_command c = {}; + + c.erase.opcode = NVM_OP_ERASE; + c.erase.nsid = cpu_to_le32(ns->ns_id); + c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c.erase.length = cpu_to_le16(rqd->nr_ppas - 1); + + return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); +} + +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + + return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); +} + +static void nvme_nvm_destroy_dma_pool(void *pool) +{ + struct dma_pool *dma_pool = pool; + + dma_pool_destroy(dma_pool); +} + +static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, + gfp_t mem_flags, dma_addr_t *dma_handler) +{ + return dma_pool_alloc(pool, mem_flags, dma_handler); +} + +static void nvme_nvm_dev_dma_free(void *pool, void *addr, + dma_addr_t dma_handler) +{ + dma_pool_free(pool, addr, dma_handler); +} + +static struct nvm_dev_ops nvme_nvm_dev_ops = { + .identity = nvme_nvm_identity, + + .get_l2p_tbl = nvme_nvm_get_l2p_tbl, + + .get_bb_tbl = nvme_nvm_get_bb_tbl, + .set_bb_tbl = nvme_nvm_set_bb_tbl, + + .submit_io = nvme_nvm_submit_io, + .erase_block = nvme_nvm_erase_block, + + .create_dma_pool = nvme_nvm_create_dma_pool, + .destroy_dma_pool = nvme_nvm_destroy_dma_pool, + .dev_dma_alloc = nvme_nvm_dev_dma_alloc, + .dev_dma_free = nvme_nvm_dev_dma_free, + + .max_phys_sect = 64, +}; + +int nvme_nvm_register(struct request_queue *q, char *disk_name) +{ + return nvm_register(q, disk_name, &nvme_nvm_dev_ops); +} + +void nvme_nvm_unregister(struct request_queue *q, char *disk_name) +{ + nvm_unregister(disk_name); +} + +/* move to shared place when used in multiple places. */ +#define PCI_VENDOR_ID_CNEX 0x1d1d +#define PCI_DEVICE_ID_CNEX_WL 0x2807 +#define PCI_DEVICE_ID_CNEX_QEMU 0x1f1f + +int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + /* XXX: this is poking into PCI structures from generic code! */ + struct pci_dev *pdev = to_pci_dev(ctrl->dev); + + /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ + if (pdev->vendor == PCI_VENDOR_ID_CNEX && + pdev->device == PCI_DEVICE_ID_CNEX_QEMU && + id->vs[0] == 0x1) + return 1; + + /* CNEX Labs - PCI ID + Vendor specific bit */ + if (pdev->vendor == PCI_VENDOR_ID_CNEX && + pdev->device == PCI_DEVICE_ID_CNEX_WL && + id->vs[0] == 0x1) + return 1; + + return 0; +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h new file mode 100644 index 0000000..ab18b78 --- /dev/null +++ b/drivers/nvme/host/nvme.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVME_H +#define _NVME_H + +#include +#include +#include +#include + +enum { + /* + * Driver internal status code for commands that were cancelled due + * to timeouts or controller shutdown. The value is negative so + * that it a) doesn't overlap with the unsigned hardware error codes, + * and b) can easily be tested for. + */ + NVME_SC_CANCELLED = -EINTR, +}; + +extern unsigned char nvme_io_timeout; +#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) + +extern unsigned char admin_timeout; +#define ADMIN_TIMEOUT (admin_timeout * HZ) + +extern unsigned char shutdown_timeout; +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + +#define NVME_DEFAULT_KATO 5 +#define NVME_KATO_GRACE 10 + +extern unsigned int nvme_max_retries; + +enum { + NVME_NS_LBA = 0, + NVME_NS_LIGHTNVM = 1, +}; + +/* + * List of workarounds for devices that required behavior not specified in + * the standard. + */ +enum nvme_quirks { + /* + * Prefers I/O aligned to a stripe size specified in a vendor + * specific Identify field. + */ + NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify value others than 0 or 1 + * correctly. + */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), + + /* + * The controller deterministically returns O's on reads to discarded + * logical blocks. + */ + NVME_QUIRK_DISCARD_ZEROES = (1 << 2), + + /* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ + NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), +}; + +/* The below value is the specific amount of delay needed before checking + * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the + * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was + * found empirically. + */ +#define NVME_QUIRK_DELAY_AMOUNT 2000 + +enum nvme_ctrl_state { + NVME_CTRL_NEW, + NVME_CTRL_LIVE, + NVME_CTRL_RESETTING, + NVME_CTRL_RECONNECTING, + NVME_CTRL_DELETING, + NVME_CTRL_DEAD, +}; + +struct nvme_ctrl { + enum nvme_ctrl_state state; + spinlock_t lock; + const struct nvme_ctrl_ops *ops; + struct request_queue *admin_q; + struct request_queue *connect_q; + struct device *dev; + struct kref kref; + int instance; + struct blk_mq_tag_set *tagset; + struct list_head namespaces; + struct mutex namespaces_mutex; + struct device *device; /* char device */ + struct list_head node; + struct ida ns_ida; + + char name[12]; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u16 cntlid; + + u32 ctrl_config; + + u32 page_size; + u32 max_hw_sectors; + u32 stripe_size; + u16 oncs; + u16 vid; + atomic_t abort_limit; + u8 event_limit; + u8 vwc; + u32 vs; + u32 sgls; + u16 kas; + unsigned int kato; + bool subsystem; + unsigned long quirks; + struct work_struct scan_work; + struct work_struct async_event_work; + struct delayed_work ka_work; + + /* Fabrics only */ + u16 sqsize; + u32 ioccsz; + u32 iorcsz; + u16 icdoff; + u16 maxcmd; + struct nvmf_ctrl_options *opts; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_ctrl *ctrl; + struct request_queue *queue; + struct gendisk *disk; + struct kref kref; + int instance; + + u8 eui[8]; + u8 uuid[16]; + + unsigned ns_id; + int lba_shift; + u16 ms; + bool ext; + u8 pi_type; + int type; + unsigned long flags; + +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 + + u64 mode_select_num_blocks; + u32 mode_select_block_len; +}; + +struct nvme_ctrl_ops { + const char *name; + struct module *module; + bool is_fabrics; + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + int (*reset_ctrl)(struct nvme_ctrl *ctrl); + void (*free_ctrl)(struct nvme_ctrl *ctrl); + void (*post_scan)(struct nvme_ctrl *ctrl); + void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); + int (*delete_ctrl)(struct nvme_ctrl *ctrl); + const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl); + int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); +}; + +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + +static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) +{ + return (sector >> (ns->lba_shift - 9)); +} + +static inline unsigned nvme_map_len(struct request *rq) +{ + if (req_op(rq) == REQ_OP_DISCARD) + return sizeof(struct nvme_dsm_range); + else + return blk_rq_bytes(rq); +} + +static inline void nvme_cleanup_cmd(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD) + kfree(req->completion_data); +} + +static inline int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + +static inline bool nvme_req_needs_retry(struct request *req, u16 status) +{ + return !(status & NVME_SC_DNR || blk_noretry_request(req)) && + (jiffies - req->start_time) < req->timeout && + req->retries < nvme_max_retries; +} + +void nvme_cancel_request(struct request *req, void *data, bool reserved); +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_put_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); + +void nvme_queue_scan(struct nvme_ctrl *ctrl); +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + +#define NVME_NR_AERS 1 +void nvme_complete_async_event(struct nvme_ctrl *ctrl, + struct nvme_completion *cqe); +void nvme_queue_async_events(struct nvme_ctrl *ctrl); + +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); + +#define NVME_QID_ANY -1 +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags, int qid); +void nvme_requeue_req(struct request *req); +int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd); +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buf, unsigned bufflen); +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + struct nvme_completion *cqe, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, int flags); +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout); +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout); +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, + struct nvme_id_ns **id); +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result); +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); +void nvme_start_keep_alive(struct nvme_ctrl *ctrl); +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); + +struct sg_io_hdr; + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); +int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); +int nvme_sg_get_version_num(int __user *ip); + +#ifdef CONFIG_NVM +int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); +int nvme_nvm_register(struct request_queue *q, char *disk_name); +void nvme_nvm_unregister(struct request_queue *q, char *disk_name); +#else +static inline int nvme_nvm_register(struct request_queue *q, char *disk_name) +{ + return 0; +} + +static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; + +static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + return 0; +} +#endif /* CONFIG_NVM */ + +int __init nvme_core_init(void); +void nvme_core_exit(void); + +#endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c new file mode 100644 index 0000000..60f7eab --- /dev/null +++ b/drivers/nvme/host/pci.c @@ -0,0 +1,2166 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" + +#define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 256 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) + +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + +static bool use_cmb_sqes = true; +module_param(use_cmb_sqes, bool, 0644); +MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); + +static struct workqueue_struct *nvme_workq; + +struct nvme_dev; +struct nvme_queue; + +static int nvme_reset(struct nvme_dev *dev); +static void nvme_process_cq(struct nvme_queue *nvmeq); +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct nvme_queue **queues; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; + u32 __iomem *dbs; + struct device *dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + unsigned queue_count; + unsigned online_queues; + unsigned max_qid; + int q_depth; + u32 db_stride; + struct msix_entry *entry; + void __iomem *bar; + struct work_struct reset_work; + struct work_struct remove_work; + struct timer_list watchdog_timer; + struct mutex shutdown_lock; + bool subsystem; + void __iomem *cmb; + dma_addr_t cmb_dma_addr; + u64 cmb_size; + u32 cmbsz; + struct nvme_ctrl ctrl; + struct completion ioq_wait; +}; + +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + struct nvme_dev *dev; + char irqname[24]; /* nvme4294967295-65535\0 */ + spinlock_t q_lock; + struct nvme_command *sq_cmds; + struct nvme_command __iomem *sq_cmds_io; + volatile struct nvme_completion *cqes; + struct blk_mq_tags **tags; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + u32 __iomem *q_db; + u16 q_depth; + s16 cq_vector; + u16 sq_tail; + u16 cq_head; + u16 qid; + u8 cq_phase; + u8 cqe_seen; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_init_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + struct nvme_queue *nvmeq; + int aborted; + int npages; /* In the PRP list. 0 means small pool in use */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ + struct scatterlist *sg; + struct scatterlist inline_sg[0]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); +} + +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, + dev->ctrl.page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg) +{ + return sizeof(__le64 *) * nvme_npages(size, dev) + + sizeof(struct scatterlist) * nseg; +} + +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + return sizeof(struct nvme_iod) + + nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); +} + +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(hctx_idx != 0); + WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + WARN_ON(nvmeq->tags); + + hctx->driver_data = nvmeq; + nvmeq->tags = &dev->admin_tagset.tags[0]; + return 0; +} + +static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->tags = NULL; +} + +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + struct nvme_dev *dev = data; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + iod->nvmeq = nvmeq; + return 0; +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + if (!nvmeq->tags) + nvmeq->tags = &dev->tagset.tags[hctx_idx]; + + WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + struct nvme_dev *dev = data; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + iod->nvmeq = nvmeq; + return 0; +} + +/** + * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static void __nvme_submit_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) +{ + u16 tail = nvmeq->sq_tail; + + if (nvmeq->sq_cmds_io) + memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); + else + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + + if (++tail == nvmeq->q_depth) + tail = 0; + writel(tail, nvmeq->q_db); + nvmeq->sq_tail = tail; +} + +static __le64 **iod_list(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (__le64 **)(iod->sg + req->nr_phys_segments); +} + +static int nvme_init_iod(struct request *rq, unsigned size, + struct nvme_dev *dev) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); + int nseg = rq->nr_phys_segments; + + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { + iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + if (!iod->sg) + return BLK_MQ_RQ_QUEUE_BUSY; + } else { + iod->sg = iod->inline_sg; + } + + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + iod->length = size; + + if (!(rq->cmd_flags & REQ_DONTPREP)) { + rq->retries = 0; + rq->cmd_flags |= REQ_DONTPREP; + } + return 0; +} + +static void nvme_free_iod(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + const int last_prp = dev->ctrl.page_size / 8 - 1; + int i; + __le64 **list = iod_list(req); + dma_addr_t prp_dma = iod->first_dma; + + nvme_cleanup_cmd(req); + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); + prp_dma = next_prp_dma; + } + + if (iod->sg != iod->inline_sg) + kfree(iod->sg); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required. + */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->queue->integrity.tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ +} +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ +} +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ +} +#endif + +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, + int total_len) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; + int length = total_len; + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + u32 page_size = dev->ctrl.page_size; + int offset = dma_addr & (page_size - 1); + __le64 *prp_list; + __le64 **list = iod_list(req); + dma_addr_t prp_dma; + int nprps, i; + + length -= (page_size - offset); + if (length <= 0) + return true; + + dma_len -= (page_size - offset); + if (dma_len) { + dma_addr += (page_size - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= page_size) { + iod->first_dma = dma_addr; + return true; + } + + nprps = DIV_ROUND_UP(length, page_size); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) { + iod->first_dma = dma_addr; + iod->npages = -1; + return false; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + i = 0; + for (;;) { + if (i == page_size >> 3) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) + return false; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return true; +} + +static int nvme_map_data(struct nvme_dev *dev, struct request *req, + unsigned size, struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + int ret = BLK_MQ_RQ_QUEUE_ERROR; + + sg_init_table(iod->sg, req->nr_phys_segments); + iod->nents = blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) + goto out; + + ret = BLK_MQ_RQ_QUEUE_BUSY; + if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) + goto out; + + if (!nvme_setup_prps(dev, req, size)) + goto out_unmap; + + ret = BLK_MQ_RQ_QUEUE_ERROR; + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(q, req->bio) != 1) + goto out_unmap; + + sg_init_table(&iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) + goto out_unmap; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) + goto out_unmap; + } + + cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); + if (blk_integrity_rq(req)) + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); + return BLK_MQ_RQ_QUEUE_OK; + +out_unmap: + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); +out: + return ret; +} + +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + if (iod->nents) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); + } + } + + nvme_free_iod(dev, req); +} + +/* + * NOTE: ns is NULL when called on the admin queue. + */ +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + struct request *req = bd->rq; + struct nvme_command cmnd; + unsigned map_len; + int ret = BLK_MQ_RQ_QUEUE_OK; + + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns && ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8) && + req->cmd_type != REQ_TYPE_DRV_PRIV) { + blk_mq_end_request(req, -EFAULT); + return BLK_MQ_RQ_QUEUE_OK; + } + } + + map_len = nvme_map_len(req); + ret = nvme_init_iod(req, map_len, dev); + if (ret) + return ret; + + ret = nvme_setup_cmd(ns, req, &cmnd); + if (ret) + goto out; + + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, map_len, &cmnd); + + if (ret) + goto out; + + cmnd.common.command_id = req->tag; + blk_mq_start_request(req); + + spin_lock_irq(&nvmeq->q_lock); + if (unlikely(nvmeq->cq_vector < 0)) { + if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) + ret = BLK_MQ_RQ_QUEUE_BUSY; + else + ret = BLK_MQ_RQ_QUEUE_ERROR; + spin_unlock_irq(&nvmeq->q_lock); + goto out; + } + __nvme_submit_cmd(nvmeq, &cmnd); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; +out: + nvme_free_iod(dev, req); + return ret; +} + +static void nvme_complete_rq(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = iod->nvmeq->dev; + int error = 0; + + nvme_unmap_data(dev, req); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + req->retries++; + nvme_requeue_req(req); + return; + } + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + if (unlikely(iod->aborted)) { + dev_warn(dev->ctrl.device, + "completing aborted command with status: %04x\n", + req->errors); + } + + blk_mq_end_request(req, error); +} + +/* We read the CQE phase first to check if the rest of the entry is valid */ +static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, + u16 phase) +{ + return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; +} + +static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) +{ + u16 head, phase; + + head = nvmeq->cq_head; + phase = nvmeq->cq_phase; + + while (nvme_cqe_valid(nvmeq, head, phase)) { + struct nvme_completion cqe = nvmeq->cqes[head]; + struct request *req; + + if (++head == nvmeq->q_depth) { + head = 0; + phase = !phase; + } + + if (tag && *tag == cqe.command_id) + *tag = -1; + + if (unlikely(cqe.command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe.command_id, le16_to_cpu(cqe.sq_id)); + continue; + } + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe); + continue; + } + + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, &cqe, sizeof(cqe)); + blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1); + + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + return; + + if (likely(nvmeq->cq_vector >= 0)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + nvmeq->cq_head = head; + nvmeq->cq_phase = phase; + + nvmeq->cqe_seen = 1; +} + +static void nvme_process_cq(struct nvme_queue *nvmeq) +{ + __nvme_process_cq(nvmeq, NULL); +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; + nvmeq->cqe_seen = 0; + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return IRQ_WAKE_THREAD; + return IRQ_NONE; +} + +static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { + spin_lock_irq(&nvmeq->q_lock); + __nvme_process_cq(nvmeq, &tag); + spin_unlock_irq(&nvmeq->q_lock); + + if (tag == -1) + return 1; + } + + return 0; +} + +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; + + spin_lock_irq(&nvmeq->q_lock); + __nvme_submit_cmd(nvmeq, &c); + spin_unlock_irq(&nvmeq->q_lock); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + /* + * Note: we (ab)use the fact the the prp fields survive if no data + * is attached to the request. + */ + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + /* + * Note: we (ab)use the fact the the prp fields survive if no data + * is attached to the request. + */ + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static void abort_endio(struct request *req, int error) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + u16 status = req->errors; + + dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); + blk_mq_free_request(req); +} + +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_dev *dev = nvmeq->dev; + struct request *abort_req; + struct nvme_command cmd; + + /* + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_HANDLED. + */ + if (dev->ctrl.state == NVME_CTRL_RESETTING) { + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; + } + + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. + */ + if (!nvmeq->qid || iod->aborted) { + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + queue_work(nvme_workq, &dev->reset_work); + + /* + * Mark the request as handled, since the inline shutdown + * forces all outstanding requests to complete. + */ + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; + } + + iod->aborted = 1; + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = req->tag; + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + + dev_warn(nvmeq->dev->ctrl.device, + "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); + + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->timeout = ADMIN_TIMEOUT; + abort_req->end_io_data = NULL; + blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} + +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + if (nvmeq->sq_cmds) + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static void nvme_free_queues(struct nvme_dev *dev, int lowest) +{ + int i; + + for (i = dev->queue_count - 1; i >= lowest; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + dev->queue_count--; + dev->queues[i] = NULL; + nvme_free_queue(nvmeq); + } +} + +/** + * nvme_suspend_queue - put queue into suspended state + * @nvmeq - queue to suspend + */ +static int nvme_suspend_queue(struct nvme_queue *nvmeq) +{ + int vector; + + spin_lock_irq(&nvmeq->q_lock); + if (nvmeq->cq_vector == -1) { + spin_unlock_irq(&nvmeq->q_lock); + return 1; + } + vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; + nvmeq->dev->online_queues--; + nvmeq->cq_vector = -1; + spin_unlock_irq(&nvmeq->q_lock); + + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); + + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); + + return 0; +} + +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + + if (!nvmeq) + return; + if (nvme_suspend_queue(nvmeq)) + return; + + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( + dev->bar + NVME_REG_CAP)); + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); +} + +static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, + int entry_size) +{ + int q_depth = dev->q_depth; + unsigned q_size_aligned = roundup(q_depth * entry_size, + dev->ctrl.page_size); + + if (q_size_aligned * nr_io_queues > dev->cmb_size) { + u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); + q_depth = div_u64(mem_per_q, entry_size); + + /* + * Ensure the reduced q_depth is above some threshold where it + * would be better to map queues in system memory with the + * original depth + */ + if (q_depth < 64) + return -ENOMEM; + } + + return q_depth; +} + +static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, + int qid, int depth) +{ + if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), + dev->ctrl.page_size); + nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; + nvmeq->sq_cmds_io = dev->cmb + offset; + } else { + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + return -ENOMEM; + } + + return 0; +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth) +{ + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + + if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) + goto free_cqdma; + + nvmeq->q_dmadev = dev->dev; + nvmeq->dev = dev; + snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", + dev->ctrl.instance, qid); + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + nvmeq->q_depth = depth; + nvmeq->qid = qid; + nvmeq->cq_vector = -1; + dev->queues[qid] = nvmeq; + dev->queue_count++; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq, IRQF_SHARED, + name, nvmeq); + return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_SHARED, name, nvmeq); +} + +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +{ + struct nvme_dev *dev = nvmeq->dev; + + spin_lock_irq(&nvmeq->q_lock); + nvmeq->sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; + + nvmeq->cq_vector = qid - 1; + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + return result; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + if (result < 0) + goto release_sq; + + nvme_init_queue(nvmeq, qid); + return result; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + return result; +} + +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_admin_exit_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, + .poll = nvme_poll, +}; + +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. + */ + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); + blk_cleanup_queue(dev->ctrl.admin_q); + blk_mq_free_tag_set(&dev->admin_tagset); + } +} + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->ctrl.admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + + /* + * Subtract one to leave an empty queue entry for 'Full Queue' + * condition. See NVM-Express 1.2 specification, section 4.1.2. + */ + dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(dev->dev); + dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + if (!blk_get_queue(dev->ctrl.admin_q)) { + nvme_dev_remove_admin(dev); + dev->ctrl.admin_q = NULL; + return -ENODEV; + } + } else + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); + + return 0; +} + +static int nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + struct nvme_queue *nvmeq; + + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? + NVME_CAP_NSSRC(cap) : 0; + + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); + + result = nvme_disable_ctrl(&dev->ctrl, cap); + if (result < 0) + return result; + + nvmeq = dev->queues[0]; + if (!nvmeq) { + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + if (!nvmeq) + return -ENOMEM; + } + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); + + result = nvme_enable_ctrl(&dev->ctrl, cap); + if (result) + goto free_nvmeq; + + nvmeq->cq_vector = 0; + result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + if (result) { + nvmeq->cq_vector = -1; + goto free_nvmeq; + } + + return result; + + free_nvmeq: + nvme_free_queues(dev, 0); + return result; +} + +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. */ + if (work_busy(&dev->reset_work)) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + +static void nvme_watchdog_timer(unsigned long data) +{ + struct nvme_dev *dev = (struct nvme_dev *)data; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* Skip controllers under certain specific conditions. */ + if (nvme_should_reset(dev, csts)) { + if (queue_work(nvme_workq, &dev->reset_work)) + dev_warn(dev->dev, + "Failed status: 0x%x, reset controller.\n", + csts); + return; + } + + mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); +} + +static int nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i, max; + int ret = 0; + + for (i = dev->queue_count; i <= dev->max_qid; i++) { + if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; + break; + } + } + + max = min(dev->max_qid, dev->queue_count - 1); + for (i = dev->online_queues; i <= max; i++) { + ret = nvme_create_queue(dev->queues[i], i); + if (ret) { + nvme_free_queues(dev, i); + break; + } + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired aount of queues, and even a controller without + * I/O queues an still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; +} + +static void __iomem *nvme_map_cmb(struct nvme_dev *dev) +{ + u64 szu, size, offset; + u32 cmbloc; + resource_size_t bar_size; + struct pci_dev *pdev = to_pci_dev(dev->dev); + void __iomem *cmb; + dma_addr_t dma_addr; + + if (!use_cmb_sqes) + return NULL; + + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); + if (!(NVME_CMB_SZ(dev->cmbsz))) + return NULL; + + cmbloc = readl(dev->bar + NVME_REG_CMBLOC); + + szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); + size = szu * NVME_CMB_SZ(dev->cmbsz); + offset = szu * NVME_CMB_OFST(cmbloc); + bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc)); + + if (offset > bar_size) + return NULL; + + /* + * Controllers may support a CMB size larger than their BAR, + * for example, due to being behind a bridge. Reduce the CMB to + * the reported size of the BAR + */ + if (size > bar_size - offset) + size = bar_size - offset; + + dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset; + cmb = ioremap_wc(dma_addr, size); + if (!cmb) + return NULL; + + dev->cmb_dma_addr = dma_addr; + dev->cmb_size = size; + return cmb; +} + +static inline void nvme_release_cmb(struct nvme_dev *dev) +{ + if (dev->cmb) { + iounmap(dev->cmb); + dev->cmb = NULL; + } +} + +static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_setup_io_queues(struct nvme_dev *dev) +{ + struct nvme_queue *adminq = dev->queues[0]; + struct pci_dev *pdev = to_pci_dev(dev->dev); + int result, i, vecs, nr_io_queues, size; + + nr_io_queues = num_online_cpus(); + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) + return result; + + if (nr_io_queues == 0) + return 0; + + if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + result = nvme_cmb_qdepth(dev, nr_io_queues, + sizeof(struct nvme_command)); + if (result > 0) + dev->q_depth = result; + else + nvme_release_cmb(dev); + } + + size = db_bar_size(dev, nr_io_queues); + if (size > 8192) { + iounmap(dev->bar); + do { + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (dev->bar) + break; + if (!--nr_io_queues) + return -ENOMEM; + size = db_bar_size(dev, nr_io_queues); + } while (1); + dev->dbs = dev->bar + 4096; + adminq->q_db = dev->dbs; + } + + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, adminq); + + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + if (pdev->msi_enabled) + pci_disable_msi(pdev); + else if (pdev->msix_enabled) + pci_disable_msix(pdev); + + for (i = 0; i < nr_io_queues; i++) + dev->entry[i].entry = i; + vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); + if (vecs < 0) { + vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); + if (vecs < 0) { + vecs = 1; + } else { + for (i = 0; i < vecs; i++) + dev->entry[i].vector = i + pdev->irq; + } + } + + /* + * Should investigate if there's a performance win from allocating + * more queues than interrupt vectors; it might allow the submission + * path to scale better, even if the receive path is limited by the + * number of interrupts. + */ + nr_io_queues = vecs; + dev->max_qid = nr_io_queues; + + result = queue_request_irq(dev, adminq, adminq->irqname); + if (result) { + adminq->cq_vector = -1; + goto free_queues; + } + return nvme_create_io_queues(dev); + + free_queues: + nvme_free_queues(dev, 1); + return result; +} + +static void nvme_pci_post_scan(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->tags || !(*nvmeq->tags)) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + blk_mq_tags_cpumask(*nvmeq->tags)); + } +} + +static void nvme_del_queue_end(struct request *req, int error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + blk_mq_free_request(req); + complete(&nvmeq->dev->ioq_wait); +} + +static void nvme_del_cq_end(struct request *req, int error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + if (!error) { + unsigned long flags; + + /* + * We might be called with the AQ q_lock held + * and the I/O queue q_lock should always + * nest inside the AQ one. + */ + spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + SINGLE_DEPTH_NESTING); + nvme_process_cq(nvmeq); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + } + + nvme_del_queue_end(req, error); +} + +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) +{ + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); + + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = nvmeq; + + blk_execute_rq_nowait(q, NULL, req, false, + opcode == nvme_admin_delete_cq ? + nvme_del_cq_end : nvme_del_queue_end); + return 0; +} + +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + int pass, queues = dev->online_queues - 1; + unsigned long timeout; + u8 opcode = nvme_admin_delete_sq; + + for (pass = 0; pass < 2; pass++) { + int sent = 0, i = queues; + + reinit_completion(&dev->ioq_wait); + retry: + timeout = ADMIN_TIMEOUT; + for (; i > 0; i--, sent++) + if (nvme_delete_queue(dev->queues[i], opcode)) + break; + + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); + if (timeout == 0) + return; + if (i) + goto retry; + } + opcode = nvme_admin_delete_cq; + } +} + +/* + * Return: error value if an error occurred setting up the queues or calling + * Identify Device. 0 if these succeeded, even if adding some of the + * namespaces failed. At the moment, these failures are silent. TBD which + * failures should be reported. + */ +static int nvme_dev_add(struct nvme_dev *dev) +{ + if (!dev->ctrl.tagset) { + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + return 0; + dev->ctrl.tagset = &dev->tagset; + } else { + blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, dev->online_queues); + } + + return 0; +} + +static int nvme_pci_enable(struct nvme_dev *dev) +{ + u64 cap; + int result = -ENOMEM; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_enable_device_mem(pdev)) + return result; + + pci_set_master(pdev); + + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) + goto disable; + + if (readl(dev->bar + NVME_REG_CSTS) == -1) { + result = -ENODEV; + goto disable; + } + + /* + * Some devices and/or platforms don't advertise or work with INTx + * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll + * adjust this later. + */ + if (pci_enable_msix(pdev, dev->entry, 1)) { + pci_enable_msi(pdev); + dev->entry[0].vector = pdev->irq; + } + + if (!dev->entry[0].vector) { + result = -ENODEV; + goto disable; + } + + cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); + dev->db_stride = 1 << NVME_CAP_STRIDE(cap); + dev->dbs = dev->bar + 4096; + + /* + * Temporary fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. + */ + if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + dev->q_depth = 2; + dev_warn(dev->dev, "detected Apple NVMe controller, set " + "queue depth=%u to work around controller resets\n", + dev->q_depth); + } + + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) + dev->cmb = nvme_map_cmb(dev); + + pci_enable_pcie_error_reporting(pdev); + pci_save_state(pdev); + return 0; + + disable: + pci_disable_device(pdev); + return result; +} + +static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->bar) + iounmap(dev->bar); + pci_release_mem_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pdev->msi_enabled) + pci_disable_msi(pdev); + else if (pdev->msix_enabled) + pci_disable_msix(pdev); + + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } +} + +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) +{ + int i; + u32 csts = -1; + + del_timer_sync(&dev->watchdog_timer); + + mutex_lock(&dev->shutdown_lock); + if (pci_is_enabled(to_pci_dev(dev->dev))) { + nvme_stop_queues(&dev->ctrl); + csts = readl(dev->bar + NVME_REG_CSTS); + } + + for (i = dev->queue_count - 1; i > 0; i--) + nvme_suspend_queue(dev->queues[i]); + + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { + /* A device might become IO incapable very soon during + * probe, before the admin queue is configured. Thus, + * queue_count can be 0 here. + */ + if (dev->queue_count) + nvme_suspend_queue(dev->queues[0]); + } else { + nvme_disable_io_queues(dev); + nvme_disable_admin_queue(dev, shutdown); + } + nvme_pci_disable(dev); + + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + mutex_unlock(&dev->shutdown_lock); +} + +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); +} + +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + + put_device(dev->dev); + if (dev->tagset.tags) + blk_mq_free_tag_set(&dev->tagset); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +{ + dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); + + kref_get(&dev->ctrl.kref); + nvme_dev_disable(dev, false); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + +static void nvme_reset_work(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + int result = -ENODEV; + + if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) + goto out; + + /* + * If we're called to reset a live controller first shut it down before + * moving on. + */ + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) + nvme_dev_disable(dev, false); + + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) + goto out; + + result = nvme_pci_enable(dev); + if (result) + goto out; + + result = nvme_configure_admin_queue(dev); + if (result) + goto out; + + nvme_init_queue(dev->queues[0], 0); + result = nvme_alloc_admin_tags(dev); + if (result) + goto out; + + result = nvme_init_identify(&dev->ctrl); + if (result) + goto out; + + result = nvme_setup_io_queues(dev); + if (result) + goto out; + + /* + * A controller that can not execute IO typically requires user + * intervention to correct. For such degraded controllers, the driver + * should not submit commands the user did not request, so skip + * registering for asynchronous event notification on this condition. + */ + if (dev->online_queues > 1) + nvme_queue_async_events(&dev->ctrl); + + mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); + + /* + * Keep the controller around but remove all namespaces if we don't have + * any working I/O queue. + */ + if (dev->online_queues < 2) { + dev_warn(dev->ctrl.device, "IO queues not created\n"); + nvme_kill_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + } else { + nvme_start_queues(&dev->ctrl); + nvme_dev_add(dev); + } + + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + goto out; + } + + if (dev->online_queues > 1) + nvme_queue_scan(&dev->ctrl); + return; + + out: + nvme_remove_dead_ctrl(dev, result); +} + +static void nvme_remove_dead_ctrl_work(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); + struct pci_dev *pdev = to_pci_dev(dev->dev); + + nvme_kill_queues(&dev->ctrl); + if (pci_get_drvdata(pdev)) + device_release_driver(&pdev->dev); + nvme_put_ctrl(&dev->ctrl); +} + +static int nvme_reset(struct nvme_dev *dev) +{ + if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) + return -ENODEV; + + if (!queue_work(nvme_workq, &dev->reset_work)) + return -EBUSY; + + flush_work(&dev->reset_work); + return 0; +} + +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + *val = readl(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) +{ + return nvme_reset(to_nvme_dev(ctrl)); +} + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .name = "pcie", + .module = THIS_MODULE, + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .reset_ctrl = nvme_pci_reset_ctrl, + .free_ctrl = nvme_pci_free_ctrl, + .post_scan = nvme_pci_post_scan, + .submit_async_event = nvme_pci_submit_async_event, +}; + +static int nvme_dev_map(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_request_mem_regions(pdev, "nvme")) + return -ENODEV; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto release; + + return 0; + release: + pci_release_mem_regions(pdev); + return -ENODEV; +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int node, result = -ENOMEM; + struct nvme_dev *dev; + + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, first_memory_node); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + if (!dev) + return -ENOMEM; + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); + if (!dev->entry) + goto free; + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); + if (!dev->queues) + goto free; + + dev->dev = get_device(&pdev->dev); + pci_set_drvdata(pdev, dev); + + result = nvme_dev_map(dev); + if (result) + goto free; + + INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, + (unsigned long)dev); + mutex_init(&dev->shutdown_lock); + init_completion(&dev->ioq_wait); + + result = nvme_setup_prp_pools(dev); + if (result) + goto put_pci; + + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + id->driver_data); + if (result) + goto release_pools; + + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + + queue_work(nvme_workq, &dev->reset_work); + return 0; + + release_pools: + nvme_release_prp_pools(dev); + put_pci: + put_device(dev->dev); + nvme_dev_unmap(dev); + free: + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); + return result; +} + +static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + if (prepare) + nvme_dev_disable(dev, false); + else + queue_work(nvme_workq, &dev->reset_work); +} + +static void nvme_shutdown(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_disable(dev, true); +} + +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed. + */ +static void nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + + pci_set_drvdata(pdev, NULL); + + if (!pci_device_is_present(pdev)) + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + + flush_work(&dev->reset_work); + nvme_uninit_ctrl(&dev->ctrl); + nvme_dev_disable(dev, true); + nvme_dev_remove_admin(dev); + nvme_free_queues(dev, 0); + nvme_release_cmb(dev); + nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); + nvme_put_ctrl(&dev->ctrl); +} + +static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs) +{ + int ret = 0; + + if (numvfs == 0) { + if (pci_vfs_assigned(pdev)) { + dev_warn(&pdev->dev, + "Cannot disable SR-IOV VFs while assigned\n"); + return -EPERM; + } + pci_disable_sriov(pdev); + return 0; + } + + ret = pci_enable_sriov(pdev, numvfs); + return ret ? ret : numvfs; +} + +#ifdef CONFIG_PM_SLEEP +static int nvme_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + nvme_dev_disable(ndev, true); + return 0; +} + +static int nvme_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + queue_work(nvme_workq, &ndev->reset_work); + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); + +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. + */ + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(dev->ctrl.device, + "frozen state error detected, reset controller\n"); + nvme_dev_disable(dev, false); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(dev->ctrl.device, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(dev->ctrl.device, "restart after slot reset\n"); + pci_restore_state(pdev); + queue_work(nvme_workq, &dev->reset_work); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + +static const struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, + .reset_notify = nvme_reset_notify, +}; + +/* Move to pci_ids.h later */ +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 + +static const struct pci_device_id nvme_id_table[] = { + { PCI_VDEVICE(INTEL, 0x0953), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a53), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a54), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, + .shutdown = nvme_shutdown, + .driver = { + .pm = &nvme_dev_pm_ops, + }, + .sriov_configure = nvme_pci_sriov_configure, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + int result; + + nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!nvme_workq) + return -ENOMEM; + + result = pci_register_driver(&nvme_driver); + if (result) + destroy_workqueue(nvme_workq); + return result; +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + destroy_workqueue(nvme_workq); + _nvme_check_size(); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c new file mode 100644 index 0000000..fbdb226 --- /dev/null +++ b/drivers/nvme/host/rdma.c @@ -0,0 +1,2033 @@ +/* + * NVMe over Fabrics RDMA host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + + +#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */ + +#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */ + +#define NVME_RDMA_MAX_SEGMENTS 256 + +#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_RDMA_NR_AEN_COMMANDS 1 +#define NVME_RDMA_AQ_BLKMQ_DEPTH \ + (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) + +struct nvme_rdma_device { + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct kref ref; + struct list_head entry; +}; + +struct nvme_rdma_qe { + struct ib_cqe cqe; + void *data; + u64 dma; +}; + +struct nvme_rdma_queue; +struct nvme_rdma_request { + struct ib_mr *mr; + struct nvme_rdma_qe sqe; + struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; + u32 num_sge; + int nents; + bool inline_data; + struct ib_reg_wr reg_wr; + struct ib_cqe reg_cqe; + struct nvme_rdma_queue *queue; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +enum nvme_rdma_queue_flags { + NVME_RDMA_Q_CONNECTED = (1 << 0), + NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1), + NVME_RDMA_Q_DELETING = (1 << 2), +}; + +struct nvme_rdma_queue { + struct nvme_rdma_qe *rsp_ring; + u8 sig_count; + int queue_size; + size_t cmnd_capsule_len; + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *device; + struct ib_cq *ib_cq; + struct ib_qp *qp; + + unsigned long flags; + struct rdma_cm_id *cm_id; + int cm_error; + struct completion cm_done; +}; + +struct nvme_rdma_ctrl { + /* read and written in the hot path */ + spinlock_t lock; + + /* read only in the hot path */ + struct nvme_rdma_queue *queues; + u32 queue_count; + + /* other member variables */ + struct blk_mq_tag_set tag_set; + struct work_struct delete_work; + struct work_struct reset_work; + struct work_struct err_work; + + struct nvme_rdma_qe async_event_sqe; + + int reconnect_delay; + struct delayed_work reconnect_work; + + struct list_head list; + + struct blk_mq_tag_set admin_tag_set; + struct nvme_rdma_device *device; + + u64 cap; + u32 max_fr_pages; + + union { + struct sockaddr addr; + struct sockaddr_in addr_in; + }; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); +} + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static LIST_HEAD(nvme_rdma_ctrl_list); +static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); + +static struct workqueue_struct *nvme_rdma_wq; + +/* + * Disabling this option makes small I/O goes faster, but is fundamentally + * unsafe. With it turned off we will have to register a global rkey that + * allows read and write access to all physical memory. + */ +static bool register_always = true; +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); + +/* XXX: really should move to a generic header sooner or later.. */ +static inline void put_unaligned_le24(u32 val, u8 *p) +{ + *p++ = val; + *p++ = val >> 8; + *p++ = val >> 16; +} + +static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); + kfree(qe->data); +} + +static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + qe->data = kzalloc(capsule_size, GFP_KERNEL); + if (!qe->data) + return -ENOMEM; + + qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); + if (ib_dma_mapping_error(ibdev, qe->dma)) { + kfree(qe->data); + return -ENOMEM; + } + + return 0; +} + +static void nvme_rdma_free_ring(struct ib_device *ibdev, + struct nvme_rdma_qe *ring, size_t ib_queue_size, + size_t capsule_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ib_queue_size; i++) + nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); + kfree(ring); +} + +static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, + size_t ib_queue_size, size_t capsule_size, + enum dma_data_direction dir) +{ + struct nvme_rdma_qe *ring; + int i; + + ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); + if (!ring) + return NULL; + + for (i = 0; i < ib_queue_size; i++) { + if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) + goto out_free_ring; + } + + return ring; + +out_free_ring: + nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); + return NULL; +} + +static void nvme_rdma_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %d\n", event->event); +} + +static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) +{ + wait_for_completion_interruptible_timeout(&queue->cm_done, + msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1); + return queue->cm_error; +} + +static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_qp_init_attr init_attr; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.event_handler = nvme_rdma_qp_event; + /* +1 for drain */ + init_attr.cap.max_send_wr = factor * queue->queue_size + 1; + /* +1 for drain */ + init_attr.cap.max_recv_wr = queue->queue_size + 1; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = queue->ib_cq; + init_attr.recv_cq = queue->ib_cq; + + ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); + + queue->qp = queue->cm_id->qp; + return ret; +} + +static int nvme_rdma_reinit_request(void *data, struct request *rq) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_device *dev = ctrl->device; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int ret = 0; + + if (!req->mr->need_inval) + goto out; + + ib_dereg_mr(req->mr); + + req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, + ctrl->max_fr_pages); + if (IS_ERR(req->mr)) { + ret = PTR_ERR(req->mr); + req->mr = NULL; + goto out; + } + + req->mr->need_inval = false; + +out: + return ret; +} + +static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, + struct request *rq, unsigned int queue_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + + if (req->mr) + ib_dereg_mr(req->mr); + + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); +} + +static void nvme_rdma_exit_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx) +{ + return __nvme_rdma_exit_request(data, rq, hctx_idx + 1); +} + +static void nvme_rdma_exit_admin_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx) +{ + return __nvme_rdma_exit_request(data, rq, 0); +} + +static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, + struct request *rq, unsigned int queue_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int ret; + + BUG_ON(queue_idx >= ctrl->queue_count); + + ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (ret) + return ret; + + req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, + ctrl->max_fr_pages); + if (IS_ERR(req->mr)) { + ret = PTR_ERR(req->mr); + goto out_free_qe; + } + + req->queue = queue; + + return 0; + +out_free_qe: + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return -ENOMEM; +} + +static int nvme_rdma_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return __nvme_rdma_init_request(data, rq, hctx_idx + 1); +} + +static int nvme_rdma_init_admin_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return __nvme_rdma_init_request(data, rq, 0); +} + +static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static void nvme_rdma_free_dev(struct kref *ref) +{ + struct nvme_rdma_device *ndev = + container_of(ref, struct nvme_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + if (!register_always) + ib_dereg_mr(ndev->mr); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) +{ + kref_put(&dev->ref, nvme_rdma_free_dev); +} + +static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) +{ + return kref_get_unless_zero(&dev->ref); +} + +static struct nvme_rdma_device * +nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvme_rdma_device *ndev; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev->node_guid == cm_id->device->node_guid && + nvme_rdma_dev_get(ndev)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->dev = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->dev); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (!register_always) { + ndev->mr = ib_get_dma_mr(ndev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(ndev->mr)) + goto out_free_pd; + } + + if (!(ndev->dev->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS)) { + dev_err(&ndev->dev->dev, + "Memory registrations not supported.\n"); + goto out_free_mr; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + return ndev; + +out_free_mr: + if (!register_always) + ib_dereg_mr(ndev->mr); +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev; + struct ib_device *ibdev; + + if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags)) + return; + + dev = queue->device; + ibdev = dev->dev; + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->ib_cq); + + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + + nvme_rdma_dev_put(dev); +} + +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, + struct nvme_rdma_device *dev) +{ + struct ib_device *ibdev = dev->dev; + const int send_wr_factor = 3; /* MR, SEND, INV */ + const int cq_factor = send_wr_factor + 1; /* + RECV */ + int comp_vector, idx = nvme_rdma_queue_idx(queue); + + int ret; + + queue->device = dev; + + /* + * The admin queue is barely used once the controller is live, so don't + * bother to spread it out. + */ + if (idx == 0) + comp_vector = 0; + else + comp_vector = idx % ibdev->num_comp_vectors; + + + /* +1 for ib_stop_cq */ + queue->ib_cq = ib_alloc_cq(dev->dev, queue, + cq_factor * queue->queue_size + 1, comp_vector, + IB_POLL_SOFTIRQ); + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + goto out; + } + + ret = nvme_rdma_create_qp(queue, send_wr_factor); + if (ret) + goto out_destroy_ib_cq; + + queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + if (!queue->rsp_ring) { + ret = -ENOMEM; + goto out_destroy_qp; + } + set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags); + + return 0; + +out_destroy_qp: + ib_destroy_qp(queue->qp); +out_destroy_ib_cq: + ib_free_cq(queue->ib_cq); +out: + return ret; +} + +static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, + int idx, size_t queue_size) +{ + struct nvme_rdma_queue *queue; + int ret; + + queue = &ctrl->queues[idx]; + queue->ctrl = ctrl; + init_completion(&queue->cm_done); + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + queue->queue_size = queue_size; + + queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(queue->cm_id)) { + dev_info(ctrl->ctrl.device, + "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); + return PTR_ERR(queue->cm_id); + } + + queue->cm_error = -ETIMEDOUT; + ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr, + NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + ret = nvme_rdma_wait_for_cm(queue); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr wait failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); + set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); + + return 0; + +out_destroy_cm_id: + nvme_rdma_destroy_queue_ib(queue); + rdma_destroy_id(queue->cm_id); + return ret; +} + +static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + +static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) +{ + nvme_rdma_destroy_queue_ib(queue); + rdma_destroy_id(queue->cm_id); +} + +static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue) +{ + if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) + return; + nvme_rdma_stop_queue(queue); + nvme_rdma_free_queue(queue); +} + +static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); +} + +static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + break; + } + + return ret; +} + +static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_rdma_init_queue(ctrl, i, + ctrl->ctrl.opts->queue_size); + if (ret) { + dev_info(ctrl->ctrl.device, + "failed to initialize i/o queue: %d\n", ret); + goto out_free_queues; + } + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + + return ret; +} + +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvme_rdma_dev_put(ctrl->device); +} + +static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + bool changed; + int ret; + + if (ctrl->queue_count > 1) { + nvme_rdma_free_io_queues(ctrl); + + ret = blk_mq_reinit_tagset(&ctrl->tag_set); + if (ret) + goto requeue; + } + + nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + + ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set); + if (ret) + goto requeue; + + ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + if (ret) + goto requeue; + + blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + if (ret) + goto stop_admin_q; + + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (ret) + goto stop_admin_q; + + nvme_start_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + goto stop_admin_q; + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto stop_admin_q; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + if (ctrl->queue_count > 1) { + nvme_start_queues(&ctrl->ctrl); + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + + return; + +stop_admin_q: + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); +requeue: + /* Make sure we are not resetting/deleting */ + if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) { + dev_info(ctrl->ctrl.device, + "Failed reconnect attempt, requeueing...\n"); + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->reconnect_delay * HZ); + } +} + +static void nvme_rdma_error_recovery_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, err_work); + int i; + + nvme_stop_keep_alive(&ctrl->ctrl); + + for (i = 0; i < ctrl->queue_count; i++) + clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags); + + if (ctrl->queue_count > 1) + nvme_stop_queues(&ctrl->ctrl); + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + + /* We must take care of fastfail/requeue all our inflight requests */ + if (ctrl->queue_count > 1) + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + + dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n", + ctrl->reconnect_delay); + + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->reconnect_delay * HZ); +} + +static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) + return; + + queue_work(nvme_rdma_wq, &ctrl->err_work); +} + +static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, + const char *op) +{ + struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + dev_info(ctrl->ctrl.device, + "%s for CQE 0x%p failed with status %s (%d)\n", + op, wc->wr_cqe, + ib_wc_status_msg(wc->status), wc->status); + nvme_rdma_error_recovery(ctrl); +} + +static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "MEMREG"); +} + +static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); +} + +static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = req->mr->rkey, + }; + + req->reg_cqe.done = nvme_rdma_inv_rkey_done; + wr.wr_cqe = &req->reg_cqe; + + return ib_post_send(queue->qp, &wr, &bad_wr); +} + +static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, + struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int res; + + if (!blk_rq_bytes(rq)) + return; + + if (req->mr->need_inval) { + res = nvme_rdma_inv_rkey(queue, req); + if (res < 0) { + dev_err(ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, res); + nvme_rdma_error_recovery(queue->ctrl); + } + } + + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + nvme_cleanup_cmd(rq); + sg_free_table_chained(&req->sg_table, true); +} + +static int nvme_rdma_set_sg_null(struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = 0; + put_unaligned_le24(0, sg->length); + put_unaligned_le32(0, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + req->sge[1].addr = sg_dma_address(req->sg_table.sgl); + req->sge[1].length = sg_dma_len(req->sg_table.sgl); + req->sge[1].lkey = queue->device->pd->local_dma_lkey; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; + + req->inline_data = true; + req->num_sge++; + return 0; +} + +static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl)); + put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length); + put_unaligned_le32(queue->device->mr->rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE); + if (nr < count) { + if (nr < 0) + return nr; + return -EINVAL; + } + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_memreg_done; + memset(&req->reg_wr, 0, sizeof(req->reg_wr)); + req->reg_wr.wr.opcode = IB_WR_REG_MR; + req->reg_wr.wr.wr_cqe = &req->reg_cqe; + req->reg_wr.wr.num_sge = 0; + req->reg_wr.mr = req->mr; + req->reg_wr.key = req->mr->rkey; + req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + req->mr->need_inval = true; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_INVALIDATE; + + return 0; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, unsigned int map_len, + struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int nents, count; + int ret; + + req->num_sge = 1; + req->inline_data = false; + req->mr->need_inval = false; + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_bytes(rq)) + return nvme_rdma_set_sg_null(c); + + req->sg_table.sgl = req->first_sgl; + ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments, + req->sg_table.sgl); + if (ret) + return -ENOMEM; + + nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); + BUG_ON(nents > rq->nr_phys_segments); + req->nents = nents; + + count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents, + rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (unlikely(count <= 0)) { + sg_free_table_chained(&req->sg_table, true); + return -EIO; + } + + if (count == 1) { + if (rq_data_dir(rq) == WRITE && + map_len <= nvme_rdma_inline_data_size(queue) && + nvme_rdma_queue_idx(queue)) + return nvme_rdma_map_sg_inline(queue, req, c); + + if (!register_always) + return nvme_rdma_map_sg_single(queue, req, c); + } + + return nvme_rdma_map_sg_fr(queue, req, c, count); +} + +static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "SEND"); +} + +static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, + struct ib_send_wr *first, bool flush) +{ + struct ib_send_wr wr, *bad_wr; + int ret; + + sge->addr = qe->dma; + sge->length = sizeof(struct nvme_command), + sge->lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_send_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = sge; + wr.num_sge = num_sge; + wr.opcode = IB_WR_SEND; + wr.send_flags = 0; + + /* + * Unsignalled send completions are another giant desaster in the + * IB Verbs spec: If we don't regularly post signalled sends + * the send queue will fill up and only a QP reset will rescue us. + * Would have been way to obvious to handle this in hardware or + * at least the RDMA stack.. + * + * This messy and racy code sniplet is copy and pasted from the iSER + * initiator, and the magic '32' comes from there as well. + * + * Always signal the flushes. The magic request used for the flush + * sequencer is not allocated in our driver's tagset and it's + * triggered to be freed by blk_cleanup_queue(). So we need to + * always mark it as signaled to ensure that the "wr_cqe", which is + * embeded in request's payload, is not freed when __ib_process_cq() + * calls wr_cqe->done(). + */ + if ((++queue->sig_count % 32) == 0 || flush) + wr.send_flags |= IB_SEND_SIGNALED; + + if (first) + first->next = ≀ + else + first = ≀ + + ret = ib_post_send(queue->qp, first, &bad_wr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe) +{ + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + int ret; + + list.addr = qe->dma; + list.length = sizeof(struct nvme_completion); + list.lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_recv_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + ret = ib_post_recv(queue->qp, &wr, &bad_wr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) +{ + u32 queue_idx = nvme_rdma_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + struct ib_device *dev = queue->device->dev; + struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; + struct nvme_command *cmd = sqe->data; + struct ib_sge sge; + int ret; + + if (WARN_ON_ONCE(aer_idx != 0)) + return; + + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); + + memset(cmd, 0, sizeof(*cmd)); + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_rdma_set_sg_null(cmd); + + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), + DMA_TO_DEVICE); + + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); + WARN_ON_ONCE(ret); +} + +static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc, int tag) +{ + u16 status = le16_to_cpu(cqe->status); + struct request *rq; + struct nvme_rdma_request *req; + int ret = 0; + + status >>= 1; + + rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "tag 0x%x on QP %#x not found\n", + cqe->command_id, queue->qp->qp_num); + nvme_rdma_error_recovery(queue->ctrl); + return ret; + } + req = blk_mq_rq_to_pdu(rq); + + if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special) + memcpy(rq->special, cqe, sizeof(*cqe)); + + if (rq->tag == tag) + ret = 1; + + if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && + wc->ex.invalidate_rkey == req->mr->rkey) + req->mr->need_inval = false; + + blk_mq_complete_request(rq, status); + + return ret; +} + +static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_queue *queue = cq->cq_context; + struct ib_device *ibdev = queue->device->dev; + struct nvme_completion *cqe = qe->data; + const size_t len = sizeof(struct nvme_completion); + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "RECV"); + return 0; + } + + ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_rdma_queue_idx(queue) == 0 && + cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe); + else + ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); + + nvme_rdma_post_recv(queue, qe); + return ret; +} + +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + __nvme_rdma_recv_done(cq, wc, -1); +} + +static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) +{ + int ret, i; + + for (i = 0; i < queue->queue_size; i++) { + ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); + if (ret) + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, + struct rdma_cm_event *ev) +{ + if (ev->param.conn.private_data_len) { + struct nvme_rdma_cm_rej *rej = + (struct nvme_rdma_cm_rej *)ev->param.conn.private_data; + + dev_err(queue->ctrl->ctrl.device, + "Connect rejected, status %d.", le16_to_cpu(rej->sts)); + /* XXX: Think of something clever to do here... */ + } else { + dev_err(queue->ctrl->ctrl.device, + "Connect rejected, no private data.\n"); + } + + return -ECONNRESET; +} + +static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev; + int ret; + + dev = nvme_rdma_find_get_device(queue->cm_id); + if (!dev) { + dev_err(queue->cm_id->device->dma_device, + "no client data found!\n"); + return -ECONNREFUSED; + } + + ret = nvme_rdma_create_queue_ib(queue, dev); + if (ret) { + nvme_rdma_dev_put(dev); + goto out; + } + + ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "rdma_resolve_route failed (%d).\n", + queue->cm_error); + goto out_destroy_queue; + } + + return 0; + +out_destroy_queue: + nvme_rdma_destroy_queue_ib(queue); +out: + return ret; +} + +static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_req priv = { }; + int ret; + + param.qp_num = queue->qp->qp_num; + param.flow_control = 1; + + param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; + /* maximum retry count */ + param.retry_count = 7; + param.rnr_retry_count = 7; + param.private_data = &priv; + param.private_data_len = sizeof(priv); + + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); + /* + * set the admin queue depth to the minimum size + * specified by the Fabrics standard. + */ + if (priv.qid == 0) { + priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + } else { + /* + * current interpretation of the fabrics spec + * is at minimum you make hrqsize sqsize+1, or a + * 1's based representation of sqsize. + */ + priv.hrqsize = cpu_to_le16(queue->queue_size); + priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + } + + ret = rdma_connect(queue->cm_id, ¶m); + if (ret) { + dev_err(ctrl->ctrl.device, + "rdma_connect failed (%d).\n", ret); + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct nvme_rdma_queue *queue = cm_id->context; + int cm_error = 0; + + dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", + rdma_event_msg(ev->event), ev->event, + ev->status, cm_id); + + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cm_error = nvme_rdma_addr_resolved(queue); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cm_error = nvme_rdma_route_resolved(queue); + break; + case RDMA_CM_EVENT_ESTABLISHED: + queue->cm_error = nvme_rdma_conn_established(queue); + /* complete cm_done regardless of success/failure */ + complete(&queue->cm_done); + return 0; + case RDMA_CM_EVENT_REJECTED: + cm_error = nvme_rdma_conn_rejected(queue, ev); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + dev_dbg(queue->ctrl->ctrl.device, + "CM error event %d\n", ev->event); + cm_error = -ECONNRESET; + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + dev_dbg(queue->ctrl->ctrl.device, + "disconnect received - connection closed\n"); + nvme_rdma_error_recovery(queue->ctrl); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* device removal is handled via the ib_client API */ + break; + default: + dev_err(queue->ctrl->ctrl.device, + "Unexpected RDMA CM event (%d)\n", ev->event); + nvme_rdma_error_recovery(queue->ctrl); + break; + } + + if (cm_error) { + queue->cm_error = cm_error; + complete(&queue->cm_done); + } + + return 0; +} + +static enum blk_eh_timer_return +nvme_rdma_timeout(struct request *rq, bool reserved) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + nvme_rdma_error_recovery(req->queue->ctrl); + + /* fail with DNR on cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_rdma_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_qe *sqe = &req->sqe; + struct nvme_command *c = sqe->data; + bool flush = false; + struct ib_device *dev; + unsigned int map_len; + int ret; + + WARN_ON_ONCE(rq->tag < 0); + + dev = queue->device->dev; + ib_dma_sync_single_for_cpu(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + ret = nvme_setup_cmd(ns, rq, c); + if (ret) + return ret; + + c->common.command_id = rq->tag; + blk_mq_start_request(rq); + + map_len = nvme_map_len(rq); + ret = nvme_rdma_map_data(queue, rq, map_len, c); + if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", ret); + nvme_cleanup_cmd(rq); + goto err; + } + + ib_dma_sync_single_for_device(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH) + flush = true; + ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); + if (ret) { + nvme_rdma_unmap_data(queue, rq); + goto err; + } + + return BLK_MQ_RQ_QUEUE_OK; +err: + return (ret == -ENOMEM || ret == -EAGAIN) ? + BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; +} + +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_rdma_queue *queue = hctx->driver_data; + struct ib_cq *cq = queue->ib_cq; + struct ib_wc wc; + int found = 0; + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while (ib_poll_cq(cq, 1, &wc) > 0) { + struct ib_cqe *cqe = wc.wr_cqe; + + if (cqe) { + if (cqe->done == nvme_rdma_recv_done) + found |= __nvme_rdma_recv_done(cq, &wc, tag); + else + cqe->done(cq, &wc); + } + } + + return found; +} + +static void nvme_rdma_complete_rq(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + int error = 0; + + nvme_rdma_unmap_data(queue, rq); + + if (unlikely(rq->errors)) { + if (nvme_req_needs_retry(rq, rq->errors)) { + nvme_requeue_req(rq); + return; + } + + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + error = rq->errors; + else + error = nvme_error_status(rq->errors); + } + + blk_mq_end_request(rq, error); +} + +static struct blk_mq_ops nvme_rdma_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .reinit_request = nvme_rdma_reinit_request, + .init_hctx = nvme_rdma_init_hctx, + .poll = nvme_rdma_poll, + .timeout = nvme_rdma_timeout, +}; + +static struct blk_mq_ops nvme_rdma_admin_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_rdma_init_admin_request, + .exit_request = nvme_rdma_exit_admin_request, + .reinit_request = nvme_rdma_reinit_request, + .init_hctx = nvme_rdma_init_admin_hctx, + .timeout = nvme_rdma_timeout, +}; + +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + int error; + + error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + error = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_queue; + + ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ctrl->device->dev->attrs.max_fast_reg_page_list_len); + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_put_dev; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, + &ctrl->async_event_sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + /* disconnect and drain the queue before freeing the tagset */ + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) +{ + nvme_stop_keep_alive(&ctrl->ctrl); + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_free_io_queues(ctrl); + } + + if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags)) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); +} + +static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) +{ + nvme_uninit_ctrl(&ctrl->ctrl); + if (shutdown) + nvme_rdma_shutdown_ctrl(ctrl); + + if (ctrl->ctrl.tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_rdma_dev_put(ctrl->device); + } + + nvme_put_ctrl(&ctrl->ctrl); +} + +static void nvme_rdma_del_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, delete_work); + + __nvme_rdma_remove_ctrl(ctrl, true); +} + +static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!queue_work(nvme_rdma_wq, &ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + int ret = 0; + + /* + * Keep a reference until all work is flushed since + * __nvme_rdma_del_ctrl can free the ctrl mem + */ + if (!kref_get_unless_zero(&ctrl->ctrl.kref)) + return -EBUSY; + ret = __nvme_rdma_del_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->delete_work); + nvme_put_ctrl(&ctrl->ctrl); + return ret; +} + +static void nvme_rdma_remove_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, delete_work); + + __nvme_rdma_remove_ctrl(ctrl, false); +} + +static void nvme_rdma_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, reset_work); + int ret; + bool changed; + + nvme_rdma_shutdown_ctrl(ctrl); + + ret = nvme_rdma_configure_admin_queue(ctrl); + if (ret) { + /* ctrl is already shutdown, just remove the ctrl */ + INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); + goto del_dead_ctrl; + } + + if (ctrl->queue_count > 1) { + ret = blk_mq_reinit_tagset(&ctrl->tag_set); + if (ret) + goto del_dead_ctrl; + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + goto del_dead_ctrl; + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto del_dead_ctrl; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + if (ctrl->queue_count > 1) { + nvme_start_queues(&ctrl->ctrl); + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return; + +del_dead_ctrl: + /* Deleting this dead controller... */ + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work)); +} + +static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!queue_work(nvme_rdma_wq, &ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { + .name = "rdma", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_rdma_reset_ctrl, + .free_ctrl = nvme_rdma_free_ctrl, + .submit_async_event = nvme_rdma_submit_async_event, + .delete_ctrl = nvme_rdma_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, + .get_address = nvmf_get_address, +}; + +static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = opts->nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", opts->nr_io_queues); + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + return ret; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + ret = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_io_queues; + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_rdma_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_put_dev; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + +static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p) +{ + u8 *addr = (u8 *)&in_addr->sin_addr.s_addr; + size_t buflen = strlen(p); + + /* XXX: handle IPv6 addresses */ + + if (buflen > INET_ADDRSTRLEN) + return -EINVAL; + if (in4_pton(p, buflen, addr, '\0', NULL) == 0) + return -EINVAL; + in_addr->sin_family = AF_INET; + return 0; +} + +static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + int ret; + bool changed; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr); + if (ret) { + pr_err("malformed IP address passed: %s\n", opts->traddr); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_TRSVCID) { + u16 port; + + ret = kstrtou16(opts->trsvcid, 0, &port); + if (ret) + goto out_free_ctrl; + + ctrl->addr_in.sin_port = cpu_to_be16(port); + } else { + ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT); + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_free_ctrl; + + ctrl->reconnect_delay = opts->reconnect_delay; + INIT_DELAYED_WORK(&ctrl->reconnect_work, + nvme_rdma_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); + INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work); + spin_lock_init(&ctrl->lock); + + ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_rdma_configure_admin_queue(ctrl); + if (ret) + goto out_kfree_queues; + + /* sanity check icdoff */ + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto out_remove_admin_queue; + } + + /* sanity check keyed sgls */ + if (!(ctrl->ctrl.sgls & (1 << 20))) { + dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + goto out_remove_admin_queue; + } + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_rdma_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + + kref_get(&ctrl->ctrl.kref); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_stop_keep_alive(&ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); +out_kfree_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_rdma_transport = { + .name = "rdma", + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY, + .create_ctrl = nvme_rdma_create_ctrl, +}; + +static void nvme_rdma_add_one(struct ib_device *ib_device) +{ +} + +static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvme_rdma_ctrl *ctrl; + + /* Delete all controllers using this device */ + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + if (ctrl->device->dev != ib_device) + continue; + dev_info(ctrl->ctrl.device, + "Removing ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + __nvme_rdma_del_ctrl(ctrl); + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + flush_workqueue(nvme_rdma_wq); +} + +static struct ib_client nvme_rdma_ib_client = { + .name = "nvme_rdma", + .add = nvme_rdma_add_one, + .remove = nvme_rdma_remove_one +}; + +static int __init nvme_rdma_init_module(void) +{ + int ret; + + nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); + if (!nvme_rdma_wq) + return -ENOMEM; + + ret = ib_register_client(&nvme_rdma_ib_client); + if (ret) { + destroy_workqueue(nvme_rdma_wq); + return ret; + } + + nvmf_register_transport(&nvme_rdma_transport); + return 0; +} + +static void __exit nvme_rdma_cleanup_module(void) +{ + nvmf_unregister_transport(&nvme_rdma_transport); + ib_unregister_client(&nvme_rdma_ib_client); + destroy_workqueue(nvme_rdma_wq); +} + +module_init(nvme_rdma_init_module); +module_exit(nvme_rdma_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c new file mode 100644 index 0000000..e947e29 --- /dev/null +++ b/drivers/nvme/host/scsi.c @@ -0,0 +1,2574 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +/* + * Refer to the SCSI-NVMe Translation spec for details on how + * each command is translated. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" + +static int sg_version_num = 30534; /* 2 digits for each component */ + +/* VPD Page Codes */ +#define VPD_SUPPORTED_PAGES 0x00 +#define VPD_SERIAL_NUMBER 0x80 +#define VPD_DEVICE_IDENTIFIERS 0x83 +#define VPD_EXTENDED_INQUIRY 0x86 +#define VPD_BLOCK_LIMITS 0xB0 +#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 + +/* format unit paramter list offsets */ +#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 +#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 +#define FORMAT_UNIT_PROT_INT_OFFSET 3 +#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 +#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 + +/* Misc. defines */ +#define FIXED_SENSE_DATA 0x70 +#define DESC_FORMAT_SENSE_DATA 0x72 +#define FIXED_SENSE_DATA_ADD_LENGTH 10 +#define LUN_ENTRY_SIZE 8 +#define LUN_DATA_HEADER_SIZE 8 +#define ALL_LUNS_RETURNED 0x02 +#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 +#define RESTRICTED_LUNS_RETURNED 0x00 +#define NVME_POWER_STATE_START_VALID 0x00 +#define NVME_POWER_STATE_ACTIVE 0x01 +#define NVME_POWER_STATE_IDLE 0x02 +#define NVME_POWER_STATE_STANDBY 0x03 +#define NVME_POWER_STATE_LU_CONTROL 0x07 +#define POWER_STATE_0 0 +#define POWER_STATE_1 1 +#define POWER_STATE_2 2 +#define POWER_STATE_3 3 +#define DOWNLOAD_SAVE_ACTIVATE 0x05 +#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E +#define ACTIVATE_DEFERRED_MICROCODE 0x0F +#define FORMAT_UNIT_IMMED_MASK 0x2 +#define FORMAT_UNIT_IMMED_OFFSET 1 +#define KELVIN_TEMP_FACTOR 273 +#define FIXED_FMT_SENSE_DATA_SIZE 18 +#define DESC_FMT_SENSE_DATA_SIZE 8 + +/* SCSI/NVMe defines and bit masks */ +#define INQ_STANDARD_INQUIRY_PAGE 0x00 +#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 +#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 +#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 +#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 +#define INQ_BDEV_LIMITS_PAGE 0xB0 +#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 +#define INQ_SERIAL_NUMBER_LENGTH 0x14 +#define INQ_NUM_SUPPORTED_VPD_PAGES 6 +#define VERSION_SPC_4 0x06 +#define ACA_UNSUPPORTED 0 +#define STANDARD_INQUIRY_LENGTH 36 +#define ADDITIONAL_STD_INQ_LENGTH 31 +#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C +#define RESERVED_FIELD 0 + +/* Mode Sense/Select defines */ +#define MODE_PAGE_INFO_EXCEP 0x1C +#define MODE_PAGE_CACHING 0x08 +#define MODE_PAGE_CONTROL 0x0A +#define MODE_PAGE_POWER_CONDITION 0x1A +#define MODE_PAGE_RETURN_ALL 0x3F +#define MODE_PAGE_BLK_DES_LEN 0x08 +#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 +#define MODE_PAGE_CACHING_LEN 0x14 +#define MODE_PAGE_CONTROL_LEN 0x0C +#define MODE_PAGE_POW_CND_LEN 0x28 +#define MODE_PAGE_INF_EXC_LEN 0x0C +#define MODE_PAGE_ALL_LEN 0x54 +#define MODE_SENSE6_MPH_SIZE 4 +#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 +#define MODE_SENSE_PAGE_CODE_OFFSET 2 +#define MODE_SENSE_PAGE_CODE_MASK 0x3F +#define MODE_SENSE_LLBAA_MASK 0x10 +#define MODE_SENSE_LLBAA_SHIFT 4 +#define MODE_SENSE_DBD_MASK 8 +#define MODE_SENSE_DBD_SHIFT 3 +#define MODE_SENSE10_MPH_SIZE 8 +#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 +#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 +#define MODE_SELECT_6_BD_OFFSET 3 +#define MODE_SELECT_10_BD_OFFSET 6 +#define MODE_SELECT_10_LLBAA_OFFSET 4 +#define MODE_SELECT_10_LLBAA_MASK 1 +#define MODE_SELECT_6_MPH_SIZE 4 +#define MODE_SELECT_10_MPH_SIZE 8 +#define CACHING_MODE_PAGE_WCE_MASK 0x04 +#define MODE_SENSE_BLK_DESC_ENABLED 0 +#define MODE_SENSE_BLK_DESC_COUNT 1 +#define MODE_SELECT_PAGE_CODE_MASK 0x3F +#define SHORT_DESC_BLOCK 8 +#define LONG_DESC_BLOCK 16 +#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 +#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A +#define MODE_PAGE_CACHING_LEN_FIELD 0x12 +#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A +#define MODE_SENSE_PC_CURRENT_VALUES 0 + +/* Log Sense defines */ +#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 +#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 +#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F +#define LOG_PAGE_TEMPERATURE_PAGE 0x0D +#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 +#define LOG_SENSE_CDB_PC_MASK 0xC0 +#define LOG_SENSE_CDB_PC_SHIFT 6 +#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 +#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F +#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 +#define LOG_INFO_EXCP_PAGE_LENGTH 0xC +#define REMAINING_TEMP_PAGE_LENGTH 0xC +#define LOG_TEMP_PAGE_LENGTH 0x10 +#define LOG_TEMP_UNKNOWN 0xFF +#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 + +/* Read Capacity defines */ +#define READ_CAP_10_RESP_SIZE 8 +#define READ_CAP_16_RESP_SIZE 32 + +/* NVMe Namespace and Command Defines */ +#define BYTES_TO_DWORDS 4 +#define NVME_MAX_FIRMWARE_SLOT 7 + +/* Report LUNs defines */ +#define REPORT_LUNS_FIRST_LUN_OFFSET 8 + +/* SCSI ADDITIONAL SENSE Codes */ + +#define SCSI_ASC_NO_SENSE 0x00 +#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 +#define SCSI_ASC_LUN_NOT_READY 0x04 +#define SCSI_ASC_WARNING 0x0B +#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 +#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D +#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 +#define SCSI_ASC_ILLEGAL_COMMAND 0x20 +#define SCSI_ASC_ILLEGAL_BLOCK 0x21 +#define SCSI_ASC_INVALID_CDB 0x24 +#define SCSI_ASC_INVALID_LUN 0x25 +#define SCSI_ASC_INVALID_PARAMETER 0x26 +#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 +#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 + +/* SCSI ADDITIONAL SENSE Code Qualifiers */ + +#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 +#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 +#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 +#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 +#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 +#define SCSI_ASCQ_INVALID_LUN_ID 0x09 + +/* copied from drivers/usb/gadget/function/storage_common.h */ +static inline u32 get_unaligned_be24(u8 *buf) +{ + return 0xffffff & (u32) get_unaligned_be32(buf - 1); +} + +/* Struct to gather data that needs to be extracted from a SCSI CDB. + Not conforming to any particular CDB variant, but compatible with all. */ + +struct nvme_trans_io_cdb { + u8 fua; + u8 prot_info; + u64 lba; + u32 xfer_len; +}; + + +/* Internal Helper Functions */ + + +/* Copy data to userspace memory */ + +static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, + unsigned long n) +{ + int i; + void *index = from; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec sgl; + + for (i = 0; i < hdr->iovec_count; i++) { + if (copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec))) + return -EFAULT; + xfer_len = min(remaining, sgl.iov_len); + if (copy_to_user(sgl.iov_base, index, xfer_len)) + return -EFAULT; + + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return 0; + } + + if (copy_to_user(hdr->dxferp, from, n)) + return -EFAULT; + return 0; +} + +/* Copy data from userspace memory */ + +static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, + unsigned long n) +{ + int i; + void *index = to; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec sgl; + + for (i = 0; i < hdr->iovec_count; i++) { + if (copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec))) + return -EFAULT; + xfer_len = min(remaining, sgl.iov_len); + if (copy_from_user(index, sgl.iov_base, xfer_len)) + return -EFAULT; + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return 0; + } + + if (copy_from_user(to, hdr->dxferp, n)) + return -EFAULT; + return 0; +} + +/* Status/Sense Buffer Writeback */ + +static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, + u8 asc, u8 ascq) +{ + u8 xfer_len; + u8 resp[DESC_FMT_SENSE_DATA_SIZE]; + + if (scsi_status_is_good(status)) { + hdr->status = SAM_STAT_GOOD; + hdr->masked_status = GOOD; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + hdr->sb_len_wr = 0; + } else { + hdr->status = status; + hdr->masked_status = status >> 1; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + + memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); + resp[0] = DESC_FORMAT_SENSE_DATA; + resp[1] = sense_key; + resp[2] = asc; + resp[3] = ascq; + + xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); + hdr->sb_len_wr = xfer_len; + if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) + return -EFAULT; + } + + return 0; +} + +/* + * Take a status code from a lowlevel routine, and if it was a positive NVMe + * error code update the sense data based on it. In either case the passed + * in value is returned again, unless an -EFAULT from copy_to_user overrides + * it. + */ +static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) +{ + u8 status, sense_key, asc, ascq; + int res; + + /* For non-nvme (Linux) errors, simply return the error code */ + if (nvme_sc < 0) + return nvme_sc; + + /* Mask DNR, More, and reserved fields */ + switch (nvme_sc & 0x7FF) { + /* Generic Command Status */ + case NVME_SC_SUCCESS: + status = SAM_STAT_GOOD; + sense_key = NO_SENSE; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_OPCODE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_COMMAND; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_FIELD: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_DATA_XFER_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_POWER_LOSS: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_WARNING; + ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case NVME_SC_INTERNAL: + status = SAM_STAT_CHECK_CONDITION; + sense_key = HARDWARE_ERROR; + asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_REQ: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_QUEUE: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_FAIL: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_MISSING: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_NS: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + case NVME_SC_LBA_RANGE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_BLOCK; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_CAP_EXCEEDED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_NS_NOT_READY: + status = SAM_STAT_CHECK_CONDITION; + sense_key = NOT_READY; + asc = SCSI_ASC_LUN_NOT_READY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Command Specific Status */ + case NVME_SC_INVALID_FORMAT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_FORMAT_COMMAND_FAILED; + ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case NVME_SC_BAD_ATTRIBUTES: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Media Errors */ + case NVME_SC_WRITE_FAULT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_READ_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_UNRECOVERED_READ_ERROR; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_GUARD_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; + break; + case NVME_SC_APPTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; + break; + case NVME_SC_REFTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; + break; + case NVME_SC_COMPARE_FAILED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MISCOMPARE; + asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ACCESS_DENIED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + + /* Unspecified/Default */ + case NVME_SC_CMDID_CONFLICT: + case NVME_SC_CMD_SEQ_ERROR: + case NVME_SC_CQ_INVALID: + case NVME_SC_QID_INVALID: + case NVME_SC_QUEUE_SIZE: + case NVME_SC_ABORT_LIMIT: + case NVME_SC_ABORT_MISSING: + case NVME_SC_ASYNC_LIMIT: + case NVME_SC_FIRMWARE_SLOT: + case NVME_SC_FIRMWARE_IMAGE: + case NVME_SC_INVALID_VECTOR: + case NVME_SC_INVALID_LOG_PAGE: + default: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + + res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); + return res ? res : nvme_sc; +} + +/* INQUIRY Helper Functions */ + +static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ns *id_ns; + int res; + int nvme_sc; + int xfer_len; + u8 resp_data_format = 0x02; + u8 protect; + u8 cmdque = 0x01 << 1; + u8 fw_offset = sizeof(ctrl->firmware_rev); + + /* nvme ns identify - use DPS value for PROTECT field */ + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + if (id_ns->dps) + protect = 0x01; + else + protect = 0; + kfree(id_ns); + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[2] = VERSION_SPC_4; + inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ + inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; + inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ + inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ + strncpy(&inq_response[8], "NVMe ", 8); + strncpy(&inq_response[16], ctrl->model, 16); + + while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + fw_offset--; + fw_offset -= 4; + strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); +} + +static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ + inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ + inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; + inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; + inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; + inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; + inq_response[9] = INQ_BDEV_LIMITS_PAGE; + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); +} + +static int nvme_trans_unit_serial_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ + inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ + strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); +} + +static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) +{ + struct nvme_id_ns *id_ns; + int nvme_sc, res; + size_t len; + void *eui; + + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + eui = id_ns->eui64; + len = sizeof(id_ns->eui64); + + if (ns->ctrl->vs >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + + if (bitmap_empty(eui, len * 8)) { + res = -EOPNOTSUPP; + goto out_free_id; + } + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 4 + len; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); +out_free_id: + kfree(id_ns); + return res; +} + +static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ctrl *id_ctrl; + int nvme_sc, res; + + if (alloc_len < 72) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 0x48; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); + memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); + sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); + memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); + kfree(id_ctrl); + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int alloc_len) +{ + int res; + + if (ns->ctrl->vs >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); + if (res != -EOPNOTSUPP) + return res; + } + + return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); +} + +static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res; + int nvme_sc; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ctrl *id_ctrl; + struct nvme_id_ns *id_ns; + int xfer_len; + u8 microcode = 0x80; + u8 spt; + u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; + u8 grd_chk, app_chk, ref_chk, protect; + u8 uask_sup = 0x20; + u8 v_sup; + u8 luiclr = 0x01; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) + return -ENOMEM; + + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free_inq; + + spt = spt_lut[id_ns->dpc & 0x07] << 3; + if (id_ns->dps) + protect = 0x01; + else + protect = 0; + kfree(id_ns); + + grd_chk = protect << 2; + app_chk = protect << 1; + ref_chk = protect; + + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free_inq; + + v_sup = id_ctrl->vwc; + kfree(id_ctrl); + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; + inq_response[5] = uask_sup; + inq_response[6] = v_sup; + inq_response[7] = luiclr; + inq_response[8] = 0; + inq_response[9] = 0; + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free_inq: + kfree(inq_response); + return res; +} + +static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) +{ + __be32 max_sectors = cpu_to_be32( + nvme_block_nr(ns, queue_max_hw_sectors(ns->queue))); + __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors); + __be32 discard_desc_count = cpu_to_be32(0x100); + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = VPD_BLOCK_LIMITS; + inq_response[3] = 0x3c; /* Page Length */ + memcpy(&inq_response[8], &max_sectors, sizeof(u32)); + memcpy(&inq_response[20], &max_discard, sizeof(u32)); + + if (max_discard) + memcpy(&inq_response[24], &discard_desc_count, sizeof(u32)); + + return nvme_trans_copy_to_user(hdr, inq_response, 0x3c); +} + +static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res; + int xfer_len; + + inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ + inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ + inq_response[6] = 0x00; /* Form Factor */ + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + kfree(inq_response); + out_mem: + return res; +} + +/* LOG SENSE Helper Functions */ + +static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res; + int xfer_len; + u8 *log_response; + + log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; + log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; + + xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, + struct sg_io_hdr *hdr, int alloc_len) +{ + int res; + int xfer_len; + u8 *log_response; + struct nvme_smart_log *smart_log; + u8 temp_c; + u16 temp_k; + + log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) + return -ENOMEM; + + res = nvme_get_log_page(ns->ctrl, &smart_log); + if (res < 0) + goto out_free_response; + + if (res != NVME_SC_SUCCESS) { + temp_c = LOG_TEMP_UNKNOWN; + } else { + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c = temp_k - KELVIN_TEMP_FACTOR; + } + kfree(smart_log); + + log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; + /* Informational Exceptions Log Parameter 1 Start */ + /* Parameter Code=0x0000 bytes 4,5 */ + log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ + log_response[7] = 0x04; /* PARAMETER LENGTH */ + /* Add sense Code and qualifier = 0x00 each */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[10] = temp_c; + + xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + out_free_response: + kfree(log_response); + return res; +} + +static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res; + int xfer_len; + u8 *log_response; + struct nvme_smart_log *smart_log; + u32 feature_resp; + u8 temp_c_cur, temp_c_thresh; + u16 temp_k; + + log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) + return -ENOMEM; + + res = nvme_get_log_page(ns->ctrl, &smart_log); + if (res < 0) + goto out_free_response; + + if (res != NVME_SC_SUCCESS) { + temp_c_cur = LOG_TEMP_UNKNOWN; + } else { + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; + } + kfree(smart_log); + + /* Get Features for Temp Threshold */ + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, + &feature_resp); + if (res != NVME_SC_SUCCESS) + temp_c_thresh = LOG_TEMP_UNKNOWN; + else + temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; + + log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_TEMP_PAGE_LENGTH; + /* Temperature Log Parameter 1 (Temperature) Start */ + /* Parameter Code = 0x0000 */ + log_response[6] = 0x01; /* Format and Linking = 01b */ + log_response[7] = 0x02; /* Parameter Length */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[9] = temp_c_cur; + /* Temperature Log Parameter 2 (Reference Temperature) Start */ + log_response[11] = 0x01; /* Parameter Code = 0x0001 */ + log_response[12] = 0x01; /* Format and Linking = 01b */ + log_response[13] = 0x02; /* Parameter Length */ + /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ + log_response[15] = temp_c_thresh; + + xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + out_free_response: + kfree(log_response); + return res; +} + +/* MODE SENSE Helper Functions */ + +static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, + u16 mode_data_length, u16 blk_desc_len) +{ + /* Quick check to make sure I don't stomp on my own memory... */ + if ((cdb10 && len < 8) || (!cdb10 && len < 4)) + return -EINVAL; + + if (cdb10) { + resp[0] = (mode_data_length & 0xFF00) >> 8; + resp[1] = (mode_data_length & 0x00FF); + resp[3] = 0x10 /* DPOFUA */; + resp[4] = llbaa; + resp[5] = RESERVED_FIELD; + resp[6] = (blk_desc_len & 0xFF00) >> 8; + resp[7] = (blk_desc_len & 0x00FF); + } else { + resp[0] = (mode_data_length & 0x00FF); + resp[2] = 0x10 /* DPOFUA */; + resp[3] = (blk_desc_len & 0x00FF); + } + + return 0; +} + +static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len, u8 llbaa) +{ + int res; + int nvme_sc; + struct nvme_id_ns *id_ns; + u8 flbas; + u32 lba_length; + + if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) + return -EINVAL; + else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) + return -EINVAL; + + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + + if (llbaa == 0) { + __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); + /* Byte 4 is reserved */ + __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); + + memcpy(resp, &tmp_cap, sizeof(u32)); + memcpy(&resp[4], &tmp_len, sizeof(u32)); + } else { + __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); + __be32 tmp_len = cpu_to_be32(lba_length); + + memcpy(resp, &tmp_cap, sizeof(u64)); + /* Bytes 8, 9, 10, 11 are reserved */ + memcpy(&resp[12], &tmp_len, sizeof(u32)); + } + + kfree(id_ns); + return res; +} + +static int nvme_trans_fill_control_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_CONTROL_LEN) + return -EINVAL; + + resp[0] = MODE_PAGE_CONTROL; + resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; + resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, + * D_SENSE=1, GLTSD=1, RLEC=0 */ + resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ + /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ + resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ + /* resp[6] and [7] are obsolete, thus zero */ + resp[8] = 0xFF; /* Busy timeout period = 0xffff */ + resp[9] = 0xFF; + /* Bytes 10,11: Extended selftest completion time = 0x0000 */ + + return 0; +} + +static int nvme_trans_fill_caching_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = 0; + int nvme_sc; + u32 feature_resp; + u8 vwc; + + if (len < MODE_PAGE_CACHING_LEN) + return -EINVAL; + + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, + &feature_resp); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + vwc = feature_resp & 0x00000001; + + resp[0] = MODE_PAGE_CACHING; + resp[1] = MODE_PAGE_CACHING_LEN_FIELD; + resp[2] = vwc << 2; + return 0; +} + +static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_POW_CND_LEN) + return -EINVAL; + + resp[0] = MODE_PAGE_POWER_CONDITION; + resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; + /* All other bytes are zero */ + + return 0; +} + +static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_INF_EXC_LEN) + return -EINVAL; + + resp[0] = MODE_PAGE_INFO_EXCEP; + resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; + resp[2] = 0x88; + /* All other bytes are zero */ + + return 0; +} + +static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res; + u16 mode_pages_offset_1 = 0; + u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; + + mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; + mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; + mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; + + res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], + MODE_PAGE_CACHING_LEN); + if (res) + return res; + res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], + MODE_PAGE_CONTROL_LEN); + if (res) + return res; + res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], + MODE_PAGE_POW_CND_LEN); + if (res) + return res; + return nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], + MODE_PAGE_INF_EXC_LEN); +} + +static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) +{ + if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { + /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ + return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; + } else { + return 0; + } +} + +static int nvme_trans_mode_page_create(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd, + u16 alloc_len, u8 cdb10, + int (*mode_page_fill_func) + (struct nvme_ns *, + struct sg_io_hdr *hdr, u8 *, int), + u16 mode_pages_tot_len) +{ + int res; + int xfer_len; + u8 *response; + u8 dbd, llbaa; + u16 resp_size; + int mph_size; + u16 mode_pages_offset_1; + u16 blk_desc_len, blk_desc_offset, mode_data_length; + + dbd = (cmd[1] & MODE_SENSE_DBD_MASK) >> MODE_SENSE_DBD_SHIFT; + llbaa = (cmd[1] & MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT; + mph_size = cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE; + + blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); + + resp_size = mph_size + blk_desc_len + mode_pages_tot_len; + /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ + mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; + + blk_desc_offset = mph_size; + mode_pages_offset_1 = blk_desc_offset + blk_desc_len; + + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, + llbaa, mode_data_length, blk_desc_len); + if (res) + goto out_free; + if (blk_desc_len > 0) { + res = nvme_trans_fill_blk_desc(ns, hdr, + &response[blk_desc_offset], + blk_desc_len, llbaa); + if (res) + goto out_free; + } + res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], + mode_pages_tot_len); + if (res) + goto out_free; + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + out_free: + kfree(response); + out_mem: + return res; +} + +/* Read Capacity Helper Functions */ + +static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, + u8 cdb16) +{ + u8 flbas; + u32 lba_length; + u64 rlba; + u8 prot_en; + u8 p_type_lut[4] = {0, 0, 1, 2}; + __be64 tmp_rlba; + __be32 tmp_rlba_32; + __be32 tmp_len; + + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + rlba = le64_to_cpup(&id_ns->nsze) - 1; + (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); + + if (!cdb16) { + if (rlba > 0xFFFFFFFF) + rlba = 0xFFFFFFFF; + tmp_rlba_32 = cpu_to_be32(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba_32, sizeof(u32)); + memcpy(&response[4], &tmp_len, sizeof(u32)); + } else { + tmp_rlba = cpu_to_be64(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba, sizeof(u64)); + memcpy(&response[8], &tmp_len, sizeof(u32)); + response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; + /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ + /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ + /* Bytes 16-31 - Reserved */ + } +} + +/* Start Stop Unit Helper Functions */ + +static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 pc, u8 pcmod, u8 start) +{ + int res; + int nvme_sc; + struct nvme_id_ctrl *id_ctrl; + int lowest_pow_st; /* max npss = lowest power consumption */ + unsigned ps_desired = 0; + + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); + kfree(id_ctrl); + + switch (pc) { + case NVME_POWER_STATE_START_VALID: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0 && start == 0x1) + ps_desired = POWER_STATE_0; + if (pcmod == 0 && start == 0x0) + ps_desired = lowest_pow_st; + break; + case NVME_POWER_STATE_ACTIVE: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0) + ps_desired = POWER_STATE_0; + break; + case NVME_POWER_STATE_IDLE: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ + if (pcmod == 0x0) + ps_desired = POWER_STATE_1; + else if (pcmod == 0x1) + ps_desired = POWER_STATE_2; + else if (pcmod == 0x2) + ps_desired = POWER_STATE_3; + break; + case NVME_POWER_STATE_STANDBY: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ + if (pcmod == 0x0) + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); + else if (pcmod == 0x1) + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_LU_CONTROL: + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, + NULL); + return nvme_trans_status_code(hdr, nvme_sc); +} + +static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 buffer_id) +{ + struct nvme_command c; + int nvme_sc; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_activate_fw; + c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV); + + nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); + return nvme_trans_status_code(hdr, nvme_sc); +} + +static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 opcode, u32 tot_len, u32 offset, + u8 buffer_id) +{ + int nvme_sc; + struct nvme_command c; + + if (hdr->iovec_count > 0) { + /* Assuming SGL is not allowed for this command */ + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_download_fw; + c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); + c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); + + nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, + hdr->dxferp, tot_len, NULL, 0); + return nvme_trans_status_code(hdr, nvme_sc); +} + +/* Mode Select Helper Functions */ + +static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, + u16 *bd_len, u8 *llbaa) +{ + if (cdb10) { + /* 10 Byte CDB */ + *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + + parm_list[MODE_SELECT_10_BD_OFFSET + 1]; + *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & + MODE_SELECT_10_LLBAA_MASK; + } else { + /* 6 Byte CDB */ + *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; + } +} + +static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, + u16 idx, u16 bd_len, u8 llbaa) +{ + u16 bd_num; + + bd_num = bd_len / ((llbaa == 0) ? + SHORT_DESC_BLOCK : LONG_DESC_BLOCK); + /* Store block descriptor info if a FORMAT UNIT comes later */ + /* TODO Saving 1st BD info; what to do if multiple BD received? */ + if (llbaa == 0) { + /* Standard Block Descriptor - spc4r34 7.5.5.1 */ + ns->mode_select_num_blocks = + (parm_list[idx + 1] << 16) + + (parm_list[idx + 2] << 8) + + (parm_list[idx + 3]); + + ns->mode_select_block_len = + (parm_list[idx + 5] << 16) + + (parm_list[idx + 6] << 8) + + (parm_list[idx + 7]); + } else { + /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ + ns->mode_select_num_blocks = + (((u64)parm_list[idx + 0]) << 56) + + (((u64)parm_list[idx + 1]) << 48) + + (((u64)parm_list[idx + 2]) << 40) + + (((u64)parm_list[idx + 3]) << 32) + + (((u64)parm_list[idx + 4]) << 24) + + (((u64)parm_list[idx + 5]) << 16) + + (((u64)parm_list[idx + 6]) << 8) + + ((u64)parm_list[idx + 7]); + + ns->mode_select_block_len = + (parm_list[idx + 12] << 24) + + (parm_list[idx + 13] << 16) + + (parm_list[idx + 14] << 8) + + (parm_list[idx + 15]); + } +} + +static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *mode_page, u8 page_code) +{ + int res = 0; + int nvme_sc; + unsigned dword11; + + switch (page_code) { + case MODE_PAGE_CACHING: + dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, + dword11, 0, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + break; + case MODE_PAGE_CONTROL: + break; + case MODE_PAGE_POWER_CONDITION: + /* Verify the OS is not trying to set timers */ + if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + return res; +} + +static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd, u16 parm_list_len, u8 pf, + u8 sp, u8 cdb10) +{ + int res; + u8 *parm_list; + u16 bd_len; + u8 llbaa = 0; + u16 index, saved_index; + u8 page_code; + u16 mp_size; + + /* Get parm list from data-in/out buffer */ + parm_list = kmalloc(parm_list_len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + + res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); + if (res) + goto out_mem; + + nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); + index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); + + if (bd_len != 0) { + /* Block Descriptors present, parse */ + nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); + index += bd_len; + } + saved_index = index; + + /* Multiple mode pages may be present; iterate through all */ + /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + if ((page_code != MODE_PAGE_CACHING) && + (page_code != MODE_PAGE_CONTROL) && + (page_code != MODE_PAGE_POWER_CONDITION)) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + index += mp_size; + } while (index < parm_list_len); + + /* In 2nd Iteration, do the NVME Commands */ + index = saved_index; + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], + page_code); + if (res) + break; + index += mp_size; + } while (index < parm_list_len); + + out_mem: + kfree(parm_list); + out: + return res; +} + +/* Format Unit Helper Functions */ + +static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, + struct sg_io_hdr *hdr) +{ + int res = 0; + int nvme_sc; + u8 flbas; + + /* + * SCSI Expects a MODE SELECT would have been issued prior to + * a FORMAT UNIT, and the block size and number would be used + * from the block descriptor in it. If a MODE SELECT had not + * been issued, FORMAT shall use the current values for both. + */ + + if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { + struct nvme_id_ns *id_ns; + + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + if (ns->mode_select_num_blocks == 0) + ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); + if (ns->mode_select_block_len == 0) { + flbas = (id_ns->flbas) & 0x0F; + ns->mode_select_block_len = + (1 << (id_ns->lbaf[flbas].ds)); + } + + kfree(id_ns); + } + + return 0; +} + +static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, + u8 format_prot_info, u8 *nvme_pf_code) +{ + int res; + u8 *parm_list; + u8 pf_usage, pf_code; + + parm_list = kmalloc(len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + res = nvme_trans_copy_from_user(hdr, parm_list, len); + if (res) + goto out_mem; + + if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & + FORMAT_UNIT_IMMED_MASK) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + + if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && + (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & + FORMAT_UNIT_PROT_FIELD_USAGE_MASK; + pf_code = (pf_usage << 2) | format_prot_info; + switch (pf_code) { + case 0: + *nvme_pf_code = 0; + break; + case 2: + *nvme_pf_code = 1; + break; + case 3: + *nvme_pf_code = 2; + break; + case 7: + *nvme_pf_code = 3; + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out_mem: + kfree(parm_list); + out: + return res; +} + +static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 prot_info) +{ + int res; + int nvme_sc; + struct nvme_id_ns *id_ns; + u8 i; + u8 flbas, nlbaf; + u8 selected_lbaf = 0xFF; + u32 cdw10 = 0; + struct nvme_command c; + + /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + flbas = (id_ns->flbas) & 0x0F; + nlbaf = id_ns->nlbaf; + + for (i = 0; i < nlbaf; i++) { + if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { + selected_lbaf = i; + break; + } + } + if (selected_lbaf > 0x0F) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + cdw10 |= prot_info << 5; + cdw10 |= selected_lbaf & 0x0F; + memset(&c, 0, sizeof(c)); + c.format.opcode = nvme_admin_format_nvm; + c.format.nsid = cpu_to_le32(ns->ns_id); + c.format.cdw10 = cpu_to_le32(cdw10); + + nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); + res = nvme_trans_status_code(hdr, nvme_sc); + + kfree(id_ns); + return res; +} + +static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, + struct nvme_trans_io_cdb *cdb_info, + u32 max_blocks) +{ + /* If using iovecs, send one nvme command per vector */ + if (hdr->iovec_count > 0) + return hdr->iovec_count; + else if (cdb_info->xfer_len > max_blocks) + return ((cdb_info->xfer_len - 1) / max_blocks) + 1; + else + return 1; +} + +static u16 nvme_trans_io_get_control(struct nvme_ns *ns, + struct nvme_trans_io_cdb *cdb_info) +{ + u16 control = 0; + + /* When Protection information support is added, implement here */ + + if (cdb_info->fua > 0) + control |= NVME_RW_FUA; + + return control; +} + +static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, + struct nvme_trans_io_cdb *cdb_info, u8 is_write) +{ + int nvme_sc = NVME_SC_SUCCESS; + u32 num_cmds; + u64 unit_len; + u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ + u32 retcode; + u32 i = 0; + u64 nvme_offset = 0; + void __user *next_mapping_addr; + struct nvme_command c; + u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); + u16 control; + u32 max_blocks = queue_max_hw_sectors(ns->queue); + + num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); + + /* + * This loop handles two cases. + * First, when an SGL is used in the form of an iovec list: + * - Use iov_base as the next mapping address for the nvme command_id + * - Use iov_len as the data transfer length for the command. + * Second, when we have a single buffer + * - If larger than max_blocks, split into chunks, offset + * each nvme command accordingly. + */ + for (i = 0; i < num_cmds; i++) { + memset(&c, 0, sizeof(c)); + if (hdr->iovec_count > 0) { + struct sg_iovec sgl; + + retcode = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (retcode) + return -EFAULT; + unit_len = sgl.iov_len; + unit_num_blocks = unit_len >> ns->lba_shift; + next_mapping_addr = sgl.iov_base; + } else { + unit_num_blocks = min((u64)max_blocks, + (cdb_info->xfer_len - nvme_offset)); + unit_len = unit_num_blocks << ns->lba_shift; + next_mapping_addr = hdr->dxferp + + ((1 << ns->lba_shift) * nvme_offset); + } + + c.rw.opcode = opcode; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); + c.rw.length = cpu_to_le16(unit_num_blocks - 1); + control = nvme_trans_io_get_control(ns, cdb_info); + c.rw.control = cpu_to_le16(control); + + if (get_capacity(ns->disk) - unit_num_blocks < + cdb_info->lba + nvme_offset) { + nvme_sc = NVME_SC_LBA_RANGE; + break; + } + nvme_sc = nvme_submit_user_cmd(ns->queue, &c, + next_mapping_addr, unit_len, NULL, 0); + if (nvme_sc) + break; + + nvme_offset += unit_num_blocks; + } + + return nvme_trans_status_code(hdr, nvme_sc); +} + + +/* SCSI Command Translation Functions */ + +static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, + u8 *cmd) +{ + int res = 0; + struct nvme_trans_io_cdb cdb_info = { 0, }; + u8 opcode = cmd[0]; + u64 xfer_bytes; + u64 sum_iov_len = 0; + struct sg_iovec sgl; + int i; + size_t not_copied; + + /* + * The FUA and WPROTECT fields are not supported in 6-byte CDBs, + * but always in the same place for all others. + */ + switch (opcode) { + case WRITE_6: + case READ_6: + break; + default: + cdb_info.fua = cmd[1] & 0x8; + cdb_info.prot_info = (cmd[1] & 0xe0) >> 5; + if (cdb_info.prot_info && !ns->pi_type) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } + + switch (opcode) { + case WRITE_6: + case READ_6: + cdb_info.lba = get_unaligned_be24(&cmd[1]); + cdb_info.xfer_len = cmd[4]; + if (cdb_info.xfer_len == 0) + cdb_info.xfer_len = 256; + break; + case WRITE_10: + case READ_10: + cdb_info.lba = get_unaligned_be32(&cmd[2]); + cdb_info.xfer_len = get_unaligned_be16(&cmd[7]); + break; + case WRITE_12: + case READ_12: + cdb_info.lba = get_unaligned_be32(&cmd[2]); + cdb_info.xfer_len = get_unaligned_be32(&cmd[6]); + break; + case WRITE_16: + case READ_16: + cdb_info.lba = get_unaligned_be64(&cmd[2]); + cdb_info.xfer_len = get_unaligned_be32(&cmd[10]); + break; + default: + /* Will never really reach here */ + res = -EIO; + goto out; + } + + /* Calculate total length of transfer (in bytes) */ + if (hdr->iovec_count > 0) { + for (i = 0; i < hdr->iovec_count; i++) { + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + sum_iov_len += sgl.iov_len; + /* IO vector sizes should be multiples of block size */ + if (sgl.iov_len % (1 << ns->lba_shift) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + } + } else { + sum_iov_len = hdr->dxfer_len; + } + + /* As Per sg ioctl howto, if the lengths differ, use the lower one */ + xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); + + /* If block count and actual data buffer size dont match, error out */ + if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { + res = -EINVAL; + goto out; + } + + /* Check for 0 length transfer - it is not illegal */ + if (cdb_info.xfer_len == 0) + goto out; + + /* Send NVMe IO Command(s) */ + res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); + if (res) + goto out; + + out: + return res; +} + +static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = 0; + u8 evpd; + u8 page_code; + int alloc_len; + u8 *inq_response; + + evpd = cmd[1] & 0x01; + page_code = cmd[2]; + alloc_len = get_unaligned_be16(&cmd[3]); + + inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH), + GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + if (evpd == 0) { + if (page_code == INQ_STANDARD_INQUIRY_PAGE) { + res = nvme_trans_standard_inquiry_page(ns, hdr, + inq_response, alloc_len); + } else { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } else { + switch (page_code) { + case VPD_SUPPORTED_PAGES: + res = nvme_trans_supported_vpd_pages(ns, hdr, + inq_response, alloc_len); + break; + case VPD_SERIAL_NUMBER: + res = nvme_trans_unit_serial_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_DEVICE_IDENTIFIERS: + res = nvme_trans_device_id_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_EXTENDED_INQUIRY: + res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); + break; + case VPD_BLOCK_LIMITS: + res = nvme_trans_bdev_limits_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_BLOCK_DEV_CHARACTERISTICS: + res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + } + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res; + u16 alloc_len; + u8 pc; + u8 page_code; + + if (cmd[1] != LOG_SENSE_CDB_SP_NOT_ENABLED) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + page_code = cmd[2] & LOG_SENSE_CDB_PAGE_CODE_MASK; + pc = (cmd[2] & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; + if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + alloc_len = get_unaligned_be16(&cmd[7]); + switch (page_code) { + case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: + res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); + break; + case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: + res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); + break; + case LOG_PAGE_TEMPERATURE_PAGE: + res = nvme_trans_log_temperature(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + u8 cdb10 = 0; + u16 parm_list_len; + u8 page_format; + u8 save_pages; + + page_format = cmd[1] & MODE_SELECT_CDB_PAGE_FORMAT_MASK; + save_pages = cmd[1] & MODE_SELECT_CDB_SAVE_PAGES_MASK; + + if (cmd[0] == MODE_SELECT) { + parm_list_len = cmd[4]; + } else { + parm_list_len = cmd[7]; + cdb10 = 1; + } + + if (parm_list_len != 0) { + /* + * According to SPC-4 r24, a paramter list length field of 0 + * shall not be considered an error + */ + return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, + page_format, save_pages, cdb10); + } + + return 0; +} + +static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = 0; + u16 alloc_len; + u8 cdb10 = 0; + + if (cmd[0] == MODE_SENSE) { + alloc_len = cmd[4]; + } else { + alloc_len = get_unaligned_be16(&cmd[7]); + cdb10 = 1; + } + + if ((cmd[2] & MODE_SENSE_PAGE_CONTROL_MASK) != + MODE_SENSE_PC_CURRENT_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + switch (cmd[2] & MODE_SENSE_PAGE_CODE_MASK) { + case MODE_PAGE_CACHING: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_caching_page, + MODE_PAGE_CACHING_LEN); + break; + case MODE_PAGE_CONTROL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_control_page, + MODE_PAGE_CONTROL_LEN); + break; + case MODE_PAGE_POWER_CONDITION: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_pow_cnd_page, + MODE_PAGE_POW_CND_LEN); + break; + case MODE_PAGE_INFO_EXCEP: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_inf_exc_page, + MODE_PAGE_INF_EXC_LEN); + break; + case MODE_PAGE_RETURN_ALL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_all_pages, + MODE_PAGE_ALL_LEN); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd, u8 cdb16) +{ + int res; + int nvme_sc; + u32 alloc_len; + u32 resp_size; + u32 xfer_len; + struct nvme_id_ns *id_ns; + u8 *response; + + if (cdb16) { + alloc_len = get_unaligned_be32(&cmd[10]); + resp_size = READ_CAP_16_RESP_SIZE; + } else { + alloc_len = READ_CAP_10_RESP_SIZE; + resp_size = READ_CAP_10_RESP_SIZE; + } + + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_free_id; + } + nvme_trans_fill_read_cap(response, id_ns, cdb16); + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_free_id: + kfree(id_ns); + return res; +} + +static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res; + int nvme_sc; + u32 alloc_len, xfer_len, resp_size; + u8 *response; + struct nvme_id_ctrl *id_ctrl; + u32 ll_length, lun_id; + u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; + __be32 tmp_len; + + switch (cmd[2]) { + default: + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + case ALL_LUNS_RETURNED: + case ALL_WELL_KNOWN_LUNS_RETURNED: + case RESTRICTED_LUNS_RETURNED: + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; + resp_size = ll_length + LUN_DATA_HEADER_SIZE; + + alloc_len = get_unaligned_be32(&cmd[6]); + if (alloc_len < resp_size) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_free_id; + } + + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_free_id; + } + + /* The first LUN ID will always be 0 per the SAM spec */ + for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { + /* + * Set the LUN Id and then increment to the next LUN + * location in the parameter data. + */ + __be64 tmp_id = cpu_to_be64(lun_id); + memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); + lun_id_offset += LUN_ENTRY_SIZE; + } + tmp_len = cpu_to_be32(ll_length); + memcpy(response, &tmp_len, sizeof(u32)); + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_free_id: + kfree(id_ctrl); + return res; +} + +static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res; + u8 alloc_len, xfer_len, resp_size; + u8 desc_format; + u8 *response; + + desc_format = cmd[1] & 0x01; + alloc_len = cmd[4]; + + resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : + (FIXED_FMT_SENSE_DATA_SIZE)); + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out; + } + + if (desc_format) { + /* Descriptor Format Sense Data */ + response[0] = DESC_FORMAT_SENSE_DATA; + response[1] = NO_SENSE; + /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ + response[2] = SCSI_ASC_NO_SENSE; + response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ + } else { + /* Fixed Format Sense Data */ + response[0] = FIXED_SENSE_DATA; + /* Byte 1 = Obsolete */ + response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ + /* Bytes 3-6 - Information - set to zero */ + response[7] = FIXED_SENSE_DATA_ADD_LENGTH; + /* Bytes 8-11 - Cmd Specific Information - set to zero */ + response[12] = SCSI_ASC_NO_SENSE; + response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* Byte 14 = Field Replaceable Unit Code = 0 */ + /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out: + return res; +} + +static int nvme_trans_security_protocol(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} + +static int nvme_trans_synchronize_cache(struct nvme_ns *ns, + struct sg_io_hdr *hdr) +{ + int nvme_sc; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_cmd_flush; + c.common.nsid = cpu_to_le32(ns->ns_id); + + nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); + return nvme_trans_status_code(hdr, nvme_sc); +} + +static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + u8 immed, pcmod, pc, no_flush, start; + + immed = cmd[1] & 0x01; + pcmod = cmd[3] & 0x0f; + pc = (cmd[4] & 0xf0) >> 4; + no_flush = cmd[4] & 0x04; + start = cmd[4] & 0x01; + + if (immed != 0) { + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } else { + if (no_flush == 0) { + /* Issue NVME FLUSH command prior to START STOP UNIT */ + int res = nvme_trans_synchronize_cache(ns, hdr); + if (res) + return res; + } + /* Setup the expected power state transition */ + return nvme_trans_power_state(ns, hdr, pc, pcmod, start); + } +} + +static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res; + u8 parm_hdr_len = 0; + u8 nvme_pf_code = 0; + u8 format_prot_info, long_list, format_data; + + format_prot_info = (cmd[1] & 0xc0) >> 6; + long_list = cmd[1] & 0x20; + format_data = cmd[1] & 0x10; + + if (format_data != 0) { + if (format_prot_info != 0) { + if (long_list == 0) + parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; + else + parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; + } + } else if (format_data == 0 && format_prot_info != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + /* Get parm header from data-in/out buffer */ + /* + * According to the translation spec, the only fields in the parameter + * list we are concerned with are in the header. So allocate only that. + */ + if (parm_hdr_len > 0) { + res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, + format_prot_info, &nvme_pf_code); + if (res) + goto out; + } + + /* Attempt to activate any previously downloaded firmware image */ + res = nvme_trans_send_activate_fw_cmd(ns, hdr, 0); + + /* Determine Block size and count and send format command */ + res = nvme_trans_fmt_set_blk_size_count(ns, hdr); + if (res) + goto out; + + res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); + + out: + return res; +} + +static int nvme_trans_test_unit_ready(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + if (nvme_ctrl_ready(ns->ctrl)) + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + NOT_READY, SCSI_ASC_LUN_NOT_READY, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + else + return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); +} + +static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = 0; + u32 buffer_offset, parm_list_length; + u8 buffer_id, mode; + + parm_list_length = get_unaligned_be24(&cmd[6]); + if (parm_list_length % BYTES_TO_DWORDS != 0) { + /* NVMe expects Firmware file to be a whole number of DWORDS */ + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + buffer_id = cmd[2]; + if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + mode = cmd[1] & 0x1f; + buffer_offset = get_unaligned_be24(&cmd[3]); + + switch (mode) { + case DOWNLOAD_SAVE_ACTIVATE: + res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + if (res) + goto out; + res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); + break; + case DOWNLOAD_SAVE_DEFER_ACTIVATE: + res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case ACTIVATE_DEFERRED_MICROCODE: + res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +struct scsi_unmap_blk_desc { + __be64 slba; + __be32 nlb; + u32 resv; +}; + +struct scsi_unmap_parm_list { + __be16 unmap_data_len; + __be16 unmap_blk_desc_data_len; + u32 resv; + struct scsi_unmap_blk_desc desc[0]; +}; + +static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + struct scsi_unmap_parm_list *plist; + struct nvme_dsm_range *range; + struct nvme_command c; + int i, nvme_sc, res; + u16 ndesc, list_len; + + list_len = get_unaligned_be16(&cmd[7]); + if (!list_len) + return -EINVAL; + + plist = kmalloc(list_len, GFP_KERNEL); + if (!plist) + return -ENOMEM; + + res = nvme_trans_copy_from_user(hdr, plist, list_len); + if (res) + goto out; + + ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; + if (!ndesc || ndesc > 256) { + res = -EINVAL; + goto out; + } + + range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL); + if (!range) { + res = -ENOMEM; + goto out; + } + + for (i = 0; i < ndesc; i++) { + range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); + range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); + range[i].cattr = 0; + } + + memset(&c, 0, sizeof(c)); + c.dsm.opcode = nvme_cmd_dsm; + c.dsm.nsid = cpu_to_le32(ns->ns_id); + c.dsm.nr = cpu_to_le32(ndesc - 1); + c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range, + ndesc * sizeof(*range)); + res = nvme_trans_status_code(hdr, nvme_sc); + + kfree(range); + out: + kfree(plist); + return res; +} + +static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) +{ + u8 cmd[BLK_MAX_CDB]; + int retcode; + unsigned int opcode; + + if (hdr->cmdp == NULL) + return -EMSGSIZE; + if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) + return -EFAULT; + + /* + * Prime the hdr with good status for scsi commands that don't require + * an nvme command for translation. + */ + retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + if (retcode) + return retcode; + + opcode = cmd[0]; + + switch (opcode) { + case READ_6: + case READ_10: + case READ_12: + case READ_16: + retcode = nvme_trans_io(ns, hdr, 0, cmd); + break; + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + retcode = nvme_trans_io(ns, hdr, 1, cmd); + break; + case INQUIRY: + retcode = nvme_trans_inquiry(ns, hdr, cmd); + break; + case LOG_SENSE: + retcode = nvme_trans_log_sense(ns, hdr, cmd); + break; + case MODE_SELECT: + case MODE_SELECT_10: + retcode = nvme_trans_mode_select(ns, hdr, cmd); + break; + case MODE_SENSE: + case MODE_SENSE_10: + retcode = nvme_trans_mode_sense(ns, hdr, cmd); + break; + case READ_CAPACITY: + retcode = nvme_trans_read_capacity(ns, hdr, cmd, 0); + break; + case SERVICE_ACTION_IN_16: + switch (cmd[1]) { + case SAI_READ_CAPACITY_16: + retcode = nvme_trans_read_capacity(ns, hdr, cmd, 1); + break; + default: + goto out; + } + break; + case REPORT_LUNS: + retcode = nvme_trans_report_luns(ns, hdr, cmd); + break; + case REQUEST_SENSE: + retcode = nvme_trans_request_sense(ns, hdr, cmd); + break; + case SECURITY_PROTOCOL_IN: + case SECURITY_PROTOCOL_OUT: + retcode = nvme_trans_security_protocol(ns, hdr, cmd); + break; + case START_STOP: + retcode = nvme_trans_start_stop(ns, hdr, cmd); + break; + case SYNCHRONIZE_CACHE: + retcode = nvme_trans_synchronize_cache(ns, hdr); + break; + case FORMAT_UNIT: + retcode = nvme_trans_format_unit(ns, hdr, cmd); + break; + case TEST_UNIT_READY: + retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); + break; + case WRITE_BUFFER: + retcode = nvme_trans_write_buffer(ns, hdr, cmd); + break; + case UNMAP: + retcode = nvme_trans_unmap(ns, hdr, cmd); + break; + default: + out: + retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + return retcode; +} + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) +{ + struct sg_io_hdr hdr; + int retcode; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) + return -EFAULT; + if (hdr.interface_id != 'S') + return -EINVAL; + if (hdr.cmd_len > BLK_MAX_CDB) + return -EINVAL; + + /* + * A positive return code means a NVMe status, which has been + * translated to sense data. + */ + retcode = nvme_scsi_translate(ns, &hdr); + if (retcode < 0) + return retcode; + if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) + return -EFAULT; + return 0; +} + +int nvme_sg_get_version_num(int __user *ip) +{ + return put_user(sg_version_num, ip); +} diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig new file mode 100644 index 0000000..3a5b9d0 --- /dev/null +++ b/drivers/nvme/target/Kconfig @@ -0,0 +1,36 @@ + +config NVME_TARGET + tristate "NVMe Target support" + depends on BLOCK + depends on CONFIGFS_FS + help + This enabled target side support for the NVMe protocol, that is + it allows the Linux kernel to implement NVMe subsystems and + controllers and export Linux block devices as NVMe namespaces. + You need to select at least one of the transports below to make this + functionality useful. + + To configure the NVMe target you probably want to use the nvmetcli + tool from http://git.infradead.org/users/hch/nvmetcli.git. + +config NVME_TARGET_LOOP + tristate "NVMe loopback device support" + depends on NVME_TARGET + select NVME_CORE + select NVME_FABRICS + select SG_POOL + help + This enables the NVMe loopback device support, which can be useful + to test NVMe host and target side features. + + If unsure, say N. + +config NVME_TARGET_RDMA + tristate "NVMe over Fabrics RDMA target support" + depends on INFINIBAND + depends on NVME_TARGET + help + This enables the NVMe RDMA target support, which allows exporting NVMe + devices over RDMA. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile new file mode 100644 index 0000000..b7a0623 --- /dev/null +++ b/drivers/nvme/target/Makefile @@ -0,0 +1,9 @@ + +obj-$(CONFIG_NVME_TARGET) += nvmet.o +obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o +obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o + +nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \ + discovery.o +nvme-loop-y += loop.o +nvmet-rdma-y += rdma.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c new file mode 100644 index 0000000..47c564b --- /dev/null +++ b/drivers/nvme/target/admin-cmd.c @@ -0,0 +1,461 @@ +/* + * NVMe admin command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +u32 nvmet_get_log_page_len(struct nvme_command *cmd) +{ + u32 len = le16_to_cpu(cmd->get_log_page.numdu); + + len <<= 16; + len += le16_to_cpu(cmd->get_log_page.numdl); + /* NUMD is a 0's based value */ + len += 1; + len *= sizeof(u32); + + return len; +} + +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + size_t data_len = nvmet_get_log_page_len(req->cmd); + void *buf; + u16 status = 0; + + buf = kzalloc(data_len, GFP_KERNEL); + if (!buf) { + status = NVME_SC_INTERNAL; + goto out; + } + + switch (req->cmd->get_log_page.lid) { + case 0x01: + /* + * We currently never set the More bit in the status field, + * so all error log entries are invalid and can be zeroed out. + * This is called a minum viable implementation (TM) of this + * mandatory log page. + */ + break; + case 0x02: + /* + * XXX: fill out actual smart log + * + * We might have a hard time coming up with useful values for + * many of the fields, and even when we have useful data + * available (e.g. units or commands read/written) those aren't + * persistent over power loss. + */ + break; + case 0x03: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + break; + default: + BUG(); + } + + status = nvmet_copy_to_sgl(req, 0, buf, data_len); + + kfree(buf); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* XXX: figure out how to assign real vendors IDs. */ + id->vid = 0; + id->ssvid = 0; + + memset(id->sn, ' ', sizeof(id->sn)); + snprintf(id->sn, sizeof(id->sn), "%llx", ctrl->serial); + + memset(id->mn, ' ', sizeof(id->mn)); + strncpy((char *)id->mn, "Linux", sizeof(id->mn)); + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + id->rab = 6; + + /* + * XXX: figure out how we can assign a IEEE OUI, but until then + * the safest is to leave it as zeroes. + */ + + /* we support multiple ports and multiples hosts: */ + id->mic = (1 << 0) | (1 << 1); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* XXX: figure out what to do about RTD3R/RTD3 */ + id->oaes = cpu_to_le32(1 << 8); + id->ctratt = cpu_to_le32(1 << 0); + + id->oacs = 0; + + /* + * We don't really have a practical limit on the number of abort + * comands. But we don't do anything useful for abort either, so + * no point in allowing more abort commands than the spec requires. + */ + id->acl = 3; + + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* first slot is read-only, only one slot supported */ + id->frmw = (1 << 0) | (1 << 1); + id->lpa = (1 << 0) | (1 << 2); + id->elpe = NVMET_ERROR_LOG_SLOTS - 1; + id->npss = 0; + + /* We support keep-alive timeout in granularity of seconds */ + id->kas = cpu_to_le16(NVMET_KAS); + + id->sqes = (0x6 << 4) | 0x6; + id->cqes = (0x4 << 4) | 0x4; + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; + + /* + * We can't support atomic writes bigger than a LBA without support + * from the backend device. + */ + id->awun = 0; + id->awupf = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + /* Max command capsule size is sqe + single page of in-capsule data */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + ctrl->ops->sqe_inline_size) / 16); + /* Max response capsule size is cqe */ + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* + * Meh, we don't really support any power state. Fake up the same + * values that qemu does. + */ + id->psd[0].max_power = cpu_to_le16(0x9c4); + id->psd[0].entry_lat = cpu_to_le32(0x10); + id->psd[0].exit_lat = cpu_to_le32(0x4); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ns(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + struct nvme_id_ns *id; + u16 status = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out_put_ns; + } + + /* + * nuse = ncap = nsze isn't aways true, but we have no way to find + * that out from the underlying device. + */ + id->ncap = id->nuse = id->nsze = + cpu_to_le64(ns->size >> ns->blksize_shift); + + /* + * We just provide a single LBA format that matches what the + * underlying device reports. + */ + id->nlbaf = 0; + id->flbas = 0; + + /* + * Our namespace might always be shared. Not just with other + * controllers, but also with any other user of the block device. + */ + id->nmic = (1 << 0); + + memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); + + id->lbaf[0].ds = ns->blksize_shift; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_nslist(struct nvmet_req *req) +{ + static const int buf_size = 4096; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); + __le32 *list; + u16 status = 0; + int i = 0; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid <= min_nsid) + continue; + list[i++] = cpu_to_le32(ns->nsid); + if (i == buf_size / sizeof(__le32)) + break; + } + rcu_read_unlock(); + + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + + kfree(list); +out: + nvmet_req_complete(req, status); +} + +/* + * A "mimimum viable" abort implementation: the command is mandatory in the + * spec, but we are not required to do any useful work. We couldn't really + * do a useful abort, so don't bother even with waiting for the command + * to be exectuted and return immediately telling the command to abort + * wasn't found. + */ +static void nvmet_execute_abort(struct nvmet_req *req) +{ + nvmet_set_result(req, 1); + nvmet_req_complete(req, 0); +} + +static void nvmet_execute_set_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u64 val; + u32 val32; + u16 status = 0; + + switch (cdw10 & 0xf) { + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); + break; + case NVME_FEAT_KATO: + val = le64_to_cpu(req->cmd->prop_set.value); + val32 = val & 0xffff; + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_set_result(req, req->sq->ctrl->kato); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 status = 0; + + switch (cdw10 & 0xf) { + /* + * These features are mandatory in the spec, but we don't + * have a useful way to implement them. We'll eventually + * need to come up with some fake values for these. + */ +#if 0 + case NVME_FEAT_ARBITRATION: + break; + case NVME_FEAT_POWER_MGMT: + break; + case NVME_FEAT_TEMP_THRESH: + break; + case NVME_FEAT_ERR_RECOVERY: + break; + case NVME_FEAT_IRQ_COALESCE: + break; + case NVME_FEAT_IRQ_CONFIG: + break; + case NVME_FEAT_WRITE_ATOMIC: + break; + case NVME_FEAT_ASYNC_EVENT: + break; +#endif + case NVME_FEAT_VOLATILE_WC: + nvmet_set_result(req, 1); + break; + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); + break; + case NVME_FEAT_KATO: + nvmet_set_result(req, req->sq->ctrl->kato * 1000); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_async_event(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + return; + } + ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +static void nvmet_execute_keep_alive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + pr_debug("ctrl %d update keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvmet_req_complete(req, 0); +} + +int nvmet_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got admin cmd %d while CC.EN == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case 0x01: + case 0x02: + case 0x03: + req->execute = nvmet_execute_get_log_page; + return 0; + } + break; + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x00: + req->execute = nvmet_execute_identify_ns; + return 0; + case 0x01: + req->execute = nvmet_execute_identify_ctrl; + return 0; + case 0x02: + req->execute = nvmet_execute_identify_nslist; + return 0; + } + break; + case nvme_admin_abort_cmd: + req->execute = nvmet_execute_abort; + req->data_len = 0; + return 0; + case nvme_admin_set_features: + req->execute = nvmet_execute_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c new file mode 100644 index 0000000..af5e2dc --- /dev/null +++ b/drivers/nvme/target/configfs.c @@ -0,0 +1,917 @@ +/* + * Configfs interface for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include "nvmet.h" + +static struct config_item_type nvmet_host_type; +static struct config_item_type nvmet_subsys_type; + +/* + * nvmet_port Generic ConfigFS definitions. + * Used in any place in the ConfigFS tree that refers to an address. + */ +static ssize_t nvmet_addr_adrfam_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + return sprintf(page, "ipv4\n"); + case NVMF_ADDR_FAMILY_IP6: + return sprintf(page, "ipv6\n"); + case NVMF_ADDR_FAMILY_IB: + return sprintf(page, "ib\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_adrfam_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "ipv4")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4; + } else if (sysfs_streq(page, "ipv6")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6; + } else if (sysfs_streq(page, "ib")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB; + } else { + pr_err("Invalid value '%s' for adrfam\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_adrfam); + +static ssize_t nvmet_addr_portid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + le16_to_cpu(port->disc_addr.portid)); +} + +static ssize_t nvmet_addr_portid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 portid = 0; + + if (kstrtou16(page, 0, &portid)) { + pr_err("Invalid value '%s' for portid\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + port->disc_addr.portid = cpu_to_le16(portid); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_portid); + +static ssize_t nvmet_addr_traddr_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.traddr); +} + +static ssize_t nvmet_addr_traddr_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRADDR_SIZE) { + pr_err("Invalid value '%s' for traddr\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.traddr, + sizeof(port->disc_addr.traddr), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_traddr); + +static ssize_t nvmet_addr_treq_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.treq) { + case NVMF_TREQ_NOT_SPECIFIED: + return sprintf(page, "not specified\n"); + case NVMF_TREQ_REQUIRED: + return sprintf(page, "required\n"); + case NVMF_TREQ_NOT_REQUIRED: + return sprintf(page, "not required\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_treq_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "not specified")) { + port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + } else if (sysfs_streq(page, "required")) { + port->disc_addr.treq = NVMF_TREQ_REQUIRED; + } else if (sysfs_streq(page, "not required")) { + port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + } else { + pr_err("Invalid value '%s' for treq\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_treq); + +static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.trsvcid); +} + +static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRSVCID_SIZE) { + pr_err("Invalid value '%s' for trsvcid\n", page); + return -EINVAL; + } + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.trsvcid, + sizeof(port->disc_addr.trsvcid), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_trsvcid); + +static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.trtype) { + case NVMF_TRTYPE_RDMA: + return sprintf(page, "rdma\n"); + case NVMF_TRTYPE_LOOP: + return sprintf(page, "loop\n"); + default: + return sprintf(page, "\n"); + } +} + +static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_RDMA; + memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE); + port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED; + port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED; + port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; +} + +static void nvmet_port_init_tsas_loop(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_LOOP; + memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE); +} + +static ssize_t nvmet_addr_trtype_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "rdma")) { + nvmet_port_init_tsas_rdma(port); + } else if (sysfs_streq(page, "loop")) { + nvmet_port_init_tsas_loop(port); + } else { + pr_err("Invalid value '%s' for trtype\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trtype); + +/* + * Namespace structures & file operation functions below + */ +static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path); +} + +static ssize_t nvmet_ns_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret; + + mutex_lock(&subsys->lock); + ret = -EBUSY; + if (nvmet_ns_enabled(ns)) + goto out_unlock; + + kfree(ns->device_path); + + ret = -ENOMEM; + ns->device_path = kstrdup(page, GFP_KERNEL); + if (!ns->device_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + return count; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, device_path); + +static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); +} + +static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + u8 nguid[16]; + const char *p = page; + int i; + int ret = 0; + + mutex_lock(&subsys->lock); + if (nvmet_ns_enabled(ns)) { + ret = -EBUSY; + goto out_unlock; + } + + for (i = 0; i < 16; i++) { + if (p + 2 > page + count) { + ret = -EINVAL; + goto out_unlock; + } + if (!isxdigit(p[0]) || !isxdigit(p[1])) { + ret = -EINVAL; + goto out_unlock; + } + + nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]); + p += 2; + + if (*p == '-' || *p == ':') + p++; + } + + memcpy(&ns->nguid, nguid, sizeof(nguid)); +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_nguid); + +static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item))); +} + +static ssize_t nvmet_ns_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_ns_enable(ns); + else + nvmet_ns_disable(ns); + + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, enable); + +static struct configfs_attribute *nvmet_ns_attrs[] = { + &nvmet_ns_attr_device_path, + &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_enable, + NULL, +}; + +static void nvmet_ns_release(struct config_item *item) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + nvmet_ns_free(ns); +} + +static struct configfs_item_operations nvmet_ns_item_ops = { + .release = nvmet_ns_release, +}; + +static struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = nvmet_ns_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item); + struct nvmet_ns *ns; + int ret; + u32 nsid; + + ret = kstrtou32(name, 0, &nsid); + if (ret) + goto out; + + ret = -EINVAL; + if (nsid == 0 || nsid == 0xffffffff) + goto out; + + ret = -ENOMEM; + ns = nvmet_ns_alloc(subsys, nsid); + if (!ns) + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + + return &ns->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_namespaces_group_ops = { + .make_group = nvmet_ns_make, +}; + +static struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_port_subsys_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys; + struct nvmet_subsys_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_subsys_type) { + pr_err("can only link subsystems into the subsystems dir.!\n"); + return -EINVAL; + } + subsys = to_subsys(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->subsys = subsys; + + down_write(&nvmet_config_sem); + ret = -EEXIST; + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto out_free_link; + } + + if (list_empty(&port->subsystems)) { + ret = nvmet_enable_port(port); + if (ret) + goto out_free_link; + } + + list_add_tail(&link->entry, &port->subsystems); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; + +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); + struct nvmet_subsys_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + if (list_empty(&port->subsystems)) + nvmet_disable_port(port); + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_port_subsys_item_ops = { + .allow_link = nvmet_port_subsys_allow_link, + .drop_link = nvmet_port_subsys_drop_link, +}; + +static struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_allowed_hosts_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host; + struct nvmet_host_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_host_type) { + pr_err("can only link hosts into the allowed_hosts directory!\n"); + return -EINVAL; + } + + host = to_host(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->host = host; + + down_write(&nvmet_config_sem); + ret = -EINVAL; + if (subsys->allow_any_host) { + pr_err("can't add hosts when allow_any_host is set!\n"); + goto out_free_link; + } + + ret = -EEXIST; + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto out_free_link; + } + list_add_tail(&link->entry, &subsys->hosts); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); + struct nvmet_host_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { + .allow_link = nvmet_allowed_hosts_allow_link, + .drop_link = nvmet_allowed_hosts_drop_link, +}; + +static struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->allow_any_host); +} + +static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool allow_any_host; + int ret = 0; + + if (strtobool(page, &allow_any_host)) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (allow_any_host && !list_empty(&subsys->hosts)) { + pr_err("Can't set allow_any_host when explicit hosts are set!\n"); + ret = -EINVAL; + goto out_unlock; + } + + subsys->allow_any_host = allow_any_host; +out_unlock: + up_write(&nvmet_config_sem); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); + +static struct configfs_attribute *nvmet_subsys_attrs[] = { + &nvmet_subsys_attr_attr_allow_any_host, + NULL, +}; + +/* + * Subsystem structures & folder operation functions below + */ +static void nvmet_subsys_release(struct config_item *item) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + nvmet_subsys_put(subsys); +} + +static struct configfs_item_operations nvmet_subsys_item_ops = { + .release = nvmet_subsys_release, +}; + +static struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_subsys_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys; + + if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) { + pr_err("can't create discovery subsystem through configfs\n"); + return ERR_PTR(-EINVAL); + } + + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); + if (!subsys) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); + + config_group_init_type_name(&subsys->namespaces_group, + "namespaces", &nvmet_namespaces_type); + configfs_add_default_group(&subsys->namespaces_group, &subsys->group); + + config_group_init_type_name(&subsys->allowed_hosts_group, + "allowed_hosts", &nvmet_allowed_hosts_type); + configfs_add_default_group(&subsys->allowed_hosts_group, + &subsys->group); + + return &subsys->group; +} + +static struct configfs_group_operations nvmet_subsystems_group_ops = { + .make_group = nvmet_subsys_make, +}; + +static struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_referral_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled); +} + +static ssize_t nvmet_referral_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (strtobool(page, &enable)) + goto inval; + + if (enable) + nvmet_referral_enable(parent, port); + else + nvmet_referral_disable(port); + + return count; +inval: + pr_err("Invalid value '%s' for enable\n", page); + return -EINVAL; +} + +CONFIGFS_ATTR(nvmet_referral_, enable); + +/* + * Discovery Service subsystem definitions + */ +static struct configfs_attribute *nvmet_referral_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_portid, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_referral_attr_enable, + NULL, +}; + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + nvmet_referral_disable(port); + kfree(port); +} + +static struct configfs_item_operations nvmet_referral_item_ops = { + .release = nvmet_referral_release, +}; + +static struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +}; + +static struct config_group *nvmet_referral_make( + struct config_group *group, const char *name) +{ + struct nvmet_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + config_group_init_type_name(&port->group, name, &nvmet_referral_type); + + return &port->group; +} + +static struct configfs_group_operations nvmet_referral_group_ops = { + .make_group = nvmet_referral_make, +}; + +static struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, +}; + +/* + * Ports definitions. + */ +static void nvmet_port_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + kfree(port); +} + +static struct configfs_attribute *nvmet_port_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + NULL, +}; + +static struct configfs_item_operations nvmet_port_item_ops = { + .release = nvmet_port_release, +}; + +static struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ports_make(struct config_group *group, + const char *name) +{ + struct nvmet_port *port; + u16 portid; + + if (kstrtou16(name, 0, &portid)) + return ERR_PTR(-EINVAL); + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + INIT_LIST_HEAD(&port->subsystems); + INIT_LIST_HEAD(&port->referrals); + + port->disc_addr.portid = cpu_to_le16(portid); + config_group_init_type_name(&port->group, name, &nvmet_port_type); + + config_group_init_type_name(&port->subsys_group, + "subsystems", &nvmet_port_subsys_type); + configfs_add_default_group(&port->subsys_group, &port->group); + + config_group_init_type_name(&port->referrals_group, + "referrals", &nvmet_referrals_type); + configfs_add_default_group(&port->referrals_group, &port->group); + + return &port->group; +} + +static struct configfs_group_operations nvmet_ports_group_ops = { + .make_group = nvmet_ports_make, +}; + +static struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_subsystems_group; +static struct config_group nvmet_ports_group; + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); + + kfree(host); +} + +static struct configfs_item_operations nvmet_host_item_ops = { + .release = nvmet_host_release, +}; + +static struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_hosts_make_group(struct config_group *group, + const char *name) +{ + struct nvmet_host *host; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&host->group, name, &nvmet_host_type); + + return &host->group; +} + +static struct configfs_group_operations nvmet_hosts_group_ops = { + .make_group = nvmet_hosts_make_group, +}; + +static struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_hosts_group; + +static struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nvmet_configfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "nvmet", + .ci_type = &nvmet_root_type, + }, + }, +}; + +int __init nvmet_init_configfs(void) +{ + int ret; + + config_group_init(&nvmet_configfs_subsystem.su_group); + mutex_init(&nvmet_configfs_subsystem.su_mutex); + + config_group_init_type_name(&nvmet_subsystems_group, + "subsystems", &nvmet_subsystems_type); + configfs_add_default_group(&nvmet_subsystems_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_ports_group, + "ports", &nvmet_ports_type); + configfs_add_default_group(&nvmet_ports_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_hosts_group, + "hosts", &nvmet_hosts_type); + configfs_add_default_group(&nvmet_hosts_group, + &nvmet_configfs_subsystem.su_group); + + ret = configfs_register_subsystem(&nvmet_configfs_subsystem); + if (ret) { + pr_err("configfs_register_subsystem: %d\n", ret); + return ret; + } + + return 0; +} + +void __exit nvmet_exit_configfs(void) +{ + configfs_unregister_subsystem(&nvmet_configfs_subsystem); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c new file mode 100644 index 0000000..6559d5a --- /dev/null +++ b/drivers/nvme/target/core.c @@ -0,0 +1,968 @@ +/* + * Common code for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; + +/* + * This read/write semaphore is used to synchronize access to configuration + * information on a target system that will result in discovery log page + * information change for at least one host. + * The full list of resources to protected by this semaphore is: + * + * - subsystems list + * - per-subsystem allowed hosts list + * - allow_any_host subsystem attribute + * - nvmet_genctr + * - the nvmet_transports array + * + * When updating any of those lists/structures write lock should be obtained, + * while when reading (popolating discovery log page or checking host-subsystem + * link) read lock is obtained to allow concurrent reads. + */ +DECLARE_RWSEM(nvmet_config_sem); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) +{ + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) +{ + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +static u32 nvmet_async_event_result(struct nvmet_async_event *aen) +{ + return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); +} + +static void nvmet_async_events_free(struct nvmet_ctrl *ctrl) +{ + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + if (!ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + } +} + +static void nvmet_async_event_work(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, async_event_work); + struct nvmet_async_event *aen; + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + aen = list_first_entry_or_null(&ctrl->async_events, + struct nvmet_async_event, entry); + if (!aen || !ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + nvmet_set_result(req, nvmet_async_event_result(aen)); + + list_del(&aen->entry); + kfree(aen); + + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, 0); + } +} + +static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page) +{ + struct nvmet_async_event *aen; + + aen = kmalloc(sizeof(*aen), GFP_KERNEL); + if (!aen) + return; + + aen->event_type = event_type; + aen->event_info = event_info; + aen->log_page = log_page; + + mutex_lock(&ctrl->lock); + list_add_tail(&aen->entry, &ctrl->async_events); + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops) +{ + int ret = 0; + + down_write(&nvmet_config_sem); + if (nvmet_transports[ops->type]) + ret = -EINVAL; + else + nvmet_transports[ops->type] = ops; + up_write(&nvmet_config_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_register_transport); + +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops) +{ + down_write(&nvmet_config_sem); + nvmet_transports[ops->type] = NULL; + up_write(&nvmet_config_sem); +} +EXPORT_SYMBOL_GPL(nvmet_unregister_transport); + +int nvmet_enable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + int ret; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + up_write(&nvmet_config_sem); + request_module("nvmet-transport-%d", port->disc_addr.trtype); + down_write(&nvmet_config_sem); + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + pr_err("transport type %d not supported\n", + port->disc_addr.trtype); + return -EINVAL; + } + } + + if (!try_module_get(ops->owner)) + return -EINVAL; + + ret = ops->add_port(port); + if (ret) { + module_put(ops->owner); + return ret; + } + + port->enabled = true; + return 0; +} + +void nvmet_disable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + + lockdep_assert_held(&nvmet_config_sem); + + port->enabled = false; + + ops = nvmet_transports[port->disc_addr.trtype]; + ops->remove_port(port); + module_put(ops->owner); +} + +static void nvmet_keep_alive_timer(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvmet_ctrl, ka_work); + + pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", + ctrl->cntlid, ctrl->kato); + + ctrl->ops->delete_ctrl(ctrl); +} + +static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d start keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); + + cancel_delayed_work_sync(&ctrl->ka_work); +} + +static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, + __le32 nsid) +{ + struct nvmet_ns *ns; + + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid == le32_to_cpu(nsid)) + return ns; + } + + return NULL; +} + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + struct nvmet_ns *ns; + + rcu_read_lock(); + ns = __nvmet_find_namespace(ctrl, nsid); + if (ns) + percpu_ref_get(&ns->ref); + rcu_read_unlock(); + + return ns; +} + +static void nvmet_destroy_namespace(struct percpu_ref *ref) +{ + struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref); + + complete(&ns->disable_done); +} + +void nvmet_put_namespace(struct nvmet_ns *ns) +{ + percpu_ref_put(&ns->ref); +} + +int nvmet_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + int ret = 0; + + mutex_lock(&subsys->lock); + if (!list_empty(&ns->dev_link)) + goto out_unlock; + + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, + NULL); + if (IS_ERR(ns->bdev)) { + pr_err("nvmet: failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + ret = PTR_ERR(ns->bdev); + ns->bdev = NULL; + goto out_unlock; + } + + ns->size = i_size_read(ns->bdev->bd_inode); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, + 0, GFP_KERNEL); + if (ret) + goto out_blkdev_put; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = ns->nsid; + + /* + * The namespaces list needs to be sorted to simplify the implementation + * of the Identify Namepace List subcommand. + */ + if (list_empty(&subsys->namespaces)) { + list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); + } else { + struct nvmet_ns *old; + + list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) { + BUG_ON(ns->nsid == old->nsid); + if (ns->nsid < old->nsid) + break; + } + + list_add_tail_rcu(&ns->dev_link, &old->dev_link); + } + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + ret = 0; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +out_blkdev_put: + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + ns->bdev = NULL; + goto out_unlock; +} + +void nvmet_ns_disable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + if (list_empty(&ns->dev_link)) { + mutex_unlock(&subsys->lock); + return; + } + list_del_init(&ns->dev_link); + mutex_unlock(&subsys->lock); + + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + if (ns->bdev) + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + nvmet_ns_disable(ns); + + kfree(ns->device_path); + kfree(ns); +} + +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ns *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + + INIT_LIST_HEAD(&ns->dev_link); + init_completion(&ns->disable_done); + + ns->nsid = nsid; + ns->subsys = subsys; + + return ns; +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (status) + nvmet_set_status(req, status); + + /* XXX: need to fill in something useful for sq_head */ + req->rsp->sq_head = 0; + if (likely(req->sq)) /* may happen during early failure */ + req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->rsp->command_id = req->cmd->common.command_id; + + if (req->ns) + nvmet_put_namespace(req->ns); + req->ops->queue_response(req); +} + +void nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + __nvmet_req_complete(req, status); + percpu_ref_put(&req->sq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_req_complete); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + cq->qid = qid; + cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 qid, u16 size) +{ + sq->qid = qid; + sq->size = size; + + ctrl->sqs[qid] = sq; +} + +void nvmet_sq_destroy(struct nvmet_sq *sq) +{ + /* + * If this is the admin queue, complete all AERs so that our + * queue doesn't have outstanding requests on it. + */ + if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq) + nvmet_async_events_free(sq->ctrl); + percpu_ref_kill(&sq->ref); + wait_for_completion(&sq->free_done); + percpu_ref_exit(&sq->ref); + + if (sq->ctrl) { + nvmet_ctrl_put(sq->ctrl); + sq->ctrl = NULL; /* allows reusing the queue later */ + } +} +EXPORT_SYMBOL_GPL(nvmet_sq_destroy); + +static void nvmet_sq_free(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->free_done); +} + +int nvmet_sq_init(struct nvmet_sq *sq) +{ + int ret; + + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&sq->free_done); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_sq_init); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops) +{ + u8 flags = req->cmd->common.flags; + u16 status; + + req->cq = cq; + req->sq = sq; + req->ops = ops; + req->sg = NULL; + req->sg_cnt = 0; + req->rsp->status = 0; + + /* no support for fused commands yet */ + if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + /* either variant of SGLs is fine, as we don't support metadata */ + if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF && + (flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METASEG)) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (unlikely(!req->sq->ctrl)) + /* will return an error for any Non-connect command: */ + status = nvmet_parse_connect_cmd(req); + else if (likely(req->sq->qid != 0)) + status = nvmet_parse_io_cmd(req); + else if (req->cmd->common.opcode == nvme_fabrics_command) + status = nvmet_parse_fabrics_cmd(req); + else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + status = nvmet_parse_discovery_cmd(req); + else + status = nvmet_parse_admin_cmd(req); + + if (status) + goto fail; + + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + return true; + +fail: + __nvmet_req_complete(req, status); + return false; +} +EXPORT_SYMBOL_GPL(nvmet_req_init); + +static inline bool nvmet_cc_en(u32 cc) +{ + return cc & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> 4) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> 7) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> 11) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> 14) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> 16) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + return (cc >> 20) & 0xf; +} + +static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES || + nvmet_cc_mps(ctrl->cc) != 0 || + nvmet_cc_ams(ctrl->cc) != 0 || + nvmet_cc_css(ctrl->cc) != 0) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + ctrl->csts = NVME_CSTS_RDY; +} + +static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* XXX: tear down queues? */ + ctrl->csts &= ~NVME_CSTS_RDY; + ctrl->cc = 0; +} + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) +{ + u32 old; + + mutex_lock(&ctrl->lock); + old = ctrl->cc; + ctrl->cc = new; + + if (nvmet_cc_en(new) && !nvmet_cc_en(old)) + nvmet_start_ctrl(ctrl); + if (!nvmet_cc_en(new) && nvmet_cc_en(old)) + nvmet_clear_ctrl(ctrl); + if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) { + nvmet_clear_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + if (!nvmet_cc_shn(new) && nvmet_cc_shn(old)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + mutex_unlock(&ctrl->lock); +} + +static void nvmet_init_cap(struct nvmet_ctrl *ctrl) +{ + /* command sets supported: NVMe command set: */ + ctrl->cap = (1ULL << 37); + /* CC.EN timeout in 500msec units: */ + ctrl->cap |= (15ULL << 24); + /* maximum queue entries supported: */ + ctrl->cap |= NVMET_QUEUE_SIZE - 1; +} + +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + u16 status = 0; + + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->cntlid == cntlid) { + if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) { + pr_warn("hostnqn mismatch.\n"); + continue; + } + if (!kref_get_unless_zero(&ctrl->ref)) + continue; + + *ret = ctrl; + goto out; + } + } + + pr_warn("could not find controller %d for subsys %s / host %s\n", + cntlid, subsysnqn, hostnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + +out: + mutex_unlock(&subsys->lock); + nvmet_subsys_put(subsys); + return status; +} + +static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, + const char *hostnqn) +{ + struct nvmet_host_link *p; + + if (subsys->allow_any_host) + return true; + + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), hostnqn)) + return true; + } + + return false; +} + +static bool nvmet_host_discovery_allowed(struct nvmet_req *req, + const char *hostnqn) +{ + struct nvmet_subsys_link *s; + + list_for_each_entry(s, &req->port->subsystems, entry) { + if (__nvmet_host_allowed(s->subsys, hostnqn)) + return true; + } + + return false; +} + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn) +{ + lockdep_assert_held(&nvmet_config_sem); + + if (subsys->type == NVME_NQN_DISC) + return nvmet_host_discovery_allowed(req, hostnqn); + else + return __nvmet_host_allowed(subsys, hostnqn); +} + +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int ret; + u16 status; + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + goto out; + } + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + down_read(&nvmet_config_sem); + if (!nvmet_host_allowed(req, subsys, hostnqn)) { + pr_info("connect by host %s for subsystem %s not allowed\n", + hostnqn, subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn); + up_read(&nvmet_config_sem); + goto out_put_subsystem; + } + up_read(&nvmet_config_sem); + + status = NVME_SC_INTERNAL; + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + goto out_put_subsystem; + mutex_init(&ctrl->lock); + + nvmet_init_cap(ctrl); + + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); + INIT_LIST_HEAD(&ctrl->async_events); + + memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&ctrl->serial, sizeof(ctrl->serial)); + + kref_init(&ctrl->ref); + ctrl->subsys = subsys; + + ctrl->cqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_ctrl; + + ctrl->sqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_sq *), + GFP_KERNEL); + if (!ctrl->sqs) + goto out_free_cqs; + + ret = ida_simple_get(&subsys->cntlid_ida, + NVME_CNTLID_MIN, NVME_CNTLID_MAX, + GFP_KERNEL); + if (ret < 0) { + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + goto out_free_sqs; + } + ctrl->cntlid = ret; + + ctrl->ops = req->ops; + if (ctrl->subsys->type == NVME_NQN_DISC) { + /* Don't accept keep-alive timeout for discovery controllers */ + if (kato) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_free_sqs; + } + + /* + * Discovery controllers use some arbitrary high value in order + * to cleanup stale discovery sessions + * + * From the latest base diff RC: + * "The Keep Alive command is not supported by + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". + */ + ctrl->kato = NVMET_DISC_KATO; + } else { + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + } + nvmet_start_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + *ctrlp = ctrl; + return 0; + +out_free_sqs: + kfree(ctrl->sqs); +out_free_cqs: + kfree(ctrl->cqs); +out_free_ctrl: + kfree(ctrl); +out_put_subsystem: + nvmet_subsys_put(subsys); +out: + return status; +} + +static void nvmet_ctrl_free(struct kref *ref) +{ + struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); + struct nvmet_subsys *subsys = ctrl->subsys; + + nvmet_stop_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + + ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid); + nvmet_subsys_put(subsys); + + kfree(ctrl->sqs); + kfree(ctrl->cqs); + kfree(ctrl); +} + +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvmet_ctrl_free); +} + +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) +{ + ctrl->csts |= NVME_CSTS_CFS; + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + schedule_work(&ctrl->fatal_err_work); +} +EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn) +{ + struct nvmet_subsys_link *p; + + if (!port) + return NULL; + + if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) + return NULL; + return nvmet_disc_subsys; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (!strncmp(p->subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&p->subsys->ref)) + break; + up_read(&nvmet_config_sem); + return p->subsys; + } + } + up_read(&nvmet_config_sem); + return NULL; +} + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type) +{ + struct nvmet_subsys *subsys; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return NULL; + + subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */ + + switch (type) { + case NVME_NQN_NVME: + subsys->max_qid = NVMET_NR_QUEUES; + break; + case NVME_NQN_DISC: + subsys->max_qid = 0; + break; + default: + pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); + kfree(subsys); + return NULL; + } + subsys->type = type; + subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, + GFP_KERNEL); + if (!subsys->subsysnqn) { + kfree(subsys); + return NULL; + } + + kref_init(&subsys->ref); + + mutex_init(&subsys->lock); + INIT_LIST_HEAD(&subsys->namespaces); + INIT_LIST_HEAD(&subsys->ctrls); + + ida_init(&subsys->cntlid_ida); + + INIT_LIST_HEAD(&subsys->hosts); + + return subsys; +} + +static void nvmet_subsys_free(struct kref *ref) +{ + struct nvmet_subsys *subsys = + container_of(ref, struct nvmet_subsys, ref); + + WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + + ida_destroy(&subsys->cntlid_ida); + kfree(subsys->subsysnqn); + kfree(subsys); +} + +void nvmet_subsys_put(struct nvmet_subsys *subsys) +{ + kref_put(&subsys->ref, nvmet_subsys_free); +} + +static int __init nvmet_init(void) +{ + int error; + + error = nvmet_init_discovery(); + if (error) + goto out; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_discovery; + return 0; + +out_exit_discovery: + nvmet_exit_discovery(); +out: + return error; +} + +static void __exit nvmet_exit(void) +{ + nvmet_exit_configfs(); + nvmet_exit_discovery(); + + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); +} + +module_init(nvmet_init); +module_exit(nvmet_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c new file mode 100644 index 0000000..6f65646 --- /dev/null +++ b/drivers/nvme/target/discovery.c @@ -0,0 +1,221 @@ +/* + * Discovery service for the NVMe over Fabrics target. + * Copyright (C) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +struct nvmet_subsys *nvmet_disc_subsys; + +u64 nvmet_genctr; + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (list_empty(&port->entry)) { + list_add_tail(&port->entry, &parent->referrals); + port->enabled = true; + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +void nvmet_referral_disable(struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (!list_empty(&port->entry)) { + port->enabled = false; + list_del_init(&port->entry); + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, + struct nvmet_port *port, char *subsys_nqn, u8 type, u32 numrec) +{ + struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec]; + + e->trtype = port->disc_addr.trtype; + e->adrfam = port->disc_addr.adrfam; + e->treq = port->disc_addr.treq; + e->portid = port->disc_addr.portid; + /* we support only dynamic controllers */ + e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); + e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->nqntype = type; + memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); + memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); + memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); +} + +static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) +{ + const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_disc_rsp_page_hdr *hdr; + size_t data_len = nvmet_get_log_page_len(req->cmd); + size_t alloc_len = max(data_len, sizeof(*hdr)); + int residual_len = data_len - sizeof(*hdr); + struct nvmet_subsys_link *p; + struct nvmet_port *r; + u32 numrec = 0; + u16 status = 0; + + /* + * Make sure we're passing at least a buffer of response header size. + * If host provided data len is less than the header size, only the + * number of bytes requested by host will be sent to host. + */ + hdr = kzalloc(alloc_len, GFP_KERNEL); + if (!hdr) { + status = NVME_SC_INTERNAL; + goto out; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + continue; + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, + NVME_NQN_NVME, numrec); + residual_len -= entry_size; + } + numrec++; + } + + list_for_each_entry(r, &req->port->referrals, entry) { + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + NVME_NQN_DISC, numrec); + residual_len -= entry_size; + } + numrec++; + } + + hdr->genctr = cpu_to_le64(nvmet_genctr); + hdr->numrec = cpu_to_le64(numrec); + hdr->recfmt = cpu_to_le16(0); + + up_read(&nvmet_config_sem); + + status = nvmet_copy_to_sgl(req, 0, hdr, data_len); + kfree(hdr); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + id->lpa = (1 << 2); + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +int nvmet_parse_discovery_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got cmd %d while not ready\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case NVME_LOG_DISC: + req->execute = nvmet_execute_get_disc_log_page; + return 0; + default: + pr_err("nvmet: unsupported get_log_page lid %d\n", + cmd->get_log_page.lid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x01: + req->execute = + nvmet_execute_identify_disc_ctrl; + return 0; + default: + pr_err("nvmet: unsupported identify cns %d\n", + le32_to_cpu(cmd->identify.cns)); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + default: + pr_err("nvmet: unsupported cmd %d\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} + +int __init nvmet_init_discovery(void) +{ + nvmet_disc_subsys = + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); + if (!nvmet_disc_subsys) + return -ENOMEM; + return 0; +} + +void nvmet_exit_discovery(void) +{ + nvmet_subsys_put(nvmet_disc_subsys); +} diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c new file mode 100644 index 0000000..9a97ae6 --- /dev/null +++ b/drivers/nvme/target/fabrics-cmd.c @@ -0,0 +1,240 @@ +/* + * NVMe Fabrics command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include "nvmet.h" + +static void nvmet_execute_prop_set(struct nvmet_req *req) +{ + u16 status = 0; + + if (!(req->cmd->prop_set.attrib & 1)) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); + + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_prop_get(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; + u64 val = 0; + + if (req->cmd->prop_get.attrib & 1) { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_CAP: + val = ctrl->cap; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_VS: + val = ctrl->subsys->ver; + break; + case NVME_REG_CC: + val = ctrl->cc; + break; + case NVME_REG_CSTS: + val = ctrl->csts; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } + + req->rsp->result64 = cpu_to_le64(val); + nvmet_req_complete(req, status); +} + +int nvmet_parse_fabrics_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + switch (cmd->fabrics.fctype) { + case nvme_fabrics_type_property_set: + req->data_len = 0; + req->execute = nvmet_execute_prop_set; + break; + case nvme_fabrics_type_property_get: + req->data_len = 0; + req->execute = nvmet_execute_prop_get; + break; + default: + pr_err("received unknown capsule type 0x%x\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return 0; +} + +static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + u16 qid = le16_to_cpu(c->qid); + u16 sqsize = le16_to_cpu(c->sqsize); + struct nvmet_ctrl *old; + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + + nvmet_cq_setup(ctrl, req->cq, qid, sqsize); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + return 0; +} + +static void nvmet_execute_admin_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { + pr_warn("connect attempt for invalid controller ID %#x\n", + d->cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + goto out; + } + + status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, + le32_to_cpu(c->kato), &ctrl); + if (status) + goto out; + + status = nvmet_install_queue(ctrl, req); + if (status) { + nvmet_ctrl_put(ctrl); + goto out; + } + + pr_info("creating controller %d for NQN %s.\n", + ctrl->cntlid, ctrl->hostnqn); + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); +} + +static void nvmet_execute_io_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 qid = le16_to_cpu(c->qid); + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), + req, &ctrl); + if (status) + goto out; + + if (unlikely(qid > ctrl->subsys->max_qid)) { + pr_warn("invalid queue id (%d)\n", qid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_SQE(qid); + goto out_ctrl_put; + } + + status = nvmet_install_queue(ctrl, req); + if (status) { + /* pass back cntlid that had the issue of installing queue */ + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + goto out_ctrl_put; + } + + pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); + return; + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + goto out; +} + +int nvmet_parse_connect_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (req->cmd->common.opcode != nvme_fabrics_command) { + pr_err("invalid command 0x%x on unconnected queue.\n", + cmd->fabrics.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { + pr_err("invalid capsule type 0x%x on unconnected queue.\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + req->data_len = sizeof(struct nvmf_connect_data); + if (cmd->connect.qid == 0) + req->execute = nvmet_execute_admin_connect; + else + req->execute = nvmet_execute_io_connect; + return 0; +} diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c new file mode 100644 index 0000000..2cd069b --- /dev/null +++ b/drivers/nvme/target/io-cmd.c @@ -0,0 +1,215 @@ +/* + * NVMe I/O command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +static void nvmet_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + nvmet_req_complete(req, + bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + + if (bio != &req->inline_bio) + bio_put(bio); +} + +static inline u32 nvmet_rw_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} + +static void nvmet_inline_bio_init(struct nvmet_req *req) +{ + struct bio *bio = &req->inline_bio; + + bio_init(bio); + bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC; + bio->bi_io_vec = req->inline_bvec; +} + +static void nvmet_execute_rw(struct nvmet_req *req) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + struct bio *bio; + sector_t sector; + blk_qc_t cookie; + int op, op_flags = 0, i; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { + op = REQ_OP_WRITE; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op_flags |= REQ_FUA; + } else { + op = REQ_OP_READ; + } + + sector = le64_to_cpu(req->cmd->rw.slba); + sector <<= (req->ns->blksize_shift - 9); + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, op, op_flags); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + != sg->length) { + struct bio *prev = bio; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op_flags); + + bio_chain(bio, prev); + cookie = submit_bio(prev); + } + + sector += sg->length >> 9; + sg_cnt--; + } + + cookie = submit_bio(bio); + + blk_poll(bdev_get_queue(req->ns->bdev), cookie); +} + +static void nvmet_execute_flush(struct nvmet_req *req) +{ + struct bio *bio; + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + + bio->bi_bdev = req->ns->bdev; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + + submit_bio(bio); +} + +static u16 nvmet_discard_range(struct nvmet_ns *ns, + struct nvme_dsm_range *range, struct bio **bio) +{ + if (__blkdev_issue_discard(ns->bdev, + le64_to_cpu(range->slba) << (ns->blksize_shift - 9), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio)) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static void nvmet_execute_discard(struct nvmet_req *req) +{ + struct nvme_dsm_range range; + struct bio *bio = NULL; + int i; + u16 status; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + status = nvmet_discard_range(req->ns, &range, &bio); + if (status) + break; + } + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + if (status) { + bio->bi_error = -EIO; + bio_endio(bio); + } else { + submit_bio(bio); + } + } else { + nvmet_req_complete(req, status); + } +} + +static void nvmet_execute_dsm(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +int nvmet_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got io cmd %d while CC.EN == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); + if (!req->ns) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + switch (cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_execute_rw; + req->data_len = nvmet_rw_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_execute_flush; + req->data_len = 0; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_execute_dsm; + req->data_len = le32_to_cpu(cmd->dsm.nr) * + sizeof(struct nvme_dsm_range); + return 0; + default: + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c new file mode 100644 index 0000000..395e60d --- /dev/null +++ b/drivers/nvme/target/loop.c @@ -0,0 +1,752 @@ +/* + * NVMe over Fabrics loopback device. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include "nvmet.h" +#include "../host/nvme.h" +#include "../host/fabrics.h" + +#define NVME_LOOP_AQ_DEPTH 256 + +#define NVME_LOOP_MAX_SEGMENTS 256 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_LOOP_NR_AEN_COMMANDS 1 +#define NVME_LOOP_AQ_BLKMQ_DEPTH \ + (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + +struct nvme_loop_iod { + struct nvme_command cmd; + struct nvme_completion rsp; + struct nvmet_req req; + struct nvme_loop_queue *queue; + struct work_struct work; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +struct nvme_loop_ctrl { + spinlock_t lock; + struct nvme_loop_queue *queues; + u32 queue_count; + + struct blk_mq_tag_set admin_tag_set; + + struct list_head list; + u64 cap; + struct blk_mq_tag_set tag_set; + struct nvme_loop_iod async_event_iod; + struct nvme_ctrl ctrl; + + struct nvmet_ctrl *target_ctrl; + struct work_struct delete_work; + struct work_struct reset_work; +}; + +static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_loop_ctrl, ctrl); +} + +struct nvme_loop_queue { + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvme_loop_ctrl *ctrl; +}; + +static struct nvmet_port *nvmet_loop_port; + +static LIST_HEAD(nvme_loop_ctrl_list); +static DEFINE_MUTEX(nvme_loop_ctrl_mutex); + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req); +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl); + +static struct nvmet_fabrics_ops nvme_loop_ops; + +static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static void nvme_loop_complete_rq(struct request *req) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int error = 0; + + nvme_cleanup_cmd(req); + sg_free_table_chained(&iod->sg_table, true); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; + } + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + blk_mq_end_request(req, error); +} + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req) +{ + struct nvme_loop_iod *iod = + container_of(nvme_req, struct nvme_loop_iod, req); + struct nvme_completion *cqe = &iod->rsp; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 && + cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe); + } else { + struct request *req = blk_mq_rq_from_pdu(iod); + + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, cqe, sizeof(*cqe)); + blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1); + } +} + +static void nvme_loop_execute_work(struct work_struct *work) +{ + struct nvme_loop_iod *iod = + container_of(work, struct nvme_loop_iod, work); + + iod->req.execute(&iod->req); +} + +static enum blk_eh_timer_return +nvme_loop_timeout(struct request *rq, bool reserved) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + schedule_work(&iod->queue->ctrl->reset_work); + + /* fail with DNR on admin cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_loop_queue *queue = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int ret; + + ret = nvme_setup_cmd(ns, req, &iod->cmd); + if (ret) + return ret; + + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + iod->req.port = nvmet_loop_port; + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, + &queue->nvme_sq, &nvme_loop_ops)) { + nvme_cleanup_cmd(req); + blk_mq_start_request(req); + nvme_loop_queue_response(&iod->req); + return 0; + } + + if (blk_rq_bytes(req)) { + iod->sg_table.sgl = iod->first_sgl; + ret = sg_alloc_table_chained(&iod->sg_table, + req->nr_phys_segments, iod->sg_table.sgl); + if (ret) + return BLK_MQ_RQ_QUEUE_BUSY; + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + BUG_ON(iod->req.sg_cnt > req->nr_phys_segments); + } + + iod->cmd.common.command_id = req->tag; + blk_mq_start_request(req); + + schedule_work(&iod->work); + return 0; +} + +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); + struct nvme_loop_queue *queue = &ctrl->queues[0]; + struct nvme_loop_iod *iod = &ctrl->async_event_iod; + + memset(&iod->cmd, 0, sizeof(iod->cmd)); + iod->cmd.common.opcode = nvme_admin_async_event; + iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, + &nvme_loop_ops)) { + dev_err(ctrl->ctrl.device, "failed async event work\n"); + return; + } + + schedule_work(&iod->work); +} + +static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, + struct nvme_loop_iod *iod, unsigned int queue_idx) +{ + BUG_ON(queue_idx >= ctrl->queue_count); + + iod->req.cmd = &iod->cmd; + iod->req.rsp = &iod->rsp; + iod->queue = &ctrl->queues[queue_idx]; + INIT_WORK(&iod->work, nvme_loop_execute_work); + return 0; +} + +static int nvme_loop_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1); +} + +static int nvme_loop_init_admin_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0); +} + +static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static struct blk_mq_ops nvme_loop_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, + .timeout = nvme_loop_timeout, +}; + +static struct blk_mq_ops nvme_loop_admin_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_admin_request, + .init_hctx = nvme_loop_init_admin_hctx, + .timeout = nvme_loop_timeout, +}; + +static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); +} + +static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + int error; + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + ctrl->queues[0].ctrl = ctrl; + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + if (error) + return error; + ctrl->queue_count = 1; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_free_sq; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_sq: + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + return error; +} + +static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) +{ + int i; + + nvme_stop_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_loop_destroy_admin_queue(ctrl); +} + +static void nvme_loop_del_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, delete_work); + + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!schedule_work(&ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + int ret; + + ret = __nvme_loop_del_ctrl(ctrl); + if (ret) + return ret; + + flush_work(&ctrl->delete_work); + + return 0; +} + +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { + if (ctrl->ctrl.cntlid == nctrl->cntlid) + __nvme_loop_del_ctrl(ctrl); + } + mutex_unlock(&nvme_loop_ctrl_mutex); +} + +static void nvme_loop_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, reset_work); + bool changed; + int i, ret; + + nvme_loop_shutdown_ctrl(ctrl); + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_disable; + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_free_queues; + + ctrl->queue_count++; + } + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_free_queues; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + + nvme_start_queues(&ctrl->ctrl); + + return; + +out_free_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + nvme_loop_destroy_admin_queue(ctrl); +out_disable: + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!schedule_work(&ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { + .name = "loop", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_loop_reset_ctrl, + .free_ctrl = nvme_loop_free_ctrl, + .submit_async_event = nvme_loop_submit_async_event, + .delete_ctrl = nvme_loop_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, +}; + +static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret, i; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret || !opts->nr_io_queues) + return ret; + + dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", + opts->nr_io_queues); + + for (i = 1; i <= opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_destroy_queues; + + ctrl->queue_count++; + } + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_loop_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tagset; + } + + for (i = 1; i <= opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_cleanup_connect_q; + } + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); +out_destroy_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + return ret; +} + +static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_loop_ctrl *ctrl; + bool changed; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_put_ctrl; + + spin_lock_init(&ctrl->lock); + + ret = -ENOMEM; + + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_free_queues; + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_loop_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0); + + dev_info(ctrl->ctrl.device, + "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); + + kref_get(&ctrl->ctrl.kref); + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_loop_destroy_admin_queue(ctrl); +out_free_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +} + +static int nvme_loop_add_port(struct nvmet_port *port) +{ + /* + * XXX: disalow adding more than one port so + * there is no connection rejections when a + * a subsystem is assigned to a port for which + * loop doesn't have a pointer. + * This scenario would be possible if we allowed + * more than one port to be added and a subsystem + * was assigned to a port other than nvmet_loop_port. + */ + + if (nvmet_loop_port) + return -EPERM; + + nvmet_loop_port = port; + return 0; +} + +static void nvme_loop_remove_port(struct nvmet_port *port) +{ + if (port == nvmet_loop_port) + nvmet_loop_port = NULL; +} + +static struct nvmet_fabrics_ops nvme_loop_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_LOOP, + .add_port = nvme_loop_add_port, + .remove_port = nvme_loop_remove_port, + .queue_response = nvme_loop_queue_response, + .delete_ctrl = nvme_loop_delete_ctrl, +}; + +static struct nvmf_transport_ops nvme_loop_transport = { + .name = "loop", + .create_ctrl = nvme_loop_create_ctrl, +}; + +static int __init nvme_loop_init_module(void) +{ + int ret; + + ret = nvmet_register_transport(&nvme_loop_ops); + if (ret) + return ret; + nvmf_register_transport(&nvme_loop_transport); + return 0; +} + +static void __exit nvme_loop_cleanup_module(void) +{ + struct nvme_loop_ctrl *ctrl, *next; + + nvmf_unregister_transport(&nvme_loop_transport); + nvmet_unregister_transport(&nvme_loop_ops); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) + __nvme_loop_del_ctrl(ctrl); + mutex_unlock(&nvme_loop_ctrl_mutex); + + flush_scheduled_work(); +} + +module_init(nvme_loop_init_module); +module_exit(nvme_loop_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h new file mode 100644 index 0000000..76b6eed --- /dev/null +++ b/drivers/nvme/target/nvmet.h @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVMET_H +#define _NVMET_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NVMET_ASYNC_EVENTS 4 +#define NVMET_ERROR_LOG_SLOTS 128 + +/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM + * The 16 bit shift is to set IATTR bit to 1, which means offending + * offset starts in the data section of connect() + */ +#define IPO_IATTR_CONNECT_DATA(x) \ + (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x)))) +#define IPO_IATTR_CONNECT_SQE(x) \ + (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) + +struct nvmet_ns { + struct list_head dev_link; + struct percpu_ref ref; + struct block_device *bdev; + u32 nsid; + u32 blksize_shift; + loff_t size; + u8 nguid[16]; + + struct nvmet_subsys *subsys; + const char *device_path; + + struct config_group device_group; + struct config_group group; + + struct completion disable_done; +}; + +static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ns, group); +} + +static inline bool nvmet_ns_enabled(struct nvmet_ns *ns) +{ + return !list_empty_careful(&ns->dev_link); +} + +struct nvmet_cq { + u16 qid; + u16 size; +}; + +struct nvmet_sq { + struct nvmet_ctrl *ctrl; + struct percpu_ref ref; + u16 qid; + u16 size; + struct completion free_done; +}; + +/** + * struct nvmet_port - Common structure to keep port + * information for the target. + * @entry: List head for holding a list of these elements. + * @disc_addr: Address information is stored in a format defined + * for a discovery log page entry. + * @group: ConfigFS group for this element's folder. + * @priv: Private data for the transport. + */ +struct nvmet_port { + struct list_head entry; + struct nvmf_disc_rsp_page_entry disc_addr; + struct config_group group; + struct config_group subsys_group; + struct list_head subsystems; + struct config_group referrals_group; + struct list_head referrals; + void *priv; + bool enabled; +}; + +static inline struct nvmet_port *to_nvmet_port(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + group); +} + +struct nvmet_ctrl { + struct nvmet_subsys *subsys; + struct nvmet_cq **cqs; + struct nvmet_sq **sqs; + + struct mutex lock; + u64 cap; + u64 serial; + u32 cc; + u32 csts; + + u16 cntlid; + u32 kato; + + struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; + unsigned int nr_async_event_cmds; + struct list_head async_events; + struct work_struct async_event_work; + + struct list_head subsys_entry; + struct kref ref; + struct delayed_work ka_work; + struct work_struct fatal_err_work; + + struct nvmet_fabrics_ops *ops; + + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; +}; + +struct nvmet_subsys { + enum nvme_subsys_type type; + + struct mutex lock; + struct kref ref; + + struct list_head namespaces; + unsigned int max_nsid; + + struct list_head ctrls; + struct ida cntlid_ida; + + struct list_head hosts; + bool allow_any_host; + + u16 max_qid; + + u64 ver; + char *subsysnqn; + + struct config_group group; + + struct config_group namespaces_group; + struct config_group allowed_hosts_group; +}; + +static inline struct nvmet_subsys *to_subsys(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, group); +} + +static inline struct nvmet_subsys *namespaces_to_subsys( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, + namespaces_group); +} + +struct nvmet_host { + struct config_group group; +}; + +static inline struct nvmet_host *to_host(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_host, group); +} + +static inline char *nvmet_host_name(struct nvmet_host *host) +{ + return config_item_name(&host->group.cg_item); +} + +struct nvmet_host_link { + struct list_head entry; + struct nvmet_host *host; +}; + +struct nvmet_subsys_link { + struct list_head entry; + struct nvmet_subsys *subsys; +}; + +struct nvmet_req; +struct nvmet_fabrics_ops { + struct module *owner; + unsigned int type; + unsigned int sqe_inline_size; + unsigned int msdbd; + bool has_keyed_sgls : 1; + void (*queue_response)(struct nvmet_req *req); + int (*add_port)(struct nvmet_port *port); + void (*remove_port)(struct nvmet_port *port); + void (*delete_ctrl)(struct nvmet_ctrl *ctrl); +}; + +#define NVMET_MAX_INLINE_BIOVEC 8 + +struct nvmet_req { + struct nvme_command *cmd; + struct nvme_completion *rsp; + struct nvmet_sq *sq; + struct nvmet_cq *cq; + struct nvmet_ns *ns; + struct scatterlist *sg; + struct bio inline_bio; + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + int sg_cnt; + size_t data_len; + + struct nvmet_port *port; + + void (*execute)(struct nvmet_req *req); + struct nvmet_fabrics_ops *ops; +}; + +static inline void nvmet_set_status(struct nvmet_req *req, u16 status) +{ + req->rsp->status = cpu_to_le16(status << 1); +} + +static inline void nvmet_set_result(struct nvmet_req *req, u32 result) +{ + req->rsp->result = cpu_to_le32(result); +} + +/* + * NVMe command writes actually are DMA reads for us on the target side. + */ +static inline enum dma_data_direction +nvmet_data_dir(struct nvmet_req *req) +{ + return nvme_is_write(req->cmd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +struct nvmet_async_event { + struct list_head entry; + u8 event_type; + u8 event_info; + u8 log_page; +}; + +int nvmet_parse_connect_cmd(struct nvmet_req *req); +int nvmet_parse_io_cmd(struct nvmet_req *req); +int nvmet_parse_admin_cmd(struct nvmet_req *req); +int nvmet_parse_discovery_cmd(struct nvmet_req *req); +int nvmet_parse_fabrics_cmd(struct nvmet_req *req); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); +void nvmet_req_complete(struct nvmet_req *req, u16 status); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); +void nvmet_sq_destroy(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq); + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret); +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type); +void nvmet_subsys_put(struct nvmet_subsys *subsys); + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid); +void nvmet_put_namespace(struct nvmet_ns *ns); +int nvmet_ns_enable(struct nvmet_ns *ns); +void nvmet_ns_disable(struct nvmet_ns *ns); +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_ns_free(struct nvmet_ns *ns); + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops); +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops); + +int nvmet_enable_port(struct nvmet_port *port); +void nvmet_disable_port(struct nvmet_port *port); + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *port); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); + +u32 nvmet_get_log_page_len(struct nvme_command *cmd); + +#define NVMET_QUEUE_SIZE 1024 +#define NVMET_NR_QUEUES 64 +#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_KAS 10 +#define NVMET_DISC_KATO 120 + +int __init nvmet_init_configfs(void); +void __exit nvmet_exit_configfs(void); + +int __init nvmet_init_discovery(void); +void nvmet_exit_discovery(void); + +extern struct nvmet_subsys *nvmet_disc_subsys; +extern u64 nvmet_genctr; +extern struct rw_semaphore nvmet_config_sem; + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn); + +#endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c new file mode 100644 index 0000000..1cbe6e0 --- /dev/null +++ b/drivers/nvme/target/rdma.c @@ -0,0 +1,1497 @@ +/* + * NVMe over Fabrics RDMA target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "nvmet.h" + +/* + * We allow up to a page of inline data to go with the SQE + */ +#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE + +struct nvmet_rdma_cmd { + struct ib_sge sge[2]; + struct ib_cqe cqe; + struct ib_recv_wr wr; + struct scatterlist inline_sg; + struct page *inline_page; + struct nvme_command *nvme_cmd; + struct nvmet_rdma_queue *queue; +}; + +enum { + NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), + NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), +}; + +struct nvmet_rdma_rsp { + struct ib_sge send_sge; + struct ib_cqe send_cqe; + struct ib_send_wr send_wr; + + struct nvmet_rdma_cmd *cmd; + struct nvmet_rdma_queue *queue; + + struct ib_cqe read_cqe; + struct rdma_rw_ctx rw; + + struct nvmet_req req; + + u8 n_rdma; + u32 flags; + u32 invalidate_rkey; + + struct list_head wait_list; + struct list_head free_list; +}; + +enum nvmet_rdma_queue_state { + NVMET_RDMA_Q_CONNECTING, + NVMET_RDMA_Q_LIVE, + NVMET_RDMA_Q_DISCONNECTING, + NVMET_RDMA_IN_DEVICE_REMOVAL, +}; + +struct nvmet_rdma_queue { + struct rdma_cm_id *cm_id; + struct nvmet_port *port; + struct ib_cq *cq; + atomic_t sq_wr_avail; + struct nvmet_rdma_device *dev; + spinlock_t state_lock; + enum nvmet_rdma_queue_state state; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + struct nvmet_rdma_rsp *rsps; + struct list_head free_rsps; + spinlock_t rsps_lock; + struct nvmet_rdma_cmd *cmds; + + struct work_struct release_work; + struct list_head rsp_wait_list; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; + + int idx; + int host_qid; + int recv_queue_size; + int send_queue_size; + + struct list_head queue_list; +}; + +struct nvmet_rdma_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_srq *srq; + struct nvmet_rdma_cmd *srq_cmds; + size_t srq_size; + struct kref ref; + struct list_head entry; +}; + +static bool nvmet_rdma_use_srq; +module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); +MODULE_PARM_DESC(use_srq, "Use shared receive queue."); + +static DEFINE_IDA(nvmet_rdma_queue_ida); +static LIST_HEAD(nvmet_rdma_queue_list); +static DEFINE_MUTEX(nvmet_rdma_queue_mutex); + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); + +static struct nvmet_fabrics_ops nvmet_rdma_ops; + +/* XXX: really should move to a generic header sooner or later.. */ +static inline u32 get_unaligned_le24(const u8 *p) +{ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; +} + +static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) +{ + return nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) +{ + return !nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !rsp->req.rsp->status && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline struct nvmet_rdma_rsp * +nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_rsp *rsp; + unsigned long flags; + + spin_lock_irqsave(&queue->rsps_lock, flags); + rsp = list_first_entry(&queue->free_rsps, + struct nvmet_rdma_rsp, free_list); + list_del(&rsp->free_list); + spin_unlock_irqrestore(&queue->rsps_lock, flags); + + return rsp; +} + +static inline void +nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + + spin_lock_irqsave(&rsp->queue->rsps_lock, flags); + list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); + spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); +} + +static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) +{ + struct scatterlist *sg; + int count; + + if (!sgl || !nents) + return; + + for_each_sg(sgl, sg, nents, count) + __free_page(sg_page(sg)); + kfree(sgl); +} + +static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, + u32 length) +{ + struct scatterlist *sg; + struct page *page; + unsigned int nent; + int i = 0; + + nent = DIV_ROUND_UP(length, PAGE_SIZE); + sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); + if (!sg) + goto out; + + sg_init_table(sg, nent); + + while (length) { + u32 page_len = min_t(u32, length, PAGE_SIZE); + + page = alloc_page(GFP_KERNEL); + if (!page) + goto out_free_pages; + + sg_set_page(&sg[i], page, page_len, 0); + length -= page_len; + i++; + } + *sgl = sg; + *nents = nent; + return 0; + +out_free_pages: + while (i > 0) { + i--; + __free_page(sg_page(&sg[i])); + } + kfree(sg); +out: + return NVME_SC_INTERNAL; +} + +static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + /* NVMe command / RDMA RECV */ + c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); + if (!c->nvme_cmd) + goto out; + + c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) + goto out_free_cmd; + + c->sge[0].length = sizeof(*c->nvme_cmd); + c->sge[0].lkey = ndev->pd->local_dma_lkey; + + if (!admin) { + c->inline_page = alloc_pages(GFP_KERNEL, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + if (!c->inline_page) + goto out_unmap_cmd; + c->sge[1].addr = ib_dma_map_page(ndev->device, + c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) + goto out_free_inline_page; + c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; + c->sge[1].lkey = ndev->pd->local_dma_lkey; + } + + c->cqe.done = nvmet_rdma_recv_done; + + c->wr.wr_cqe = &c->cqe; + c->wr.sg_list = c->sge; + c->wr.num_sge = admin ? 1 : 2; + + return 0; + +out_free_inline_page: + if (!admin) { + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } +out_unmap_cmd: + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); +out_free_cmd: + kfree(c->nvme_cmd); + +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + if (!admin) { + ib_dma_unmap_page(ndev->device, c->sge[1].addr, + NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + kfree(c->nvme_cmd); +} + +static struct nvmet_rdma_cmd * +nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, + int nr_cmds, bool admin) +{ + struct nvmet_rdma_cmd *cmds; + int ret = -EINVAL, i; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); + if (ret) + goto out_free; + } + + return cmds; + +out_free: + while (--i >= 0) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +out: + return ERR_PTR(ret); +} + +static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) +{ + int i; + + for (i = 0; i < nr_cmds; i++) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +} + +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + /* NVMe CQE / RDMA SEND */ + r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); + if (!r->req.rsp) + goto out; + + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) + goto out_free_rsp; + + r->send_sge.length = sizeof(*r->req.rsp); + r->send_sge.lkey = ndev->pd->local_dma_lkey; + + r->send_cqe.done = nvmet_rdma_send_done; + + r->send_wr.wr_cqe = &r->send_cqe; + r->send_wr.sg_list = &r->send_sge; + r->send_wr.num_sge = 1; + r->send_wr.send_flags = IB_SEND_SIGNALED; + + /* Data In / RDMA READ */ + r->read_cqe.done = nvmet_rdma_read_data_done; + return 0; + +out_free_rsp: + kfree(r->req.rsp); +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + ib_dma_unmap_single(ndev->device, r->send_sge.addr, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + kfree(r->req.rsp); +} + +static int +nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int nr_rsps = queue->recv_queue_size * 2; + int ret = -EINVAL, i; + + queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + GFP_KERNEL); + if (!queue->rsps) + goto out; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + ret = nvmet_rdma_alloc_rsp(ndev, rsp); + if (ret) + goto out_free; + + list_add_tail(&rsp->free_list, &queue->free_rsps); + } + + return 0; + +out_free: + while (--i >= 0) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +out: + return ret; +} + +static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int i, nr_rsps = queue->recv_queue_size * 2; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +} + +static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmd) +{ + struct ib_recv_wr *bad_wr; + + if (ndev->srq) + return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); +} + +static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) +{ + spin_lock(&queue->rsp_wr_wait_lock); + while (!list_empty(&queue->rsp_wr_wait_list)) { + struct nvmet_rdma_rsp *rsp; + bool ret; + + rsp = list_entry(queue->rsp_wr_wait_list.next, + struct nvmet_rdma_rsp, wait_list); + list_del(&rsp->wait_list); + + spin_unlock(&queue->rsp_wr_wait_lock); + ret = nvmet_rdma_execute_command(rsp); + spin_lock(&queue->rsp_wr_wait_lock); + + if (!ret) { + list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); + break; + } + } + spin_unlock(&queue->rsp_wr_wait_lock); +} + + +static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + + if (rsp->n_rdma) { + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + } + + if (rsp->req.sg != &rsp->cmd->inline_sg) + nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); + + if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) + nvmet_rdma_process_wr_wait_list(queue); + + nvmet_rdma_put_rsp(rsp); +} + +static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) +{ + if (queue->nvme_sq.ctrl) { + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + } else { + /* + * we didn't setup the controller yet in case + * of admin connect error, just disconnect and + * cleanup the queue + */ + nvmet_rdma_queue_disconnect(queue); + } +} + +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + + nvmet_rdma_release_rsp(rsp); + + if (unlikely(wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR)) { + pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(rsp->queue); + } +} + +static void nvmet_rdma_queue_response(struct nvmet_req *req) +{ + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct ib_send_wr *first_wr, *bad_wr; + + if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; + rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; + } else { + rsp->send_wr.opcode = IB_WR_SEND; + } + + if (nvmet_rdma_need_data_out(rsp)) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + else + first_wr = &rsp->send_wr; + + nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); + if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, + u64 off) +{ + sg_init_table(&rsp->cmd->inline_sg, 1); + sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); + rsp->req.sg = &rsp->cmd->inline_sg; + rsp->req.sg_cnt = 1; +} + +static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; + u64 off = le64_to_cpu(sgl->addr); + u32 len = le32_to_cpu(sgl->length); + + if (!nvme_is_write(rsp->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + pr_err("invalid inline data offset!\n"); + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + } + + /* no data command? */ + if (!len) + return 0; + + nvmet_rdma_use_inline_sg(rsp, len, off); + rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + return 0; +} + +static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, + struct nvme_keyed_sgl_desc *sgl, bool invalidate) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u64 addr = le64_to_cpu(sgl->addr); + u32 len = get_unaligned_le24(sgl->length); + u32 key = get_unaligned_le32(sgl->key); + int ret; + u16 status; + + /* no data command? */ + if (!len) + return 0; + + status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, + len); + if (status) + return status; + + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, + nvmet_data_dir(&rsp->req)); + if (ret < 0) + return NVME_SC_INTERNAL; + rsp->n_rdma += ret; + + if (invalidate) { + rsp->invalidate_rkey = key; + rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; + } + + return 0; +} + +static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; + + switch (sgl->type >> 4) { + case NVME_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_OFFSET: + return nvmet_rdma_map_sgl_inline(rsp); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + case NVME_KEY_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); + case NVME_SGL_FMT_ADDRESS: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + default: + pr_err("invalid SGL type: %#x\n", sgl->type); + return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + } +} + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + if (unlikely(atomic_sub_return(1 + rsp->n_rdma, + &queue->sq_wr_avail) < 0)) { + pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", + 1 + rsp->n_rdma, queue->idx, + queue->nvme_sq.ctrl->cntlid); + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + return false; + } + + if (nvmet_rdma_need_data_in(rsp)) { + if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, &rsp->read_cqe, NULL)) + nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + } else { + rsp->req.execute(&rsp->req); + } + + return true; +} + +static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *cmd) +{ + u16 status; + + cmd->queue = queue; + cmd->n_rdma = 0; + cmd->req.port = queue->port; + + if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_rdma_ops)) + return; + + status = nvmet_rdma_map_sgl(cmd); + if (status) + goto out_err; + + if (unlikely(!nvmet_rdma_execute_command(cmd))) { + spin_lock(&queue->rsp_wr_wait_lock); + list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); + spin_unlock(&queue->rsp_wr_wait_lock); + } + + return; + +out_err: + nvmet_req_complete(&cmd->req, status); +} + +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_cmd *cmd = + container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_rsp *rsp; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), + wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { + pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); + nvmet_rdma_error_comp(queue); + return; + } + + cmd->queue = queue; + rsp = nvmet_rdma_get_rsp(queue); + rsp->cmd = cmd; + rsp->flags = 0; + rsp->req.cmd = cmd->nvme_cmd; + + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return; + } + + nvmet_rdma_handle_command(queue, rsp); +} + +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +{ + if (!ndev->srq) + return; + + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); + ib_destroy_srq(ndev->srq); +} + +static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + struct ib_srq *srq; + size_t srq_size; + int ret, i; + + srq_size = 4095; /* XXX: tune */ + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 2; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. + */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(ndev->srq_cmds)) { + ret = PTR_ERR(ndev->srq_cmds); + goto out_destroy_srq; + } + + ndev->srq = srq; + ndev->srq_size = srq_size; + + for (i = 0; i < srq_size; i++) + nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + + return 0; + +out_destroy_srq: + ib_destroy_srq(srq); + return ret; +} + +static void nvmet_rdma_free_dev(struct kref *ref) +{ + struct nvmet_rdma_device *ndev = + container_of(ref, struct nvmet_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + nvmet_rdma_destroy_srq(ndev); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static struct nvmet_rdma_device * +nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvmet_rdma_device *ndev; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device->node_guid == cm_id->device->node_guid && + kref_get_unless_zero(&ndev->ref)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->device = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->device); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (nvmet_rdma_use_srq) { + ret = nvmet_rdma_init_srq(ndev); + if (ret) + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + pr_debug("added %s.\n", ndev->device->name); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp_init_attr qp_attr; + struct nvmet_rdma_device *ndev = queue->dev; + int comp_vector, nr_cqe, ret, i; + + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + /* + * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. + */ + nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; + + queue->cq = ib_alloc_cq(ndev->device, queue, + nr_cqe + 1, comp_vector, + IB_POLL_WORKQUEUE); + if (IS_ERR(queue->cq)) { + ret = PTR_ERR(queue->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + nr_cqe + 1, ret); + goto out; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_context = queue; + qp_attr.event_handler = nvmet_rdma_qp_event; + qp_attr.send_cq = queue->cq; + qp_attr.recv_cq = queue->cq; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + /* +1 for drain */ + qp_attr.cap.max_send_wr = queue->send_queue_size + 1; + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, + ndev->device->attrs.max_sge); + + if (ndev->srq) { + qp_attr.srq = ndev->srq; + } else { + /* +1 for drain */ + qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; + qp_attr.cap.max_recv_sge = 2; + } + + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); + if (ret) { + pr_err("failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, + qp_attr.cap.max_send_wr, queue->cm_id); + + if (!ndev->srq) { + for (i = 0; i < queue->recv_queue_size; i++) { + queue->cmds[i].queue = queue; + nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + } + } + +out: + return ret; + +err_destroy_cq: + ib_free_cq(queue->cq); + goto out; +} + +static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) +{ + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->cq); +} + +static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) +{ + pr_info("freeing queue %d\n", queue->idx); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_rdma_destroy_queue_ib(queue); + if (!queue->dev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } + nvmet_rdma_free_rsps(queue); + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + kfree(queue); +} + +static void nvmet_rdma_release_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, release_work); + struct rdma_cm_id *cm_id = queue->cm_id; + struct nvmet_rdma_device *dev = queue->dev; + enum nvmet_rdma_queue_state state = queue->state; + + nvmet_rdma_free_queue(queue); + + if (state != NVMET_RDMA_IN_DEVICE_REMOVAL) + rdma_destroy_id(cm_id); + + kref_put(&dev->ref, nvmet_rdma_free_dev); +} + +static int +nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, + struct nvmet_rdma_queue *queue) +{ + struct nvme_rdma_cm_req *req; + + req = (struct nvme_rdma_cm_req *)conn->private_data; + if (!req || conn->private_data_len == 0) + return NVME_RDMA_CM_INVALID_LEN; + + if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) + return NVME_RDMA_CM_INVALID_RECFMT; + + queue->host_qid = le16_to_cpu(req->qid); + + /* + * req->hsqsize corresponds to our recv queue size plus 1 + * req->hrqsize corresponds to our send queue size + */ + queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; + queue->send_queue_size = le16_to_cpu(req->hrqsize); + + if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + return NVME_RDMA_CM_INVALID_HSQSIZE; + + /* XXX: Should we enforce some kind of max for IO queues? */ + + return 0; +} + +static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, + enum nvme_rdma_cm_status status) +{ + struct nvme_rdma_cm_rej rej; + + rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + rej.sts = cpu_to_le16(status); + + return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); +} + +static struct nvmet_rdma_queue * +nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, + struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_reject; + } + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_queue; + + ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); + if (ret) + goto out_destroy_sq; + + /* + * Schedules the actual release because calling rdma_destroy_id from + * inside a CM callback would trigger a deadlock. (great API design..) + */ + INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); + queue->dev = ndev; + queue->cm_id = cm_id; + + spin_lock_init(&queue->state_lock); + queue->state = NVMET_RDMA_Q_CONNECTING; + INIT_LIST_HEAD(&queue->rsp_wait_list); + INIT_LIST_HEAD(&queue->rsp_wr_wait_list); + spin_lock_init(&queue->rsp_wr_wait_lock); + INIT_LIST_HEAD(&queue->free_rsps); + spin_lock_init(&queue->rsps_lock); + + queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_queue; + } + + ret = nvmet_rdma_alloc_rsps(queue); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_ida_remove; + } + + if (!ndev->srq) { + queue->cmds = nvmet_rdma_alloc_cmds(ndev, + queue->recv_queue_size, + !queue->host_qid); + if (IS_ERR(queue->cmds)) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_responses; + } + } + + ret = nvmet_rdma_create_queue_ib(queue); + if (ret) { + pr_err("%s: creating RDMA queue failed (%d).\n", + __func__, ret); + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_cmds; + } + + return queue; + +out_free_cmds: + if (!ndev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } +out_free_responses: + nvmet_rdma_free_rsps(queue); +out_ida_remove: + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); +out_destroy_sq: + nvmet_sq_destroy(&queue->nvme_sq); +out_free_queue: + kfree(queue); +out_reject: + nvmet_rdma_cm_reject(cm_id, ret); + return NULL; +} + +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_queue *queue = priv; + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(queue->cm_id, event->event); + break; + default: + pr_err("received unrecognized IB QP event %d\n", event->event); + break; + } +} + +static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue, + struct rdma_conn_param *p) +{ + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_rep priv = { }; + int ret = -ENOMEM; + + param.rnr_retry_count = 7; + param.flow_control = 1; + param.initiator_depth = min_t(u8, p->initiator_depth, + queue->dev->device->attrs.max_qp_init_rd_atom); + param.private_data = &priv; + param.private_data_len = sizeof(priv); + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.crqsize = cpu_to_le16(queue->recv_queue_size); + + ret = rdma_accept(cm_id, ¶m); + if (ret) + pr_err("rdma_accept failed (error code = %d)\n", ret); + + return ret; +} + +static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_device *ndev; + struct nvmet_rdma_queue *queue; + int ret = -EINVAL; + + ndev = nvmet_rdma_find_get_device(cm_id); + if (!ndev) { + pr_err("no client data!\n"); + nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); + return -ECONNREFUSED; + } + + queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); + if (!queue) { + ret = -ENOMEM; + goto put_device; + } + queue->port = cm_id->context; + + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); + if (ret) + goto release_queue; + + mutex_lock(&nvmet_rdma_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + return 0; + +release_queue: + nvmet_rdma_free_queue(queue); +put_device: + kref_put(&ndev->ref, nvmet_rdma_free_dev); + + return ret; +} + +static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_CONNECTING) { + pr_warn("trying to establish a connected queue\n"); + goto out_unlock; + } + queue->state = NVMET_RDMA_Q_LIVE; + + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *cmd; + + cmd = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, wait_list); + list_del(&cmd->wait_list); + + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_handle_command(queue, cmd); + spin_lock_irqsave(&queue->state_lock, flags); + } + +out_unlock: + spin_unlock_irqrestore(&queue->state_lock, flags); +} + +static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + unsigned long flags; + + pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); + + spin_lock_irqsave(&queue->state_lock, flags); + switch (queue->state) { + case NVMET_RDMA_Q_CONNECTING: + case NVMET_RDMA_Q_LIVE: + queue->state = NVMET_RDMA_Q_DISCONNECTING; + case NVMET_RDMA_IN_DEVICE_REMOVAL: + disconnect = true; + break; + case NVMET_RDMA_Q_DISCONNECTING: + break; + } + spin_unlock_irqrestore(&queue->state_lock, flags); + + if (disconnect) { + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->cm_id->qp); + schedule_work(&queue->release_work); + } +} + +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) { + list_del_init(&queue->queue_list); + disconnect = true; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + if (disconnect) + __nvmet_rdma_queue_disconnect(queue); +} + +static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); + + pr_err("failed to connect queue\n"); + schedule_work(&queue->release_work); +} + +/** + * nvme_rdma_device_removal() - Handle RDMA device removal + * @queue: nvmet rdma queue (cm id qp_context) + * @addr: nvmet address (cm_id context) + * + * DEVICE_REMOVAL event notifies us that the RDMA device is about + * to unplug so we should take care of destroying our RDMA resources. + * This event will be generated for each allocated cm_id. + * + * Note that this event can be generated on a normal queue cm_id + * and/or a device bound listener cm_id (where in this case + * queue will be null). + * + * we claim ownership on destroying the cm_id. For queues we move + * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port + * we nullify the priv to prevent double cm_id destruction and destroying + * the cm_id implicitely by returning a non-zero rc to the callout. + */ +static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + if (!queue) { + struct nvmet_port *port = cm_id->context; + + /* + * This is a listener cm_id. Make sure that + * future remove_port won't invoke a double + * cm_id destroy. use atomic xchg to make sure + * we don't compete with remove_port. + */ + if (xchg(&port->priv, NULL) != cm_id) + return 0; + } else { + /* + * This is a queue cm_id. Make sure that + * release queue will not destroy the cm_id + * and schedule all ctrl queues removal (only + * if the queue is not disconnecting already). + */ + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_DISCONNECTING) + queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_queue_disconnect(queue); + flush_scheduled_work(); + } + + /* + * We need to return 1 so that the core will destroy + * it's own ID. What a great API design.. + */ + return 1; +} + +static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue = NULL; + int ret = 0; + + if (cm_id->qp) + queue = cm_id->qp->qp_context; + + pr_debug("%s (%d): status %d id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cm_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = nvmet_rdma_queue_connect(cm_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + nvmet_rdma_queue_established(queue); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + nvmet_rdma_queue_disconnect(queue); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + ret = nvmet_rdma_device_removal(cm_id, queue); + break; + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + nvmet_rdma_queue_connect_fail(cm_id, queue); + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_queue *queue; + +restart: + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl == ctrl) { + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + __nvmet_rdma_queue_disconnect(queue); + goto restart; + } + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static int nvmet_rdma_add_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id; + struct sockaddr_in addr_in; + u16 port_in; + int ret; + + switch (port->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + break; + default: + pr_err("address family %d not supported\n", + port->disc_addr.adrfam); + return -EINVAL; + } + + ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in); + if (ret) + return ret; + + addr_in.sin_family = AF_INET; + addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr); + addr_in.sin_port = htons(port_in); + + cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("CM ID creation failed\n"); + return PTR_ERR(cm_id); + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in); + if (ret) { + pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(port->disc_addr.portid), &addr_in); + port->priv = cm_id; + return 0; + +out_destroy_id: + rdma_destroy_id(cm_id); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + + if (cm_id) + rdma_destroy_id(cm_id); +} + +static struct nvmet_fabrics_ops nvmet_rdma_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_RDMA, + .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, + .msdbd = 1, + .has_keyed_sgls = 1, + .add_port = nvmet_rdma_add_port, + .remove_port = nvmet_rdma_remove_port, + .queue_response = nvmet_rdma_queue_response, + .delete_ctrl = nvmet_rdma_delete_ctrl, +}; + +static int __init nvmet_rdma_init(void) +{ + return nvmet_register_transport(&nvmet_rdma_ops); +} + +static void __exit nvmet_rdma_exit(void) +{ + struct nvmet_rdma_queue *queue; + + nvmet_unregister_transport(&nvmet_rdma_ops); + + flush_scheduled_work(); + + mutex_lock(&nvmet_rdma_queue_mutex); + while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, + struct nvmet_rdma_queue, queue_list))) { + list_del_init(&queue->queue_list); + + mutex_unlock(&nvmet_rdma_queue_mutex); + __nvmet_rdma_queue_disconnect(queue); + mutex_lock(&nvmet_rdma_queue_mutex); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); + ida_destroy(&nvmet_rdma_queue_ida); +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h new file mode 100644 index 0000000..bf240a3 --- /dev/null +++ b/include/linux/nvme-rdma.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_RDMA_H +#define _LINUX_NVME_RDMA_H + +enum nvme_rdma_cm_fmt { + NVME_RDMA_CM_FMT_1_0 = 0x0, +}; + +enum nvme_rdma_cm_status { + NVME_RDMA_CM_INVALID_LEN = 0x01, + NVME_RDMA_CM_INVALID_RECFMT = 0x02, + NVME_RDMA_CM_INVALID_QID = 0x03, + NVME_RDMA_CM_INVALID_HSQSIZE = 0x04, + NVME_RDMA_CM_INVALID_HRQSIZE = 0x05, + NVME_RDMA_CM_NO_RSC = 0x06, + NVME_RDMA_CM_INVALID_IRD = 0x07, + NVME_RDMA_CM_INVALID_ORD = 0x08, +}; + +/** + * struct nvme_rdma_cm_req - rdma connect request + * + * @recfmt: format of the RDMA Private Data + * @qid: queue Identifier for the Admin or I/O Queue + * @hrqsize: host receive queue size to be created + * @hsqsize: host send queue size to be created + */ +struct nvme_rdma_cm_req { + __le16 recfmt; + __le16 qid; + __le16 hrqsize; + __le16 hsqsize; + u8 rsvd[24]; +}; + +/** + * struct nvme_rdma_cm_rep - rdma connect reply + * + * @recfmt: format of the RDMA Private Data + * @crqsize: controller receive queue size + */ +struct nvme_rdma_cm_rep { + __le16 recfmt; + __le16 crqsize; + u8 rsvd[28]; +}; + +/** + * struct nvme_rdma_cm_rej - rdma connect reject + * + * @recfmt: format of the RDMA Private Data + * @fsts: error status for the associated connect request + */ +struct nvme_rdma_cm_rej { + __le16 recfmt; + __le16 sts; +}; + +#endif /* _LINUX_NVME_RDMA_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h new file mode 100644 index 0000000..7676557 --- /dev/null +++ b/include/linux/nvme.h @@ -0,0 +1,965 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_H +#define _LINUX_NVME_H + +#include +#include + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ +}; + +#define NVMF_AQ_DEPTH 32 + +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ +}; + +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) +#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) +#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) + +#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) +#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) +#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) + +#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) +#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) +#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) +#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) +#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). + */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + +enum { + NVME_CC_ENABLE = 1 << 0, + NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_ARB_RR = 0 << 11, + NVME_CC_ARB_WRRU = 1 << 11, + NVME_CC_ARB_VS = 7 << 11, + NVME_CC_SHN_NONE = 0 << 14, + NVME_CC_SHN_NORMAL = 1 << 14, + NVME_CC_SHN_ABRUPT = 2 << 14, + NVME_CC_SHN_MASK = 3 << 14, + NVME_CC_IOSQES = NVME_NVM_IOSQES << 16, + NVME_CC_IOCQES = NVME_NVM_IOCQES << 20, + NVME_CSTS_RDY = 1 << 0, + NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_SHST_NORMAL = 0 << 2, + NVME_CSTS_SHST_OCCUR = 1 << 2, + NVME_CSTS_SHST_CMPLT = 2 << 2, + NVME_CSTS_SHST_MASK = 3 << 2, +}; + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u8 rsvd2; + __u8 flags; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __le16 idle_power; + __u8 idle_scale; + __u8 rsvd19; + __le16 active_power; + __u8 active_work_scale; + __u8 rsvd23[9]; +}; + +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 mic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __u8 rsvd100[156]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __u8 rsvd270[50]; + __le16 kas; + __u8 rsvd322[190]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 nvscc; + __u8 rsvd531; + __le16 acwu; + __u8 rsvd534[2]; + __le32 sgls; + __u8 rsvd540[228]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_VWC_PRESENT = 1 << 0, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __u8 rsvd33; + __le16 nawun; + __le16 nawupf; + __le16 nacwu; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __u16 rsvd46; + __le64 nvmcap[2]; + __u8 rsvd64[40]; + __u8 nguid[16]; + __u8 eui64[8]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __le32 warning_temp_time; + __le32 critical_comp_time; + __le16 temp_sensor[8]; + __u8 rsvd216[296]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +enum { + NVME_AER_NOTICE_NS_CHANGED = 0x0002, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, + nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, +}; + +/* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + * + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). + */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_keep_alive = 0x18, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_KATO = 0x0f, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, + NVME_LOG_ERROR = 0x01, + NVME_LOG_SMART = 0x02, + NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_DISC = 0x70, + NVME_LOG_RESERVATION = 0x80, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 cns; + __u32 rsvd11[5]; +}; + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 fid; + __le32 dword11; + __u32 rsvd12[4]; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + union nvme_data_ptr dptr; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 lid; + __u8 rsvd10; + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 nqntype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_be hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; + }; +}; + +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.opcode & 1; + return cmd->common.opcode & 1; +} + +enum { + /* + * Generic Command Status: + */ + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FIRMWARE_NEEDS_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, + + /* + * I/O Command Set Specific - NVM commands: + */ + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, + + NVME_SC_DNR = 0x4000, +}; + +struct nvme_completion { + /* + * Used by Admin and Fabrics commands to return data: + */ + union { + __le16 result16; + __le32 result; + __le64 result64; + }; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) + +#endif /* _LINUX_NVME_H */ diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h new file mode 100644 index 0000000..50ff21f --- /dev/null +++ b/include/uapi/linux/nvme_ioctl.h @@ -0,0 +1,66 @@ +/* + * Definitions for the NVM Express ioctl interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _UAPI_LINUX_NVME_IOCTL_H +#define _UAPI_LINUX_NVME_IOCTL_H + +#include + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_passthru_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) +#define NVME_IOCTL_RESCAN _IO('N', 0x46) + +#endif /* _UAPI_LINUX_NVME_IOCTL_H */ -- 2.41.0