Implement TP4159-based live migration support in the vfio-nvme driver
by integrating command execution, controller state handling, and vfio
migration state transitions.

Key features:

- Use the nvme_submit_vf_cmd() and nvme_get_ctrl_id() helpers in the
  NVMe core PCI driver to submit admin commands on VFs.
- Implement Migration Send (opcode 0x43) and Migration Receive
  (opcode 0x42) command handling for suspend, resume, and get/set
  controller state.
  Remark: the controller state is currently being defined in TP4193,
  so the present state management code will eventually be replaced by
  TP4193. For completeness, this patch includes TP4159-compatible
  state management code.
- Add parsing and serialization of the controller state, including:
  - NVMeCS v0 controller state format (SCS-FIG6, FIG7, FIG8)
  - Supported Controller State Formats (CNS=0x20 response)
  - Migration file abstraction with read/write fileops
- Add debug decoders to log IOSQ/IOCQ state during migration save.
- Allocate anon inodes for the save and resume file interfaces exposed
  via VFIO migration file descriptors.
- Add vfio migration state machine transitions (see the userspace
  sketch at the end of this message):
  - RUNNING → STOP: send the suspend command
  - STOP → STOP_COPY: extract the controller state (save)
  - STOP_COPY → STOP: disable the file and free the buffer
  - STOP → RESUMING: allocate the resume file buffer
  - RESUMING → STOP: load the controller state via set state
  - STOP → RUNNING: resume the controller via the resume command
- Hook vfio_migration_ops into nvmevf_pci_ops using:
  - migration_set_state() and migration_get_state()
  - state_mutex + reset_lock for proper concurrency
- Query Identify Controller (CNS=01h) and check the HMLMS bit in the
  OACS field, which indicates controller migration capability.
- Apply runtime checks for buffer alignment, format support, and state
  size bounds to ensure spec compliance.

With this patch, vfio-nvme enables live migration of VF-based NVMe
devices by implementing the TP4159 migration command flows and the
vfio device state transitions required by QEMU/VMM.
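For reference only (not part of this patch), a minimal userspace sketch
of how a VMM such as QEMU might drive the save side of the state machine
above through the generic VFIO migration uAPI. The helper name is
hypothetical and error handling is omitted:

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/vfio.h>

  /* Returns the migration data_fd, or -1 on error. */
  static int nvme_vf_set_mig_state(int device_fd, uint32_t new_state)
  {
          /* vfio_device_feature header followed by the mig_state payload */
          uint64_t buf[(sizeof(struct vfio_device_feature) +
                        sizeof(struct vfio_device_feature_mig_state) + 7) / 8] = { 0 };
          struct vfio_device_feature *feat = (struct vfio_device_feature *)buf;
          struct vfio_device_feature_mig_state *mig =
                  (struct vfio_device_feature_mig_state *)feat->data;

          feat->argsz = sizeof(buf);
          feat->flags = VFIO_DEVICE_FEATURE_SET |
                        VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
          mig->device_state = new_state;

          if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
                  return -1;

          /* data_fd is only meaningful when entering STOP_COPY or RESUMING */
          return mig->data_fd;
  }

  /*
   * Save side:
   *   nvme_vf_set_mig_state(fd, VFIO_DEVICE_STATE_STOP);      issues suspend
   *   save_fd = nvme_vf_set_mig_state(fd, VFIO_DEVICE_STATE_STOP_COPY);
   *   read(save_fd, ...);       returns the serialized controller state
   */

Entering STOP_COPY makes the kernel side run nvmevf_pci_save_device_data()
and hand back the anon-inode save fd; the serialized TP4159 controller
state is then read() from that fd.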
Signed-off-by: Lei Rao
Signed-off-by: Max Gurtovoy
Signed-off-by: Chaitanya Kulkarni
---
 drivers/vfio/pci/nvme/Makefile |   3 +
 drivers/vfio/pci/nvme/nvme.c   | 840 +++++++++++++++++++++++++++++++++
 drivers/vfio/pci/nvme/nvme.h   |   3 +
 3 files changed, 846 insertions(+)

diff --git a/drivers/vfio/pci/nvme/Makefile b/drivers/vfio/pci/nvme/Makefile
index 2f4a0ad3d9cf..d434c943436b 100644
--- a/drivers/vfio/pci/nvme/Makefile
+++ b/drivers/vfio/pci/nvme/Makefile
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
+
+KBUILD_EXTRA_SYMBOLS := $(srctree)/drivers/nvme/Module.symvers
+
 obj-$(CONFIG_NVME_VFIO_PCI) += nvme-vfio-pci.o
 nvme-vfio-pci-y := nvme.o
diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
index 08bee3274207..5283d6b606dc 100644
--- a/drivers/vfio/pci/nvme/nvme.c
+++ b/drivers/vfio/pci/nvme/nvme.c
@@ -19,6 +19,8 @@
 
 #include "nvme.h"
 
+#define MAX_MIGRATION_SIZE (256 * 1024)
+
 static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
 {
 	mutex_lock(&migf->lock);
@@ -71,6 +73,842 @@ static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev)
 			    core_device);
 }
 
+/*
+ * Convert a byte length to NVMe's 0-based dword count.
+ */
+static inline u32 bytes_to_nvme_numd(size_t len)
+{
+	if (len < 4)
+		return 0;
+	return (len >> 2) - 1;
+}
+
+static int nvmevf_cmd_suspend_device(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_command c = { };
+	u32 cdw11 = NVME_LM_SUSPEND_TYPE_SUSPEND << 16 | nvme_get_ctrl_id(dev);
+	int ret;
+
+	c.lm.send.opcode = nvme_admin_lm_send;
+	c.lm.send.cdw10 = cpu_to_le32(NVME_LM_SEND_SEL_SUSPEND);
+	c.lm.send.cdw11 = cpu_to_le32(cdw11);
+
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, NULL, 0);
+	if (ret) {
+		dev_warn(&dev->dev,
+			 "Suspend virtual function failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+
+	dev_dbg(&dev->dev, "Suspend command successful\n");
+	return 0;
+}
+
+static int nvmevf_cmd_resume_device(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_command c = { };
+	int ret;
+
+	c.lm.send.opcode = nvme_admin_lm_send;
+	c.lm.send.cdw10 = cpu_to_le32(NVME_LM_SEND_SEL_RESUME);
+	c.lm.send.cdw11 = cpu_to_le32(nvme_get_ctrl_id(dev));
+
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, NULL, 0);
+	if (ret) {
+		dev_warn(&dev->dev,
+			 "Resume virtual function failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+	dev_dbg(&dev->dev, "Resume command successful\n");
+	return 0;
+}
+
+/**
+ * nvme_lm_id_ctrl_state - Query and parse the CNS=0x20 format list
+ * @dev: Controller PCI device
+ * @fmt: Output struct populated with NV, NUUID, and pointers
+ *
+ * Issues Identify CNS=0x20 (Supported Controller State Formats data
+ * structure, figure SCSF-FIG1), allocates a buffer, and parses the
+ * result into the provided struct.
+ *
+ * The caller must free fmt->ctrl_state_raw_buf using kfree().
+ *
+ * Returns 0 on success, or a negative errno on failure.
+ */
+static int nvme_lm_id_ctrl_state(struct pci_dev *dev,
+				 struct nvme_lm_ctrl_state_fmts_info *fmt)
+{
+	struct nvme_command c = { };
+	void *buf;
+	int ret;
+	__u8 nv, nuuid;
+	size_t len;
+
+	if (!fmt)
+		return -EINVAL;
+
+	/* Step 1: Read the first 2 bytes to get NV and NUUID */
+	buf = kzalloc(2, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = NVME_ID_CNS_LM_CTRL_STATE_FMT;
+	c.identify.nsid = cpu_to_le32(0);
+
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, buf, 2);
+	if (ret)
+		goto out_free;
+
+	nv = ((__u8 *)buf)[0];
+	nuuid = ((__u8 *)buf)[1];
+
+	kfree(buf);
+
+	/*
+	 * Compute the total buffer length for the full Identify CNS=0x20
+	 * response:
+	 *
+	 * - The first 2 bytes hold the header:
+	 *   * Byte 0: NV    - number of NVMe-defined format versions
+	 *   * Byte 1: NUUID - number of vendor-specific UUID entries
+	 *
+	 * - Each version entry is 2 bytes (VERSION_ENTRY_SIZE)
+	 * - Each UUID entry is 16 bytes (UUID_ENTRY_SIZE)
+	 *
+	 * Therefore:
+	 *   Total length = 2 + (NV * 2) + (NUUID * 16)
+	 */
+	len = NVME_LM_CTRL_STATE_HDR_SIZE +
+	      nv * NVME_LM_VERSION_ENTRY_SIZE + nuuid * NVME_LM_UUID_ENTRY_SIZE;
+
+	buf = kzalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = NVME_ID_CNS_LM_CTRL_STATE_FMT;
+	c.identify.nsid = cpu_to_le32(0);
+
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, buf, len);
+	if (ret)
+		goto out_free;
+
+	/* Parse the result in-place */
+	fmt->nv = nv;
+	fmt->nuuid = nuuid;
+	fmt->vers = ((struct nvme_lm_supported_ctrl_state_fmts *)buf)->vers;
+	fmt->uuids = (const void *)(fmt->vers + nv);
+	fmt->ctrl_state_raw_buf = buf;
+	fmt->raw_len = len;
+
+	return 0;
+
+out_free:
+	kfree(buf);
+	return ret;
+}
+
+static int nvme_lm_get_ctrl_state_fmt(struct pci_dev *dev, bool debug,
+				      struct nvme_lm_ctrl_state_fmts_info *fmt)
+{
+	__u8 i;
+	int ret;
+
+	ret = nvme_lm_id_ctrl_state(dev, fmt);
+	if (ret) {
+		pr_err("Failed to get ctrl state formats (ret=%d)\n", ret);
+		return ret;
+	}
+
+	if (debug) {
+		pr_info("NV = %u, NUUID = %u\n", fmt->nv, fmt->nuuid);
+
+		for (i = 0; i < fmt->nv; i++)
+			pr_info(" Format[%d] Version = 0x%04x\n",
+				i, le16_to_cpu(fmt->vers[i]));
+
+		for (i = 0; i < fmt->nuuid; i++) {
+			char uuid_str[37];	/* 36 chars + null */
+
+			snprintf(uuid_str, sizeof(uuid_str),
+				 "%02x%02x%02x%02x-%02x%02x-%02x%02x-"
+				 "%02x%02x-%02x%02x%02x%02x%02x%02x",
+				 fmt->uuids[i][0], fmt->uuids[i][1],
+				 fmt->uuids[i][2], fmt->uuids[i][3],
+				 fmt->uuids[i][4], fmt->uuids[i][5],
+				 fmt->uuids[i][6], fmt->uuids[i][7],
+				 fmt->uuids[i][8], fmt->uuids[i][9],
+				 fmt->uuids[i][10], fmt->uuids[i][11],
+				 fmt->uuids[i][12], fmt->uuids[i][13],
+				 fmt->uuids[i][14], fmt->uuids[i][15]);
+
+			pr_info(" UUID[%d] = %s\n", i, uuid_str);
+		}
+	}
+
+	return ret;
+}
+
+static void nvmevf_init_get_ctrl_state_cmd(struct nvme_command *c, __u16 cntlid,
+					   __u8 csvi, __u8 csuuidi,
+					   __u8 csuidxp, size_t buf_len)
+{
+	c->lm.recv.opcode = nvme_admin_lm_recv;
+	c->lm.recv.sel = NVME_LM_RECV_GET_CTRL_STATE;
+	/*
+	 * The MOS field is treated as the controller state version index;
+	 * use the NVMe v1 state.
+	 *
+	 * For upstream, read the supported controller state formats using
+	 * the Identify command with CNS value 0x20 and make sure that
+	 * NVME_LM_CSVI matches one of the reported formats for NVMe states.
+	 */
+	c->lm.recv.mos = cpu_to_le16(csvi);
+	/*
+	 * Target controller ID. TODO: confirm this is the right way to get
+	 * the controller ID.
+	 */
+	c->lm.recv.cntlid = cpu_to_le16(cntlid);
+
+	/*
+	 * For upstream, read the supported controller state formats using
+	 * the Identify command with CNS value 0x20 and make sure that
+	 * NVME_LM_CSVI matches one of the reported formats for
+	 * vendor-specific states. Adjust the state as needed by setting
+	 * the macro values.
+	 */
+	c->lm.recv.csuuidi = cpu_to_le32(csuuidi);
+	c->lm.recv.csuidxp = cpu_to_le32(csuidxp);
+
+	/*
+	 * Associates the Migration Receive command with the correct
+	 * migration session UUID; currently we set it to 0. For now, assume
+	 * that the initiator and the target have agreed on UUIDX 0 for all
+	 * live migration sessions.
+	 */
+	c->lm.recv.uuid_index = cpu_to_le32(0);
+
+	/*
+	 * Assume that the data buffer is big enough to hold the state;
+	 * 0-based dword count.
+	 */
+	c->lm.recv.numd = cpu_to_le32(bytes_to_nvme_numd(buf_len));
+}
+
+#define NVME_LM_MAX_NVMECS	1024
+#define NVME_LM_MAX_VSD		1024
+
+static int nvmevf_get_ctrl_state(struct pci_dev *dev,
+				 __u8 csvi, __u8 csuuidi, __u8 csuidxp,
+				 struct nvmevf_migration_file *migf,
+				 struct nvme_lm_ctrl_state_info *state)
+{
+	struct nvme_command c = { };
+	struct nvme_lm_ctrl_state *hdr;
+	/* Make sure hdr_len is a multiple of 4 */
+	size_t hdr_len = ALIGN(sizeof(*hdr), 4);
+	__u16 id = nvme_get_ctrl_id(dev);
+	void *local_buf;
+	size_t len;
+	int ret;
+
+	/* Step 1: Issue Migration Receive (Select = 0) to get the header */
+	local_buf = kzalloc(hdr_len, GFP_KERNEL);
+	if (!local_buf)
+		return -ENOMEM;
+
+	nvmevf_init_get_ctrl_state_cmd(&c, id, csvi, csuuidi, csuidxp, hdr_len);
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, local_buf, hdr_len);
+	if (ret) {
+		dev_warn(&dev->dev,
+			 "nvme_admin_lm_recv failed (ret=0x%x)\n", ret);
+		kfree(local_buf);
+		return ret;
+	}
+
+	hdr = local_buf;
+	if (le16_to_cpu(hdr->nvmecss) > NVME_LM_MAX_NVMECS ||
+	    le16_to_cpu(hdr->vss) > NVME_LM_MAX_VSD) {
+		kfree(local_buf);
+		return -EINVAL;
+	}
+
+	len = hdr_len + 4 * (le16_to_cpu(hdr->nvmecss) + le16_to_cpu(hdr->vss));
+
+	kfree(local_buf);
+
+	if (len == hdr_len)
+		dev_warn(&dev->dev, "nvmecss == 0 and vss == 0\n");
+
+	/* Step 2: Allocate the full buffer */
+	migf->total_length = len;
+	migf->vf_data = kvzalloc(migf->total_length, GFP_KERNEL);
+	if (!migf->vf_data)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	nvmevf_init_get_ctrl_state_cmd(&c, id, csvi, csuuidi, csuidxp, len);
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, migf->vf_data, len);
+	if (ret)
+		goto free_big;
+
+	/* Populate the state struct */
+	hdr = (struct nvme_lm_ctrl_state *)migf->vf_data;
+	state->raw = hdr;
+	state->total_len = len;
+	state->version = hdr->version;
+	state->csattr = hdr->csattr;
+	state->nvmecss = hdr->nvmecss;
+	state->vss = hdr->vss;
+	state->nvme_cs = hdr->data;
+	state->vsd = hdr->data + le16_to_cpu(hdr->nvmecss) * 4;
+
+	return ret;
+
+free_big:
+	kvfree(migf->vf_data);
+	migf->vf_data = NULL;
+	return ret;
+}
+
+static const struct nvme_lm_nvme_cs_v0_state *
+nvme_lm_parse_nvme_cs_v0_state(const void *data, size_t len, u16 *niosq,
+			       u16 *niocq)
+{
+	const struct nvme_lm_nvme_cs_v0_state *hdr = data;
+	size_t hdr_len = sizeof(*hdr);
+	size_t iosq_sz, iocq_sz, total;
+	u16 sq, cq;
+
+	if (!data || len < hdr_len)
+		return NULL;
+
+	sq = le16_to_cpu(hdr->niosq);
+	cq = le16_to_cpu(hdr->niocq);
+
+	iosq_sz = sq * sizeof(struct nvme_lm_iosq_state);
+	iocq_sz = cq * sizeof(struct nvme_lm_iocq_state);
+	total = hdr_len + iosq_sz + iocq_sz;
+
+	if (len < total)
+		return NULL;
+
+	if (niosq)
+		*niosq = sq;
+	if (niocq)
+		*niocq = cq;
+
+	return hdr;
+}
+
+static void nvme_lm_debug_ctrl_state(struct nvme_lm_ctrl_state_info *state)
+{
+	const struct nvme_lm_nvme_cs_v0_state *cs;
+	const struct nvme_lm_iosq_state *iosq;
+	const struct nvme_lm_iocq_state *iocq;
+	u16 niosq, niocq;
+	int i;
+
+	pr_info("Controller State:\n");
+	pr_info("Version    : 0x%04x\n", le16_to_cpu(state->version));
+	pr_info("CSATTR     : 0x%02x\n", state->csattr);
+	pr_info("NVMECS Len : %u bytes\n", le16_to_cpu(state->nvmecss) * 4);
+	pr_info("VSD Len    : %u bytes\n", le16_to_cpu(state->vss) * 4);
+
+	cs = nvme_lm_parse_nvme_cs_v0_state(state->nvme_cs,
+					    le16_to_cpu(state->nvmecss) * 4,
+					    &niosq, &niocq);
+	if (!cs) {
+		pr_warn("Failed to parse NVMECS\n");
+		return;
+	}
+
+	iosq = cs->iosq;
+	iocq = (const void *)(iosq + niosq);
+
+	for (i = 0; i < niosq; i++) {
+		pr_info("IOSQ[%d]: SIZE=%u QID=%u CQID=%u ATTR=0x%x Head=%u Tail=%u\n",
+			i,
+			le16_to_cpu(iosq[i].qsize),
+			le16_to_cpu(iosq[i].qid),
+			le16_to_cpu(iosq[i].cqid),
+			le16_to_cpu(iosq[i].attr),
+			le16_to_cpu(iosq[i].head),
+			le16_to_cpu(iosq[i].tail));
+	}
+
+	for (i = 0; i < niocq; i++) {
+		pr_info("IOCQ[%d]: SIZE=%u QID=%u ATTR=%u Head=%u Tail=%u\n",
+			i,
+			le16_to_cpu(iocq[i].qsize),
+			le16_to_cpu(iocq[i].qid),
+			le16_to_cpu(iocq[i].attr),
+			le16_to_cpu(iocq[i].head),
+			le16_to_cpu(iocq[i].tail));
+	}
+}
+
+#define NVME_LM_CSUUIDI	0
+#define NVME_LM_CSVI	NVME_LM_RECV_CSVI_NVME_V1
+
+static int nvmevf_cmd_get_ctrl_state(struct nvmevf_pci_core_device *nvmevf_dev,
+				     struct nvmevf_migration_file *migf)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_lm_ctrl_state_fmts_info fmt = { };
+	struct nvme_lm_ctrl_state_info state = { };
+	__u8 csvi = NVME_LM_CSVI;
+	__u8 csuuidi = NVME_LM_CSUUIDI;
+	__u8 csuidxp = 0;
+	int ret;
+
+	/*
+	 * Read the supported controller state formats to make sure they
+	 * match the CSVI value specified in vfio-nvme; without this check
+	 * we would not know which controller state format we are working
+	 * with.
+	 */
+	ret = nvme_lm_get_ctrl_state_fmt(dev, true, &fmt);
+	if (ret)
+		return ret;
+	/*
+	 * The number of versions (NV) cannot be less than the controller
+	 * state version index we are using; that is an error. Note that
+	 * CSVI is a configurable value: the user can define this macro at
+	 * compile time to select the required NVMe controller state version
+	 * index from the Supported Controller State Formats data structure.
+	 */
+	if (fmt.nv < csvi) {
+		dev_warn(&dev->dev, "required ctrl state format not found\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = nvmevf_get_ctrl_state(dev, csvi, csuuidi, csuidxp, migf, &state);
+	if (ret)
+		goto out;
+
+	if (le16_to_cpu(state.version) != csvi) {
+		dev_warn(&dev->dev,
+			 "Unexpected controller state version: 0x%04x\n",
+			 le16_to_cpu(state.version));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Now that we have received the controller state, decode it for
+	 * debugging purposes.
+	 */
+	nvme_lm_debug_ctrl_state(&state);
+
+	dev_info(&dev->dev, "Get controller state successful\n");
+
+out:
+	kfree(fmt.ctrl_state_raw_buf);
+	return ret;
+}
+
+static int nvmevf_cmd_set_ctrl_state(struct nvmevf_pci_core_device *nvmevf_dev,
+				     struct nvmevf_migration_file *migf)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_command c = { };
+	u32 sel = NVME_LM_SEND_SEL_SET_CTRL_STATE;
+	/* Assume the data buffer is big enough to hold the state in one cmd */
+	u32 mos = NVME_LM_SEQIND_ONLY;
+	u32 cntlid = nvme_get_ctrl_id(dev);
+	u32 csvi = NVME_LM_CSVI;
+	u32 csuuidi = NVME_LM_CSUUIDI;
+	int ret;
+
+	c.lm.send.opcode = nvme_admin_lm_send;
+	/* MOS: SEQIND = 0b11 (Only) in MOS bits [17:16] */
+	c.lm.send.cdw10 = cpu_to_le32((mos << 16) | sel);
+	/*
+	 * Assume that we are only working on NVMe state and not on
+	 * vendor-specific state.
+	 */
+	c.lm.send.cdw11 = cpu_to_le32(csuuidi << 24 | csvi << 16 | cntlid);
+
+	/*
+	 * Associates the Migration Send command with the correct migration
+	 * session UUID; currently we set it to 0. For now, assume that the
+	 * initiator and the target have agreed on UUIDX 0 for all live
+	 * migration sessions.
+	 */
+	c.lm.send.cdw14 = cpu_to_le32(0);
+	/*
+	 * Assume that the data buffer is big enough to hold the state;
+	 * 0-based dword count.
+	 */
+	c.lm.send.cdw15 = cpu_to_le32(bytes_to_nvme_numd(migf->total_length));
+
+	ret = nvme_submit_vf_cmd(dev, &c, NULL, migf->vf_data,
+				 migf->total_length);
+	if (ret) {
+		dev_warn(&dev->dev,
+			 "Load the device states failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+
+	dev_info(&dev->dev, "Set controller state successful\n");
+	return 0;
+}
+
+static int nvmevf_release_file(struct inode *inode, struct file *filp)
+{
+	struct nvmevf_migration_file *migf = filp->private_data;
+
+	nvmevf_disable_fd(migf);
+	mutex_destroy(&migf->lock);
+	kfree(migf);
+	return 0;
+}
+
+static ssize_t nvmevf_resume_write(struct file *filp, const char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	struct nvmevf_migration_file *migf = filp->private_data;
+	loff_t requested_length;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	if (*pos < 0 ||
+	    check_add_overflow((loff_t)len, *pos, &requested_length))
+		return -EINVAL;
+
+	if (requested_length > MAX_MIGRATION_SIZE)
+		return -ENOMEM;
+
+	mutex_lock(&migf->lock);
+	if (migf->disabled) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = copy_from_user(migf->vf_data + *pos, buf, len);
+	if (ret) {
+		done = -EFAULT;
+		goto out_unlock;
+	}
+	*pos += len;
+	done = len;
+	migf->total_length += len;
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations nvmevf_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = nvmevf_resume_write,
+	.release = nvmevf_release_file,
+	.llseek = noop_llseek,
+};
+
+static struct nvmevf_migration_file *
+nvmevf_pci_resume_device_data(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct nvmevf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("nvmevf_mig", &nvmevf_resume_fops,
+					migf, O_WRONLY);
+	if (IS_ERR(migf->filp)) {
+		int err = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(err);
+	}
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+
+	/* Allocate a buffer to load the device state; the max state is 256K */
+	migf->vf_data = kvzalloc(MAX_MIGRATION_SIZE, GFP_KERNEL);
+	if (!migf->vf_data) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	return migf;
+
+out_free:
+	fput(migf->filp);
+	return ERR_PTR(ret);
+}
+
+static ssize_t nvmevf_save_read(struct file *filp, char __user *buf,
+				size_t len, loff_t *pos)
+{
+	struct nvmevf_migration_file *migf = filp->private_data;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	mutex_lock(&migf->lock);
+	if (*pos > migf->total_length) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (migf->disabled) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	len = min_t(size_t, migf->total_length - *pos, len);
+	if (len) {
+		ret = copy_to_user(buf, migf->vf_data + *pos, len);
+		if (ret) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += len;
+		done = len;
+	}
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations nvmevf_save_fops = {
+	.owner = THIS_MODULE,
+	.read = nvmevf_save_read,
+	.release = nvmevf_release_file,
+	.llseek = noop_llseek,
+};
+
+static struct nvmevf_migration_file *
+nvmevf_pci_save_device_data(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct nvmevf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("nvmevf_mig", &nvmevf_save_fops, migf,
+					O_RDONLY);
+	if (IS_ERR(migf->filp)) {
+		int err = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(err);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+
+	ret = nvmevf_cmd_get_ctrl_state(nvmevf_dev, migf);
+	if (ret)
+		goto out_free;
+
+	return migf;
+out_free:
+	fput(migf->filp);
+	return ERR_PTR(ret);
+}
+
+static struct file *
+nvmevf_pci_step_device_state_locked(struct nvmevf_pci_core_device *nvmevf_dev,
+				    u32 new)
+{
+	u32 cur = nvmevf_dev->mig_state;
+	int ret;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) {
+		ret = nvmevf_cmd_suspend_device(nvmevf_dev);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP &&
+	    new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct nvmevf_migration_file *migf;
+
+		migf = nvmevf_pci_save_device_data(nvmevf_dev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		nvmevf_dev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP_COPY &&
+	    new == VFIO_DEVICE_STATE_STOP) {
+		nvmevf_disable_fds(nvmevf_dev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP &&
+	    new == VFIO_DEVICE_STATE_RESUMING) {
+		struct nvmevf_migration_file *migf;
+
+		migf = nvmevf_pci_resume_device_data(nvmevf_dev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		nvmevf_dev->resuming_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING &&
+	    new == VFIO_DEVICE_STATE_STOP) {
+		ret = nvmevf_cmd_set_ctrl_state(nvmevf_dev,
+						nvmevf_dev->resuming_migf);
+		if (ret)
+			return ERR_PTR(ret);
+		nvmevf_disable_fds(nvmevf_dev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP &&
+	    new == VFIO_DEVICE_STATE_RUNNING) {
+		ret = nvmevf_cmd_resume_device(nvmevf_dev);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	/* vfio_mig_get_next_state() does not use arcs other than the above */
+	WARN_ON(true);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct file *
+nvmevf_pci_set_device_state(struct vfio_device *vdev,
+			    enum vfio_device_mig_state new_state)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(vdev,
+			struct nvmevf_pci_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *res = NULL;
+	int ret;
+
+	mutex_lock(&nvmevf_dev->state_mutex);
+	while (new_state != nvmevf_dev->mig_state) {
+		ret = vfio_mig_get_next_state(vdev, nvmevf_dev->mig_state,
+					      new_state, &next_state);
+		if (ret) {
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+
+		res = nvmevf_pci_step_device_state_locked(nvmevf_dev,
+							  next_state);
+		if (IS_ERR(res))
+			break;
+		nvmevf_dev->mig_state = next_state;
+		if (WARN_ON(res && new_state != nvmevf_dev->mig_state)) {
+			fput(res);
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	nvmevf_state_mutex_unlock(nvmevf_dev);
+	return res;
+}
+
+static int nvmevf_pci_get_device_state(struct vfio_device *vdev,
+				       enum vfio_device_mig_state *curr_state)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+			vdev, struct nvmevf_pci_core_device, core_device.vdev);
+
+	mutex_lock(&nvmevf_dev->state_mutex);
+	*curr_state = nvmevf_dev->mig_state;
+	nvmevf_state_mutex_unlock(nvmevf_dev);
+	return 0;
+}
+
+static const struct vfio_migration_ops nvmevf_pci_mig_ops = {
+	.migration_set_state = nvmevf_pci_set_device_state,
+	.migration_get_state = nvmevf_pci_get_device_state,
+};
+
+static bool nvmevf_migration_supp(struct pci_dev *pdev)
+{
+	struct nvme_command c = { };
+	bool lm_supported = false;
+	struct nvme_id_ctrl *id;
+	__u16 oacs;
+	int ret;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = NVME_ID_CNS_CTRL;
+
+	id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!id)
+		return false;
+
+	ret = nvme_submit_vf_cmd(pdev, &c, NULL, id,
+				 sizeof(struct nvme_id_ctrl));
+	if (ret) {
+		dev_warn(&pdev->dev, "Get identify ctrl failed (ret=0x%x)\n",
+			 ret);
+		goto out;
+	}
+
+	oacs = le16_to_cpu(id->oacs);
+	lm_supported = oacs & NVME_CTRL_OACS_HMLMS;
+out:
+	kfree(id);
+	return lm_supported;
+}
+
+static int nvmevf_migration_init_dev(struct vfio_device *core_vdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev;
+	struct pci_dev *pdev;
+	int vf_id;
+	int ret = -EINVAL;
+
+	nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device,
+				  core_device.vdev);
+	pdev = to_pci_dev(core_vdev->dev);
+
+	if (!pdev->is_virtfn)
+		return ret;
+
+	/*
+	 * Get the Identify Controller data structure to check for live
+	 * migration support.
+	 */
+	if (!nvmevf_migration_supp(pdev))
+		return ret;
+
+	nvmevf_dev->migrate_cap = 1;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return ret;
+	nvmevf_dev->vf_id = vf_id + 1;
+	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY;
+
+	mutex_init(&nvmevf_dev->state_mutex);
+	spin_lock_init(&nvmevf_dev->reset_lock);
+	core_vdev->mig_ops = &nvmevf_pci_mig_ops;
+
+	return vfio_pci_core_init_dev(core_vdev);
+}
+
 static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
 {
 	struct nvmevf_pci_core_device *nvmevf_dev;
@@ -109,6 +947,7 @@ static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
 
 static const struct vfio_device_ops nvmevf_pci_ops = {
 	.name = "nvme-vfio-pci",
+	.init = nvmevf_migration_init_dev,
 	.release = vfio_pci_core_release_dev,
 	.open_device = nvmevf_pci_open_device,
 	.close_device = nvmevf_pci_close_device,
@@ -193,4 +1032,5 @@ module_pci_driver(nvmevf_pci_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Chaitanya Kulkarni ");
+MODULE_AUTHOR("Lei Rao ");
 MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");
diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
index ee602254679e..80dd75d33762 100644
--- a/drivers/vfio/pci/nvme/nvme.h
+++ b/drivers/vfio/pci/nvme/nvme.h
@@ -33,4 +33,7 @@ struct nvmevf_pci_core_device {
 	struct nvmevf_migration_file *saving_migf;
 };
 
+extern int nvme_submit_vf_cmd(struct pci_dev *dev, struct nvme_command *cmd,
+		size_t *result, void *buffer, unsigned int bufflen);
+extern u16 nvme_get_ctrl_id(struct pci_dev *dev);
 #endif /* NVME_VFIO_PCI_H */
-- 
2.40.0