This patch is a slightly different take on the ioregionfd mechanism previously described here: https://lore.kernel.org/all/88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.camel@gmail.com/ The goal of this new mechanism is to speed up doorbell writes on NVMe controllers emulated outside of the VMM. Currently, a doorbell write to an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the VMM communicating the event, along with the doorbell value, to the NVMe controller emulation task. With the shadow ioeventfd (KVM_IOEVENTFD_FLAG_POST_WRITE), KVM stores the written doorbell value at a userspace address supplied at registration time before signalling the eventfd. The NVMe emulation task is thus notified of the doorbell write directly and can read the doorbell value from that known location, without involving the VMM. Signed-off-by: Thanos Makatos --- include/uapi/linux/kvm.h | 11 ++++++++++- tools/include/uapi/linux/kvm.h | 2 ++ virt/kvm/eventfd.c | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 65500f5db379..f3ff559de60d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -639,6 +639,7 @@ enum { kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_virtio_ccw_notify, kvm_ioeventfd_flag_nr_fast_mmio, + kvm_ioevetnfd_flag_nr_post_write, kvm_ioeventfd_flag_nr_max, }; @@ -648,6 +649,12 @@ enum { #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) +/* + * With POST_WRITE, KVM stores the written value at post_addr before signalling + * the eventfd; no read-after-write ordering is guaranteed for such updates. 
+ */ +#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioevetnfd_flag_nr_post_write) + #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) struct kvm_ioeventfd { @@ -656,8 +663,10 @@ struct kvm_ioeventfd { __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */ __s32 fd; __u32 flags; - __u8 pad[36]; + void __user *post_addr; /* address to write to if POST_WRITE is set */ + __u8 pad[24]; }; +_Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size"); #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index dddb781b0507..1fb481c90b57 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -629,6 +629,7 @@ enum { kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_virtio_ccw_notify, kvm_ioeventfd_flag_nr_fast_mmio, + kvm_ioevetnfd_flag_nr_commit_write, kvm_ioeventfd_flag_nr_max, }; @@ -637,6 +638,7 @@ enum { #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) +#define KVM_IOEVENTFD_FLAG_COMMIT_WRITE (1 << kvm_ioevetnfd_flag_nr_commit_write) #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0e8b8a2c5b79..019cf3606aef 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -741,6 +741,7 @@ struct _ioeventfd { struct kvm_io_device dev; u8 bus_idx; bool wildcard; + void __user *post_addr; }; static inline struct _ioeventfd * @@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; + if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val, len)) + return -EFAULT; + eventfd_signal(p->eventfd); return 0; } @@ -879,6 +883,27 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, goto 
fail; } + if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) { + /* + * Although a NULL pointer is technically valid for userspace, it's + * unlikely that any use case actually cares. + */ + if (!args->len || !args->post_addr || + args->post_addr != untagged_addr(args->post_addr) || + !access_ok((void __user *)(unsigned long)args->post_addr, args->len)) { + ret = -EINVAL; + goto free_fail; + } + p->post_addr = args->post_addr; + } else if (!args->post_addr) { + /* + * Ensure that post_addr isn't set without POST_WRITE to avoid accidental + * userspace errors. + */ + ret = -EINVAL; + goto free_fail; + } + INIT_LIST_HEAD(&p->list); p->addr = args->addr; p->bus_idx = bus_idx; @@ -915,8 +940,8 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, unlock_fail: mutex_unlock(&kvm->slots_lock); +free_fail: kfree(p); - fail: eventfd_ctx_put(eventfd); @@ -932,12 +957,14 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_bus *bus; int ret = -ENOENT; bool wildcard; + void __user *post_addr; eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + post_addr = args->post_addr; mutex_lock(&kvm->slots_lock); @@ -946,7 +973,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, p->eventfd != eventfd || p->addr != args->addr || p->length != args->len || - p->wildcard != wildcard) + p->wildcard != wildcard || + p->post_addr != post_addr) continue; if (!p->wildcard && p->datamatch != args->datamatch) -- 2.47.3