Provide defined memory content modes so that KVM can make guarantees about memory content after setting memory attributes, according to userspace requests. Suggested-by: Sean Christopherson Signed-off-by: Ackerley Tng --- Documentation/virt/kvm/api.rst | 61 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 4 +++ virt/kvm/guest_memfd.c | 56 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 15148c80cfdb6..90587a9c09d3f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6571,6 +6571,8 @@ Errors: EAGAIN Some page within requested range had unexpected refcounts. The offset of the page will be returned in `error_offset`. ENOMEM Ran out of memory trying to track private/shared state + EOPNOTSUPP There is no way for KVM to guarantee in-memory contents as + requested. ========== =============================================================== KVM_SET_MEMORY_ATTRIBUTES2 is an extension to @@ -6619,6 +6621,65 @@ on the shared pages, such as refcounts taken by get_user_pages(), and try the ioctl again. A possible source of these long term refcounts is if the guest_memfd memory was pinned in IOMMU page tables. +By default, KVM makes no guarantees about the in-memory values after +memory is converted to/from shared/private. Optionally, userspace may +instruct KVM to ensure the contents of memory are zeroed or preserved, +e.g. to enable in-place sharing of data, or as an optimization to +avoid having to re-zero memory when userspace could have relied on the +trusted entity to guarantee the memory will be zeroed as part of the +entire conversion process. + +The content modes available are as follows: + +``KVM_SET_MEMORY_ATTRIBUTES2_ZERO`` + + On conversion, KVM guarantees all entities that have "allowed" + access to the memory will read zeros. E.g. 
on private to shared + conversion, both trusted and untrusted code will read zeros. + + Zeroing is currently only supported for private-to-shared + conversions, as KVM in general is untrusted and thus cannot + guarantee the guest (or any trusted entity) will read zeros after + conversion. Note, some CoCo implementations do zero memory contents + such that the guest reads zeros after conversion, and the guest may + choose to rely on that behavior. However, that's a contract between + the trusted CoCo entity and the guest, not between KVM and the + guest. + +``KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE`` + + On conversion, KVM guarantees memory contents will be preserved with + respect to the last written unencrypted value. As a concrete + example, if the host writes ``0xbeef`` to shared memory and converts + the memory to private, the guest will also read ``0xbeef``, even if + the in-memory data is encrypted as part of the conversion. And vice + versa, if the guest writes ``0xbeef`` to private memory and then + converts the memory to shared, the host (and guest) will read + ``0xbeef`` (if the memory is accessible). + +Note: These content modes apply to the entire requested range, not +just the parts of the range that underwent conversion. For example, if +this was the initial state: + + * [0x0000, 0x1000): shared + * [0x1000, 0x2000): private + * [0x2000, 0x3000): shared + +and range [0x0000, 0x3000) was set to shared, the content mode would +apply to all memory in [0x0000, 0x3000), not just the range that +underwent conversion [0x1000, 0x2000). + +Note: These content modes apply only to allocated memory. No +guarantees are made on offset ranges that do not have memory allocated +(yet). For example, if this was the initial state: + + * [0x0000, 0x1000): shared + * [0x1000, 0x2000): not allocated + * [0x2000, 0x3000): shared + +and range [0x0000, 0x3000) was set to shared, the content mode would +apply only to offset ranges [0x0000, 0x1000) and [0x2000, 0x3000). 
+ See also: :ref: `KVM_SET_MEMORY_ATTRIBUTES`. .. _kvm_run: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 29baaa60de35a..0fc9ad4ea0d93 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1642,6 +1642,10 @@ struct kvm_memory_attributes { /* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */ #define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd2, struct kvm_memory_attributes2) +#define KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED 0 +#define KVM_SET_MEMORY_ATTRIBUTES2_ZERO (1ULL << 0) +#define KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE (1ULL << 1) + struct kvm_memory_attributes2 { union { __u64 address; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e270e54e030f0..eeac7678fcf4e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -677,6 +677,19 @@ u64 __weak kvm_arch_gmem_supported_content_modes(struct kvm *kvm) return 0; } +static bool kvm_gmem_content_mode_is_supported(struct kvm *kvm, + u64 content_mode, + bool to_private) +{ + if (content_mode == KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED) + return true; + + if (content_mode == KVM_SET_MEMORY_ATTRIBUTES2_ZERO && to_private) + return false; + + return kvm_arch_gmem_supported_content_modes(kvm) & content_mode; +} + int kvm_gmem_apply_content_mode_zero(struct inode *inode, pgoff_t start, pgoff_t end) { @@ -736,8 +749,26 @@ int __weak kvm_arch_gmem_apply_content_mode_preserve(struct kvm *kvm, return -EOPNOTSUPP; } +static int kvm_gmem_apply_content_mode(struct kvm *kvm, uint64_t content_mode, + struct inode *inode, pgoff_t start, + pgoff_t end) +{ + switch (content_mode) { + case KVM_SET_MEMORY_ATTRIBUTES2_MODE_UNSPECIFIED: + return kvm_arch_gmem_apply_content_mode_unspecified(kvm, inode, start, end); + case KVM_SET_MEMORY_ATTRIBUTES2_ZERO: + return kvm_arch_gmem_apply_content_mode_zero(kvm, inode, start, end); + case KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE: + return kvm_arch_gmem_apply_content_mode_preserve(kvm, inode, start, end); + default: + 
WARN_ONCE(1, "Unexpected policy requested."); + return -EOPNOTSUPP; + } +} + static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start, size_t nr_pages, uint64_t attrs, + struct kvm *kvm, uint64_t content_mode, pgoff_t *err_index) { bool to_private = attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE; @@ -752,9 +783,23 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start, filemap_invalidate_lock(mapping); + if (!kvm_gmem_content_mode_is_supported(kvm, content_mode, + to_private)) { + r = -EOPNOTSUPP; + *err_index = start; + goto out; + } + mas_init(&mas, mt, start); if (kvm_gmem_range_has_attributes(mt, start, nr_pages, attrs)) { + /* + * Even if no update is required to attributes, the + * requested content mode is applied. + */ + WARN_ON(kvm_gmem_apply_content_mode(kvm, content_mode, + inode, start, end)); + r = 0; goto out; } @@ -786,6 +831,9 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start, if (!to_private) kvm_gmem_invalidate(inode, start, end); + WARN_ON(kvm_gmem_apply_content_mode(kvm, content_mode, inode, + start, end)); + mas_store_prealloc(&mas, xa_mk_value(attrs)); kvm_gmem_invalidate_end(inode, start, end); @@ -807,7 +855,11 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp) if (copy_from_user(&attrs, argp, sizeof(attrs))) return -EFAULT; - if (attrs.flags) + if (attrs.flags & ~(KVM_SET_MEMORY_ATTRIBUTES2_ZERO | + KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE)) + return -EINVAL; + if ((attrs.flags & KVM_SET_MEMORY_ATTRIBUTES2_ZERO) && + (attrs.flags & KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE)) return -EINVAL; if (attrs.error_offset) return -EINVAL; @@ -829,7 +881,7 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp) nr_pages = attrs.size >> PAGE_SHIFT; index = attrs.offset >> PAGE_SHIFT; r = __kvm_gmem_set_attributes(inode, index, nr_pages, attrs.attributes, - &err_index); + f->kvm, attrs.flags, &err_index); if (r) { attrs.error_offset = ((uint64_t)err_index) << 
PAGE_SHIFT; -- 2.53.0.1018.g2bb0e51243-goog