From: Ackerley Tng Introduce KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES to advertise the availability of the KVM_SET_MEMORY_ATTRIBUTES2 ioctl. KVM_SET_MEMORY_ATTRIBUTES2 is a guest_memfd-scoped version of the existing KVM_SET_MEMORY_ATTRIBUTES VM ioctl. It allows userspace to manage memory attributes, such as KVM_MEMORY_ATTRIBUTE_PRIVATE, directly on a guest_memfd file descriptor. This new version uses struct kvm_memory_attributes2, which adds an error_offset field to the output. This allows KVM to return the specific offset that triggered an error, which is especially useful for handling EAGAIN results caused by transient page reference counts during attribute conversions. Update the KVM API documentation to define the new ioctl and its behavior, and add the necessary UAPI definitions and capability checks. Suggested-by: Sean Christopherson Suggested-by: Michael Roth Signed-off-by: Ackerley Tng --- Documentation/virt/kvm/api.rst | 78 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 2 ++ virt/kvm/kvm_main.c | 23 +++++++++---- 3 files changed, 95 insertions(+), 8 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a833d90845b95..73878f34f6d2e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -117,7 +117,7 @@ description: x86 includes both i386 and x86_64. Type: - system, vm, or vcpu. + system, vm, vcpu or guest_memfd. Parameters: what parameters are accepted by the ioctl. @@ -6373,6 +6373,8 @@ S390: Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set. Returns -EINVAL if called on a protected VM. +.. _KVM_SET_MEMORY_ATTRIBUTES: + 4.141 KVM_SET_MEMORY_ATTRIBUTES ------------------------------- @@ -6566,6 +6568,80 @@ KVM_S390_KEYOP_SSKE Sets the storage key for the guest address ``guest_addr`` to the key specified in ``key``, returning the previous value in ``key``. +4.145 KVM_SET_MEMORY_ATTRIBUTES2 +--------------------------------- + +:Capability: KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES +:Architectures: all +:Type: guest_memfd ioctl +:Parameters: struct kvm_memory_attributes2 (in/out) +:Returns: 0 on success, <0 on error + +Errors: + + ========== =============================================================== + EINVAL The specified `offset` or `size` were invalid (e.g. not + page aligned, causes an overflow, or size is zero). + EFAULT The parameter address was invalid. + EAGAIN Some page within requested range had unexpected refcounts. The + offset of the page will be returned in `error_offset`. + ENOMEM Ran out of memory trying to track private/shared state + ========== =============================================================== + +KVM_SET_MEMORY_ATTRIBUTES2 is an extension to +KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to +userspace. The original (pre-extension) fields are shared with +KVM_SET_MEMORY_ATTRIBUTES identically. + +Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES. + +:: + + struct kvm_memory_attributes2 { + /* in */ + union { + __u64 address; + __u64 offset; + }; + __u64 size; + __u64 attributes; + __u64 flags; + /* out */ + __u64 error_offset; + __u64 reserved[11]; + }; + + #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) + +Set attributes for a range of offsets within a guest_memfd to +KVM_MEMORY_ATTRIBUTE_PRIVATE to limit the specified guest_memfd backed +memory range for guest_use. Even if KVM_CAP_GUEST_MEMFD_MMAP is +supported, after a successful call to set +KVM_MEMORY_ATTRIBUTE_PRIVATE, the requested range will not be mappable +into host userspace and will only be mappable by the guest. + +To allow the range to be mappable into host userspace again, call +KVM_SET_MEMORY_ATTRIBUTES2 on the guest_memfd again with +KVM_MEMORY_ATTRIBUTE_PRIVATE unset. + +KVM does not directly manipulate the memory contents of pages during +attribute updates. However, the process of setting these attributes, +which includes operations such as unmapping pages from the host or +stage-2 page tables, may result in side effects on memory contents +that vary across different trusted firmware implementations. + +If this ioctl returns -EAGAIN, the offset of the page with unexpected +refcounts will be returned in `error_offset`. This can occur if there +are transient refcounts on the pages, taken by other parts of the +kernel. + +Userspace is expected to figure out how to remove all known refcounts +on the shared pages, such as refcounts taken by get_user_pages(), and +try the ioctl again. A possible source of these long term refcounts is +if the guest_memfd memory was pinned in IOMMU page tables. + +See also: :ref: `KVM_SET_MEMORY_ATTRIBUTES`. + .. _kvm_run: 5. The kvm_run structure diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 876c0429f9d4e..129d6f6303251 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -997,6 +997,7 @@ struct kvm_enable_cap { #define KVM_CAP_S390_KEYOP 247 #define KVM_CAP_S390_VSIE_ESAMODE 248 #define KVM_CAP_S390_HPAGE_2G 249 +#define KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES 250 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1649,6 +1650,7 @@ struct kvm_memory_attributes { __u64 flags; }; +/* Available with KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES */ #define KVM_SET_MEMORY_ATTRIBUTES2 _IOWR(KVMIO, 0xd2, struct kvm_memory_attributes2) struct kvm_memory_attributes2 { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a08b518cdb175..044486f128c37 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2434,18 +2434,22 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, } #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ +#ifdef kvm_arch_has_private_mem +static u64 kvm_supports_private_mem(struct kvm *kvm) +{ + return !kvm || kvm_arch_has_private_mem(kvm); +} +#else +#define kvm_supports_private_mem(kvm) false +#endif + #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES static u64 kvm_supported_vm_mem_attributes(struct kvm *kvm) { -#ifdef kvm_arch_has_private_mem - if (gmem_in_place_conversion) + if (gmem_in_place_conversion || !kvm_supports_private_mem(kvm)) return 0; - if (!kvm || kvm_arch_has_private_mem(kvm)) - return KVM_MEMORY_ATTRIBUTE_PRIVATE; -#endif - - return 0; + return KVM_MEMORY_ATTRIBUTE_PRIVATE; } /* @@ -4969,6 +4973,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return 1; case KVM_CAP_GUEST_MEMFD_FLAGS: return kvm_gmem_get_supported_flags(kvm); + case KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES: + if (!gmem_in_place_conversion || !kvm_supports_private_mem(kvm)) + return 0; + + return KVM_MEMORY_ATTRIBUTE_PRIVATE; #endif default: break; -- 2.55.0.rc0.738.g0c8ab3ebcc-goog