From: Fuad Tabba Rename the Kconfig option CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GUEST_MEMFD. The original name implied that the feature only supported "private" memory. However, CONFIG_KVM_PRIVATE_MEM enables guest_memfd in general, which is not exclusively for private memory. Subsequent patches in this series will add guest_memfd support for non-CoCo VMs, whose memory is not private. Renaming the Kconfig option to CONFIG_KVM_GUEST_MEMFD more accurately reflects its broader scope as the main Kconfig option for all guest_memfd-backed memory. This provides clearer semantics for the option and avoids confusion as new features are introduced. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- include/linux/kvm_host.h | 14 +++++++------- virt/kvm/Kconfig | 8 ++++---- virt/kvm/Makefile.kvm | 2 +- virt/kvm/kvm_main.c | 4 ++-- virt/kvm/kvm_mm.h | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..7b0f2b3e492d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2276,7 +2276,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) #else #define kvm_arch_has_private_mem(kvm) false diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..8cdc0b3cc1b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -602,7 +602,7 @@ struct kvm_memory_slot { short id; u16 as_id; -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD struct { /* * Writes protected by kvm->slots_lock. Acquiring a @@ -720,10 +720,10 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) #endif /* - * Arch code must define kvm_arch_has_private_mem if support for private memory - * is enabled. + * Arch code must define kvm_arch_has_private_mem if support for guest_memfd is + * enabled. 
*/ -#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; @@ -2505,7 +2505,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { - return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && + return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) && kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else @@ -2515,7 +2515,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order); @@ -2528,7 +2528,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, KVM_BUG_ON(1, kvm); return -EIO; } -#endif /* CONFIG_KVM_PRIVATE_MEM */ +#endif /* CONFIG_KVM_GUEST_MEMFD */ #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 727b542074e7..e4b400feff94 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -112,19 +112,19 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES depends on KVM_GENERIC_MMU_NOTIFIER bool -config KVM_PRIVATE_MEM +config KVM_GUEST_MEMFD select XARRAY_MULTI bool config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_PRIVATE_MEM + select KVM_GUEST_MEMFD bool config HAVE_KVM_ARCH_GMEM_PREPARE bool - depends on KVM_PRIVATE_MEM + depends on KVM_GUEST_MEMFD config HAVE_KVM_ARCH_GMEM_INVALIDATE bool - depends on KVM_PRIVATE_MEM + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm index 724c89af78af..d047d4cf58c9 100644 --- a/virt/kvm/Makefile.kvm +++ b/virt/kvm/Makefile.kvm @@ -12,4 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o -kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o +kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..25a94eed75fd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4915,7 +4915,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); #endif -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return !kvm || kvm_arch_has_private_mem(kvm); #endif @@ -5352,7 +5352,7 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CREATE_GUEST_MEMFD: { struct kvm_create_guest_memfd guest_memfd; diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index acef3f5c582a..31defb08ccba 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -67,7 +67,7 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, } #endif /* HAVE_KVM_PFNCACHE */ -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD void kvm_gmem_init(struct module *module); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot 
*slot, @@ -91,6 +91,6 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot) { WARN_ON_ONCE(1); } -#endif /* CONFIG_KVM_PRIVATE_MEM */ +#endif /* CONFIG_KVM_GUEST_MEMFD */ #endif /* __KVM_MM_H__ */ -- 2.50.1.552.g942d659e1b-goog Make all vendor neutral KVM x86 configs depend on KVM_X86, not just KVM, i.e. gate them on at least one vendor module being enabled and thus on kvm.ko actually being built. Depending on just KVM allows the user to select the configs even though they won't actually take effect, and more importantly, makes it all too easy to create unmet dependencies. E.g. KVM_GENERIC_PRIVATE_MEM can't be selected by KVM_SW_PROTECTED_VM, because the KVM_GENERIC_MMU_NOTIFIER dependency is select by KVM_X86. Hiding all sub-configs when neither KVM_AMD nor KVM_INTEL is selected also helps communicate to the user that nothing "interesting" is going on, e.g. --- Virtualization Kernel-based Virtual Machine (KVM) support < > KVM for Intel (and compatible) processors support < > KVM for AMD processors support Fixes: ea4290d77bda ("KVM: x86: leave kvm.ko out of the build if no vendor module is requested") Reviewed-by: David Hildenbrand Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2c86673155c9..9895fc3cd901 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -74,7 +74,7 @@ config KVM_WERROR # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # Building KVM with -Werror and KASAN is still doable via enabling # the kernel-wide WERROR=y. - depends on KVM && ((EXPERT && !KASAN) || WERROR) + depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR) help Add -Werror to the build flags for KVM. @@ -83,7 +83,7 @@ config KVM_WERROR config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT - depends on KVM && X86_64 + depends on KVM_X86 && X86_64 help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -169,7 +169,7 @@ config KVM_AMD_SEV config KVM_IOAPIC bool "I/O APIC, PIC, and PIT emulation" default y - depends on KVM + depends on KVM_X86 help Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. for full in-kernel APIC emulation. @@ -179,7 +179,7 @@ config KVM_IOAPIC config KVM_SMM bool "System Management Mode emulation" default y - depends on KVM + depends on KVM_X86 help Provides support for KVM to emulate System Management Mode (SMM) in virtual machines. This can be used by the virtual machine @@ -189,7 +189,7 @@ config KVM_SMM config KVM_HYPERV bool "Support for Microsoft Hyper-V emulation" - depends on KVM + depends on KVM_X86 default y help Provides KVM support for emulating Microsoft Hyper-V. This allows KVM @@ -203,7 +203,7 @@ config KVM_HYPERV config KVM_XEN bool "Support for Xen hypercall interface" - depends on KVM + depends on KVM_X86 help Provides KVM support for the hosting Xen HVM guests and passing Xen hypercalls to userspace. 
@@ -213,7 +213,7 @@ config KVM_XEN config KVM_PROVE_MMU bool "Prove KVM MMU correctness" depends on DEBUG_KERNEL - depends on KVM + depends on KVM_X86 depends on EXPERT help Enables runtime assertions in KVM's MMU that are too costly to enable @@ -228,7 +228,7 @@ config KVM_EXTERNAL_WRITE_TRACKING config KVM_MAX_NR_VCPUS int "Maximum number of vCPUs per KVM guest" - depends on KVM + depends on KVM_X86 range 1024 4096 default 4096 if MAXSMP default 1024 -- 2.50.1.552.g942d659e1b-goog Now that KVM_SW_PROTECTED_VM doesn't have a hidden dependency on KVM_X86, select KVM_GENERIC_PRIVATE_MEM from within KVM_SW_PROTECTED_VM instead of conditionally selecting it from KVM_X86. No functional change intended. Reviewed-by: Xiaoyao Li Reviewed-by: David Hildenbrand Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 9895fc3cd901..402ba00fdf45 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,7 +46,6 @@ config KVM_X86 select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY - select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM select KVM_WERROR if WERROR config KVM @@ -84,6 +83,7 @@ config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 + select KVM_GENERIC_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for -- 2.50.1.552.g942d659e1b-goog Select KVM_GENERIC_PRIVATE_MEM and KVM_GENERIC_MEMORY_ATTRIBUTES directly from KVM_INTEL_TDX, i.e. if and only if TDX support is fully enabled in KVM. There is no need to enable KVM's private memory support just because the core kernel's INTEL_TDX_HOST is enabled. Reviewed-by: Xiaoyao Li Reviewed-by: David Hildenbrand Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 402ba00fdf45..13ab7265b505 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -95,8 +95,6 @@ config KVM_SW_PROTECTED_VM config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL - select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST - select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -135,6 +133,8 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST + select KVM_GENERIC_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES help Provides support for launching Intel Trust Domain Extensions (TDX) confidential VMs on Intel processors. -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba The original name was vague regarding its functionality. This Kconfig option specifically enables and gates the kvm_gmem_populate() function, which is responsible for populating a GPA range with guest data. The new name, HAVE_KVM_ARCH_GMEM_POPULATE, describes the purpose of the option: to enable arch-specific guest_memfd population mechanisms. It also follows the same pattern as the other HAVE_KVM_ARCH_* configuration options. 
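For context, this is roughly what the gated interface looks like from an architecture's point of view. A minimal sketch, assuming the existing kvm_gmem_populate_cb signature of (kvm, gfn, pfn, src, order, opaque); both function names and the callback body below are illustrative only, not the actual SEV-SNP or TDX implementations:

  /* Only built when the arch selects HAVE_KVM_ARCH_GMEM_POPULATE. */
  static int example_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                   void __user *src, int order, void *opaque)
  {
          /*
           * A real CoCo implementation would copy/encrypt @src into the page
           * at @pfn and update the relevant hardware state (RMP, S-EPT, ...).
           */
          return 0;
  }

  static long example_populate_range(struct kvm *kvm, gfn_t start_gfn,
                                     void __user *src, long npages)
  {
          return kvm_gmem_populate(kvm, start_gfn, src, npages,
                                   example_post_populate, NULL);
  }

Only the vendor code that actually needs such a hook (SEV-SNP and TDX) selects HAVE_KVM_ARCH_GMEM_POPULATE, matching how the other HAVE_KVM_ARCH_* guest_memfd hooks are wired up.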
This improves clarity for developers and ensures the name accurately reflects the functionality it controls, especially as guest_memfd support expands beyond purely "private" memory scenarios. Temporarily keep KVM_GENERIC_PRIVATE_MEM as an x86-only config so as to minimize churn, and to hopefully make it easier to see what features require HAVE_KVM_ARCH_GMEM_POPULATE. On that note, omit GMEM_POPULATE for KVM_X86_SW_PROTECTED_VM, as regular ol' memset() suffices for software-protected VMs. As for KVM_GENERIC_PRIVATE_MEM, a future change will select KVM_GUEST_MEMFD for all 64-bit KVM builds, at which point the intermediate config will become obsolete and can/will be dropped. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 14 ++++++++++---- include/linux/kvm_host.h | 2 +- virt/kvm/Kconfig | 9 ++++----- virt/kvm/guest_memfd.c | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 13ab7265b505..c763446d9b9f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -79,11 +79,16 @@ config KVM_WERROR If in doubt, say "N". +config KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_GUEST_MEMFD + bool + config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 - select KVM_GENERIC_PRIVATE_MEM + select KVM_X86_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -133,8 +138,8 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST - select KVM_GENERIC_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_X86_PRIVATE_MEM + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) confidential VMs on Intel processors. 
@@ -157,9 +162,10 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_GENERIC_PRIVATE_MEM + select KVM_X86_PRIVATE_MEM select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching encrypted VMs which use Secure Encrypted Virtualization (SEV), Secure Encrypted Virtualization with diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8cdc0b3cc1b1..ddfb6cfe20a6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2534,7 +2534,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif -#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e4b400feff94..1b7d5be0b6c4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -116,11 +116,6 @@ config KVM_GUEST_MEMFD select XARRAY_MULTI bool -config KVM_GENERIC_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_GUEST_MEMFD - bool - config HAVE_KVM_ARCH_GMEM_PREPARE bool depends on KVM_GUEST_MEMFD @@ -128,3 +123,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_GUEST_MEMFD + +config HAVE_KVM_ARCH_GMEM_POPULATE + bool + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 7d85cc33c0bb..b2b50560e80e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -627,7 +627,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); -#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Rename kvm_slot_can_be_private() to kvm_slot_has_gmem() to improve clarity and accurately reflect its purpose. The function kvm_slot_can_be_private() was previously used to check if a given kvm_memory_slot is backed by guest_memfd. However, its name implied that the memory in such a slot was exclusively "private". As guest_memfd support expands to include non-private memory (e.g., shared host mappings), it's important to remove this association. The new name, kvm_slot_has_gmem(), states that the slot is backed by guest_memfd without making assumptions about the memory's privacy attributes. 
Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/svm/sev.c | 4 ++-- include/linux/kvm_host.h | 2 +- virt/kvm/guest_memfd.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e838cb6c9e1..fdc2824755ee 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3312,7 +3312,7 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn) { - bool is_private = kvm_slot_can_be_private(slot) && + bool is_private = kvm_slot_has_gmem(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); @@ -4551,7 +4551,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, { int max_order, r; - if (!kvm_slot_can_be_private(fault->slot)) { + if (!kvm_slot_has_gmem(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2fbdebf79fbb..7744c210f947 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2365,7 +2365,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) mutex_lock(&kvm->slots_lock); memslot = gfn_to_memslot(kvm, params.gfn_start); - if (!kvm_slot_can_be_private(memslot)) { + if (!kvm_slot_has_gmem(memslot)) { ret = -EINVAL; goto out; } @@ -4719,7 +4719,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) } slot = gfn_to_memslot(kvm, gfn); - if (!kvm_slot_can_be_private(slot)) { + if (!kvm_slot_has_gmem(slot)) { pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", gpa); return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ddfb6cfe20a6..4c5e0a898652 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -615,7 +615,7 @@ struct kvm_memory_slot { #endif }; -static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +static inline bool kvm_slot_has_gmem(const struct kvm_memory_slot *slot) { return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index b2b50560e80e..a99e11b8b77f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -643,7 +643,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long return -EINVAL; slot = gfn_to_memslot(kvm, start_gfn); - if (!kvm_slot_can_be_private(slot)) + if (!kvm_slot_has_gmem(slot)) return -EINVAL; file = kvm_gmem_get_file(slot); -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Fix comments so that they refer to slots_lock instead of slots_locks (remove trailing s). 
Reviewed-by: David Hildenbrand Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4c5e0a898652..5c25b03d3d50 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -860,7 +860,7 @@ struct kvm { struct notifier_block pm_notifier; #endif #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES - /* Protected by slots_locks (for writes) and RCU (for reads) */ + /* Protected by slots_lock (for writes) and RCU (for reads) */ struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 25a94eed75fd..aa86dfd757db 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, * All current use cases for flushing the TLBs for a specific memslot * are related to dirty logging, and many do the TLB flush out of * mmu_lock. The interaction between the various operations on memslot - * must be serialized by slots_locks to ensure the TLB flush from one + * must be serialized by slots_lock to ensure the TLB flush from one * operation is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba The comment that points to the path where the user-visible memslot flags are refers to an outdated path and has a typo. Update the comment to refer to the correct path. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c25b03d3d50..56ea8c862cfd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -52,7 +52,7 @@ /* * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * used in kvm, other bits are visible for userspace which are defined in - * include/linux/kvm_h. + * include/uapi/linux/kvm.h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Enable KVM_GUEST_MEMFD for all KVM x86 64-bit builds, i.e. for "default" VM types when running on 64-bit KVM. This will allow using guest_memfd to back non-private memory for all VM shapes, by supporting mmap() on guest_memfd. Opportunistically clean up various conditionals that become tautologies once x86 selects KVM_GUEST_MEMFD more broadly. Specifically, because SW protected VMs, SEV, and TDX are all 64-bit only, private memory no longer needs to take explicit dependencies on KVM_GUEST_MEMFD, because it is effectively a prerequisite. 
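From userspace, the net effect is that a regular (non-CoCo) x86-64 VM can now create a guest_memfd and bind it to a memslot. A hedged sketch using the pre-existing guest_memfd UAPI (field names per include/uapi/linux/kvm.h; error handling trimmed):

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Bind a 2MiB guest_memfd to slot 0 of an existing VM fd. */
  static int bind_gmem_slot(int vm_fd)
  {
          struct kvm_create_guest_memfd gmem = { .size = 2 * 1024 * 1024 };
          struct kvm_userspace_memory_region2 region = {
                  .slot            = 0,
                  .flags           = KVM_MEM_GUEST_MEMFD,
                  .guest_phys_addr = 0x100000,
                  .memory_size     = gmem.size,
          };
          int gmem_fd;

          if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD) <= 0)
                  return -1;

          gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
          if (gmem_fd < 0)
                  return -1;

          /*
           * userspace_addr is deliberately left unset; mmap() support for
           * guest_memfd, so that gmem can also back host-mapped memory, is
           * added by later patches in this series.
           */
          region.guest_memfd = gmem_fd;
          region.guest_memfd_offset = 0;
          return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
  }

For a given VM, KVM_CAP_GUEST_MEMFD used to depend on kvm_arch_has_private_mem(); with this change it reports 1 for any VM on a 64-bit x86 build.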
Suggested-by: Sean Christopherson Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Reviewed-by: David Hildenbrand Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 +--- arch/x86/kvm/Kconfig | 12 ++++-------- include/linux/kvm_host.h | 9 ++------- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 9 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7b0f2b3e492d..50366a1ca192 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_GUEST_MEMFD +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) -#else -#define kvm_arch_has_private_mem(kvm) false #endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c763446d9b9f..4e43923656d0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -47,6 +47,7 @@ config KVM_X86 select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY select KVM_WERROR if WERROR + select KVM_GUEST_MEMFD if X86_64 config KVM tristate "Kernel-based Virtual Machine (KVM) support" @@ -79,16 +80,11 @@ config KVM_WERROR If in doubt, say "N". -config KVM_X86_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_GUEST_MEMFD - bool - config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -138,7 +134,7 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) @@ -162,7 +158,7 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE select HAVE_KVM_ARCH_GMEM_POPULATE diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 56ea8c862cfd..4d1c44622056 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -719,11 +719,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif -/* - * Arch code must define kvm_arch_has_private_mem if support for guest_memfd is - * enabled. 
- */ -#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) +#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; @@ -2505,8 +2501,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { - return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) && - kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; + return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa86dfd757db..4f57cb92e109 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1588,7 +1588,7 @@ static int check_memory_region_flags(struct kvm *kvm, { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; - if (kvm_arch_has_private_mem(kvm)) + if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) valid_flags |= KVM_MEM_GUEST_MEMFD; /* Dirty logging private memory is not currently supported. */ @@ -4917,7 +4917,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: - return !kvm || kvm_arch_has_private_mem(kvm); + return 1; #endif default: break; -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Introduce the core infrastructure to enable host userspace to mmap() guest_memfd-backed memory. This is needed for several evolving KVM use cases: * Non-CoCo VM backing: Allows VMMs like Firecracker to run guests entirely backed by guest_memfd, even for non-CoCo VMs [1]. This provides a unified memory management model and simplifies guest memory handling. * Direct map removal for enhanced security: This is an important step for direct map removal of guest memory [2]. By allowing host userspace to fault in guest_memfd pages directly, we can avoid maintaining host kernel direct maps of guest memory. This provides additional hardening against Spectre-like transient execution attacks by removing a potential attack surface within the kernel. * Future guest_memfd features: This also lays the groundwork for future enhancements to guest_memfd, such as supporting huge pages and enabling in-place sharing of guest memory with the host for CoCo platforms that permit it [3]. Enable the basic mmap and fault handling logic within guest_memfd, but hold off on allow userspace to actually do mmap() until the architecture support is also in place. [1] https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hiding [2] https://lore.kernel.org/linux-mm/cc1bb8e9bc3e1ab637700a4d3defeec95b55060a.camel@amazon.com [3] https://lore.kernel.org/all/c1c9591d-218a-495c-957b-ba356c8f8e09@redhat.com/T/#u Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Acked-by: David Hildenbrand Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 11 +++++++ include/linux/kvm_host.h | 4 +++ virt/kvm/guest_memfd.c | 70 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..e5cd54ba1eaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13518,6 +13518,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory + * (the private vs. 
shared tracking needs to be moved into guest_memfd). + */ +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return !kvm_arch_has_private_mem(kvm); +} + #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { @@ -13531,6 +13541,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif +#endif int kvm_spec_ctrl_test_value(u64 value) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4d1c44622056..26bad600f9fa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -726,6 +726,10 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) } #endif +#ifdef CONFIG_KVM_GUEST_MEMFD +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm); +#endif + #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index a99e11b8b77f..67e7cd7210ef 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -312,7 +312,72 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) return gfn - slot->base_gfn + slot->gmem.pgoff; } +static bool kvm_gmem_supports_mmap(struct inode *inode) +{ + return false; +} + +static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct folio *folio; + vm_fault_t ret = VM_FAULT_LOCKED; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + folio = kvm_gmem_get_folio(inode, vmf->pgoff); + if (IS_ERR(folio)) { + int err = PTR_ERR(folio); + + if (err == -EAGAIN) + return VM_FAULT_RETRY; + + return vmf_error(err); + } + + if (WARN_ON_ONCE(folio_test_large(folio))) { + ret = VM_FAULT_SIGBUS; + goto out_folio; + } + + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + kvm_gmem_mark_prepared(folio); + } + + vmf->page = folio_file_page(folio, vmf->pgoff); + +out_folio: + if (ret != VM_FAULT_LOCKED) { + folio_unlock(folio); + folio_put(folio); + } + + return ret; +} + +static const struct vm_operations_struct kvm_gmem_vm_ops = { + .fault = kvm_gmem_fault_user_mapping, +}; + +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!kvm_gmem_supports_mmap(file_inode(file))) + return -ENODEV; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != + (VM_SHARED | VM_MAYSHARE)) { + return -EINVAL; + } + + vma->vm_ops = &kvm_gmem_vm_ops; + + return 0; +} + static struct file_operations kvm_gmem_fops = { + .mmap = kvm_gmem_mmap, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -391,6 +456,11 @@ static const struct inode_operations kvm_gmem_iops = { .setattr = kvm_gmem_setattr, }; +bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return true; +} + static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { const char *anon_name = "[kvm-gmem]"; -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Add a new internal flag, KVM_MEMSLOT_GMEM_ONLY, to the top half of memslot->flags (which makes it strictly for KVM's internal use). This flag tracks when a guest_memfd-backed memory slot supports host userspace mmap operations, which implies that all memory, not just private memory for CoCo VMs, is consumed through guest_memfd: "gmem only". 
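For contrast, a check that did not cache this in the memslot would have to go through the guest_memfd file on every call, along these lines (hypothetical sketch for illustration only; kvm_gmem_get_file() and kvm_gmem_supports_mmap() are local to guest_memfd.c):

  static bool kvm_memslot_is_gmem_only_uncached(struct kvm_memory_slot *slot)
  {
          struct file *file;
          bool ret;

          if (!kvm_slot_has_gmem(slot))
                  return false;

          file = kvm_gmem_get_file(slot);         /* takes a file reference */
          if (!file)
                  return false;

          ret = kvm_gmem_supports_mmap(file_inode(file));
          fput(file);                             /* ...which must be dropped */
          return ret;
  }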
This optimization avoids repeatedly checking the underlying guest_memfd file for mmap support, which would otherwise require taking and releasing a reference on the file for each check. By caching this information directly in the memslot, we reduce overhead and simplify the logic involved in handling guest_memfd-backed pages for host mappings. Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 11 ++++++++++- virt/kvm/guest_memfd.c | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26bad600f9fa..8b47891adca1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -54,7 +54,8 @@ * used in kvm, other bits are visible for userspace which are defined in * include/uapi/linux/kvm.h. */ -#define KVM_MEMSLOT_INVALID (1UL << 16) +#define KVM_MEMSLOT_INVALID (1UL << 16) +#define KVM_MEMSLOT_GMEM_ONLY (1UL << 17) /* * Bit 63 of the memslot generation number is an "update in-progress flag", @@ -2490,6 +2491,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } +static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot) +{ + if (!IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) + return false; + + return slot->flags & KVM_MEMSLOT_GMEM_ONLY; +} + #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 67e7cd7210ef..d5b445548af4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -578,6 +578,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, */ WRITE_ONCE(slot->gmem.file, file); slot->gmem.pgoff = start; + if (kvm_gmem_supports_mmap(inode)) + slot->flags |= KVM_MEMSLOT_GMEM_ONLY; xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); -- 2.50.1.552.g942d659e1b-goog From: Ackerley Tng Rename kvm_x86_ops.private_max_mapping_level() to .gmem_max_mapping_level() in anticipation of extending guest_memfd support to non-private memory. No functional change intended. 
Reviewed-by: Xiaoyao Li Acked-by: David Hildenbrand Signed-off-by: Ackerley Tng Signed-off-by: Fuad Tabba Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 4 ++-- arch/x86/kvm/vmx/main.c | 6 +++--- arch/x86/kvm/vmx/tdx.c | 2 +- arch/x86/kvm/vmx/x86_ops.h | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 18a5c3119e1a..62c3e4de3303 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) -KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) +KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level) KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 50366a1ca192..c0a739bf3829 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1922,7 +1922,7 @@ struct kvm_x86_ops { void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); - int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); + int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fdc2824755ee..b735611e8fcd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4532,7 +4532,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); + req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn); if (req_max_level) max_level = min(max_level, req_max_level); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7744c210f947..be1c80d79331 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4947,7 +4947,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) } } -int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { int level, rc; bool assigned; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..8a66e2e985a4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5180,7 +5180,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .gmem_prepare = sev_gmem_prepare, .gmem_invalidate = sev_gmem_invalidate, - .private_max_mapping_level = sev_private_max_mapping_level, + .gmem_max_mapping_level = sev_gmem_max_mapping_level, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 58b9d168e0c8..d84a83ae18a1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); -int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t 
pfn); +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else @@ -895,7 +895,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in return 0; } static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} -static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; } diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index dbab1c15b0cd..dd7687ef7e2d 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -831,10 +831,10 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } -static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { if (is_td(kvm)) - return tdx_gmem_private_max_mapping_level(kvm, pfn); + return tdx_gmem_max_mapping_level(kvm, pfn); return 0; } @@ -1005,7 +1005,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), - .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) + .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) }; struct kvm_x86_init_ops vt_init_ops __initdata = { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 66744f5768c8..b444714e8e8a 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3318,7 +3318,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return ret; } -int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return PG_LEVEL_4K; } diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 2b3424f638db..6037d1708485 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); -int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ -- 2.50.1.552.g942d659e1b-goog Move kvm_max_level_for_order() and kvm_max_private_mapping_level() up in mmu.c so that they can be used by __kvm_mmu_max_mapping_level(). Opportunistically drop the "inline" from kvm_max_level_for_order(). No functional change intended. 
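For reference, with x86's 4KiB base pages the order-to-level translation done by the helper being moved works out as follows (worked example; the KVM_HPAGE_GFN_SHIFT values are the standard x86 ones):

  /*
   * KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) == 9    (2MiB == 2^9  base pages)
   * KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) == 18   (1GiB == 2^18 base pages)
   *
   * kvm_max_level_for_order(0)  -> PG_LEVEL_4K
   * kvm_max_level_for_order(9)  -> PG_LEVEL_2M
   * kvm_max_level_for_order(18) -> PG_LEVEL_1G
   *
   * Any other order trips the KVM_MMU_WARN_ON() in the helper; the order fed
   * in is the max_order/gmem_order reported by kvm_gmem_get_pfn() for the
   * backing folio.
   */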
Reviewed-by: Xiaoyao Li Reviewed-by: Ackerley Tng Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 72 +++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b735611e8fcd..20dd9f64156e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3285,6 +3285,42 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, return level; } +static u8 kvm_max_level_for_order(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) +{ + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return max_level; +} + static int __kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, int max_level, bool is_private) @@ -4503,42 +4539,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) vcpu->stat.pf_fixed++; } -static inline u8 kvm_max_level_for_order(int order) -{ - BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); - - KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && - order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && - order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); - - if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) - return PG_LEVEL_1G; - - if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) - return PG_LEVEL_2M; - - return PG_LEVEL_4K; -} - -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, - u8 max_level, int gmem_order) -{ - u8 req_max_level; - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - max_level = min(kvm_max_level_for_order(gmem_order), max_level); - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn); - if (req_max_level) - max_level = min(max_level, req_max_level); - - return max_level; -} - static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { -- 2.50.1.552.g942d659e1b-goog Rework kvm_mmu_max_mapping_level() to provide the plumbing to consult guest_memfd (and relevant vendor code) when recovering hugepages, e.g. after disabling live migration. The flaw has existed since guest_memfd was originally added, but has gone unnoticed due to lack of guest_memfd support for hugepages or dirty logging. Don't actually call into guest_memfd at this time, as it's unclear as to what the API should be. Ideally, KVM would simply use kvm_gmem_get_pfn(), but invoking kvm_gmem_get_pfn() would lead to sleeping in atomic context if guest_memfd needed to allocate memory (mmu_lock is held). Luckily, the path isn't actually reachable, so just add a TODO and WARN to ensure the functionality is added alongisde guest_memfd hugepage support, and punt the guest_memfd API design question to the future. 
Note, calling kvm_mem_is_private() in the non-fault path is safe, so long as mmu_lock is held, as hugepage recovery operates on shadow-present SPTEs, i.e. calling kvm_mmu_max_mapping_level() with @fault=NULL is mutually exclusive with kvm_vm_set_mem_attributes() changing the PRIVATE attribute of the gfn. Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 82 +++++++++++++++++++-------------- arch/x86/kvm/mmu/mmu_internal.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 3 files changed, 49 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 20dd9f64156e..61eb9f723675 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3302,31 +3302,54 @@ static u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, - u8 max_level, int gmem_order) +static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn) { - u8 req_max_level; + u8 max_level, coco_level; + kvm_pfn_t pfn; - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; + /* For faults, use the gmem information that was resolved earlier. */ + if (fault) { + pfn = fault->pfn; + max_level = fault->max_level; + } else { + /* TODO: Call into guest_memfd once hugepages are supported. */ + WARN_ONCE(1, "Get pfn+order from guest_memfd"); + pfn = KVM_PFN_ERR_FAULT; + max_level = PG_LEVEL_4K; + } - max_level = min(kvm_max_level_for_order(gmem_order), max_level); if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; + return max_level; - req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn); - if (req_max_level) - max_level = min(max_level, req_max_level); + /* + * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT + * restrictions. A return of '0' means "no additional restrictions", to + * allow for using an optional "ret0" static call. 
+ */ + coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn); + if (coco_level) + max_level = min(max_level, coco_level); return max_level; } -static int __kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, int max_level, bool is_private) +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn) { struct kvm_lpage_info *linfo; - int host_level; + int host_level, max_level; + bool is_private; + + lockdep_assert_held(&kvm->mmu_lock); + + if (fault) { + max_level = fault->max_level; + is_private = fault->is_private; + } else { + max_level = PG_LEVEL_NUM; + is_private = kvm_mem_is_private(kvm, gfn); + } max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -3335,25 +3358,16 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, break; } + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + if (is_private) - return max_level; - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - host_level = host_pfn_mapping_level(kvm, gfn, slot); + host_level = kvm_max_private_mapping_level(kvm, fault, slot, gfn); + else + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } -int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn) -{ - bool is_private = kvm_slot_has_gmem(slot) && - kvm_mem_is_private(kvm, gfn); - - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); -} - void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -3374,9 +3388,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->max_level, - fault->is_private); + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault, + fault->slot, fault->gfn); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -4564,8 +4577,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, } fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); - fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, - fault->max_level, max_order); + fault->max_level = kvm_max_level_for_order(max_order); return RET_PF_CONTINUE; } @@ -7165,7 +7177,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, * mapping if the indirect sp has level = 1. 
*/ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 65f3c89d7c5d..b776be783a2f 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return r; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7f3d7229b2c1..740cb06accdb 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1813,7 +1813,7 @@ static void recover_huge_pages_range(struct kvm *kvm, if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn); if (max_mapping_level < iter.level) continue; -- 2.50.1.552.g942d659e1b-goog Rework kvm_mmu_max_mapping_level() to consult guest_memfd for all mappings, not just private mappings, so that hugepage support plays nice with the upcoming support for backing non-private memory with guest_memfd. In addition to getting the max order from guest_memfd for gmem-only memslots, update TDX's hook to effectively ignore shared mappings, as TDX's restrictions on page size only apply to Secure EPT mappings. Do nothing for SNP, as RMP restrictions apply to both private and shared memory. 
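Concretely, the reworked hook composes as follows for a guest_memfd-backed gfn (worked example derived from the handlers changed below):

  /*
   * TDX VM, shared gfn in a gmem-only slot:
   *   tdx_gmem_max_mapping_level(kvm, pfn, false) -> 0, i.e. no extra limit,
   *   so only the gmem folio order and the generic limits in
   *   kvm_mmu_max_mapping_level() cap the mapping level.
   *
   * TDX VM, private gfn:
   *   tdx_gmem_max_mapping_level(kvm, pfn, true) -> PG_LEVEL_4K, i.e. Secure
   *   EPT mappings stay 4KiB for now.
   *
   * SNP VM, shared or private gfn:
   *   sev_gmem_max_mapping_level() consults the RMP either way, as RMP
   *   restrictions apply to both.
   */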
Suggested-by: Ackerley Tng Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 12 +++++++----- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.h | 4 ++-- arch/x86/kvm/vmx/main.c | 5 +++-- arch/x86/kvm/vmx/tdx.c | 5 ++++- arch/x86/kvm/vmx/x86_ops.h | 2 +- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c0a739bf3829..c56cc54d682a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1922,7 +1922,7 @@ struct kvm_x86_ops { void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); - int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); + int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 61eb9f723675..e83d666f32ad 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3302,8 +3302,9 @@ static u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } -static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, - const struct kvm_memory_slot *slot, gfn_t gfn) +static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn, + bool is_private) { u8 max_level, coco_level; kvm_pfn_t pfn; @@ -3327,7 +3328,7 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault * * restrictions. A return of '0' means "no additional restrictions", to * allow for using an optional "ret0" static call. */ - coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn); + coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private); if (coco_level) max_level = min(max_level, coco_level); @@ -3361,8 +3362,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - if (is_private) - host_level = kvm_max_private_mapping_level(kvm, fault, slot, gfn); + if (is_private || kvm_memslot_is_gmem_only(slot)) + host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn, + is_private); else host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index be1c80d79331..807d4b70327a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4947,7 +4947,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) } } -int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { int level, rc; bool assigned; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d84a83ae18a1..70df7c6413cf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); -int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); void 
sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else @@ -895,7 +895,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in return 0; } static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} -static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { return 0; } diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index dd7687ef7e2d..bb5f182f6788 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } -static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + bool is_private) { if (is_td(kvm)) - return tdx_gmem_max_mapping_level(kvm, pfn); + return tdx_gmem_max_mapping_level(kvm, pfn, is_private); return 0; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b444714e8e8a..ca9c8ec7dd01 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3318,8 +3318,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return ret; } -int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { + if (!is_private) + return 0; + return PG_LEVEL_4K; } diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 6037d1708485..4c70f56c57c8 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); -int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ -- 2.50.1.552.g942d659e1b-goog From: Ackerley Tng Update the KVM MMU fault handler to service guest page faults for memory slots backed by guest_memfd with mmap support. For such slots, the MMU must always fault in pages directly from guest_memfd, bypassing the host's userspace_addr. This ensures that guest_memfd-backed memory is always handled through the guest_memfd specific faulting path, regardless of whether it's for private or non-private (shared) use cases. Additionally, rename kvm_mmu_faultin_pfn_private() to kvm_mmu_faultin_pfn_gmem(), as this function is now used to fault in pages from guest_memfd for both private and non-private memory, accommodating the new use cases. 
Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Ackerley Tng Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba [sean: drop the helper] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e83d666f32ad..56c80588efa0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4561,8 +4561,8 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, r == RET_PF_RETRY, fault->map_writable); } -static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; @@ -4589,8 +4589,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, { unsigned int foll = fault->write ? FOLL_WRITE : 0; - if (fault->is_private) - return kvm_mmu_faultin_pfn_private(vcpu, fault); + if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot)) + return kvm_mmu_faultin_pfn_gmem(vcpu, fault); foll |= FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Refactor user_mem_abort() to improve code clarity and simplify assumptions within the function. Key changes include: * Immediately set force_pte to true at the beginning of the function if logging_active is true. This simplifies the flow and makes the condition for forcing a PTE more explicit. * Remove the misleading comment stating that logging_active is guaranteed to never be true for VM_PFNMAP memslots, as this assertion is not entirely correct. * Extract reusable code blocks into new helper functions: * prepare_mmu_memcache(): Encapsulates the logic for preparing and topping up the MMU page cache. * adjust_nested_fault_perms(): Isolates the adjustments to shadow S2 permissions and the encoding of nested translation levels. * Update min(a, (long)b) to min_t(long, a, b) for better type safety and consistency. * Perform other minor tidying up of the code. These changes primarily aim to simplify user_mem_abort() and make its logic easier to understand and maintain, setting the stage for future modifications. Reviewed-by: Gavin Shan Reviewed-by: Marc Zyngier Reviewed-by: Tao Chan Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- arch/arm64/kvm/mmu.c | 110 +++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 2942ec92c5a4..b3eacb400fab 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1470,13 +1470,56 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, + void **memcache) +{ + int min_pages; + + if (!is_protected_kvm_enabled()) + *memcache = &vcpu->arch.mmu_page_cache; + else + *memcache = &vcpu->arch.pkvm_memcache; + + if (!topup_memcache) + return 0; + + min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) + return kvm_mmu_topup_memory_cache(*memcache, min_pages); + + return topup_hyp_memcache(*memcache, min_pages); +} + +/* + * Potentially reduce shadow S2 permissions to match the guest's own S2. For + * exec faults, we'd only reach this point if the guest actually allowed it (see + * kvm_s2_handle_perm_fault). 
+ * + * Also encode the level of the original translation in the SW bits of the leaf + * entry as a proxy for the span of that translation. This will be retrieved on + * TLB invalidation from the guest and used to limit the invalidation scope if a + * TTL hint or a range isn't provided. + */ +static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot, + bool *writable) +{ + *writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + *prot &= ~KVM_PGTABLE_PROT_R; + + *prot |= kvm_encode_nested_level(nested); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, bool fault_is_perm) { int ret = 0; - bool write_fault, writable, force_pte = false; + bool topup_memcache; + bool write_fault, writable; bool exec_fault, mte_allowed; bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; @@ -1488,6 +1531,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); + bool force_pte = logging_active; long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; @@ -1498,17 +1542,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); - VM_BUG_ON(write_fault && exec_fault); - - if (fault_is_perm && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } - - if (!is_protected_kvm_enabled()) - memcache = &vcpu->arch.mmu_page_cache; - else - memcache = &vcpu->arch.pkvm_memcache; + VM_WARN_ON_ONCE(write_fault && exec_fault); /* * Permission faults just need to update the existing leaf entry, @@ -1516,17 +1550,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (!fault_is_perm || (logging_active && write_fault)) { - int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); - - if (!is_protected_kvm_enabled()) - ret = kvm_mmu_topup_memory_cache(memcache, min_pages); - else - ret = topup_hyp_memcache(memcache, min_pages); - - if (ret) - return ret; - } + topup_memcache = !fault_is_perm || (logging_active && write_fault); + ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); + if (ret) + return ret; /* * Let's check if we will get back a huge page backed by hugetlbfs, or @@ -1540,16 +1567,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - /* - * logging_active is guaranteed to never be true for VM_PFNMAP - * memslots. 
- */ - if (logging_active) { - force_pte = true; + if (force_pte) vma_shift = PAGE_SHIFT; - } else { + else vma_shift = get_vma_page_shift(vma, hva); - } switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED @@ -1601,7 +1622,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, max_map_size = PAGE_SIZE; force_pte = (max_map_size == PAGE_SIZE); - vma_pagesize = min(vma_pagesize, (long)max_map_size); + vma_pagesize = min_t(long, vma_pagesize, max_map_size); } /* @@ -1630,7 +1651,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmu_seq = kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, @@ -1665,24 +1686,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - /* - * Potentially reduce shadow S2 permissions to match the guest's own - * S2. For exec faults, we'd only reach this point if the guest - * actually allowed it (see kvm_s2_handle_perm_fault). - * - * Also encode the level of the original translation in the SW bits - * of the leaf entry as a proxy for the span of that translation. - * This will be retrieved on TLB invalidation from the guest and - * used to limit the invalidation scope if a TTL hint or a range - * isn't provided. - */ - if (nested) { - writable &= kvm_s2_trans_writable(nested); - if (!kvm_s2_trans_readable(nested)) - prot &= ~KVM_PGTABLE_PROT_R; - - prot |= kvm_encode_nested_level(nested); - } + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; @@ -1953,6 +1958,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } + VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && + !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, esr_fsc_is_permission_fault(esr)); if (ret == 0) -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Add arm64 architecture support for handling guest page faults on memory slots backed by guest_memfd. This change introduces a new function, gmem_abort(), which encapsulates the fault handling logic specific to guest_memfd-backed memory. The kvm_handle_guest_abort() entry point is updated to dispatch to gmem_abort() when a fault occurs on a guest_memfd-backed memory slot (as determined by kvm_slot_has_gmem()). Until guest_memfd gains support for huge pages, the fault granule for these memory regions is restricted to PAGE_SIZE. 
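Condensed, the core of the new gmem_abort() path (shown in full in the diff below) is to fetch the PFN from guest_memfd and map exactly one page at stage-2; memcache setup, locking, permissions, and retry handling are omitted in this outline:

	/* Outline only; see the diff for memcache, locking, and retry handling. */
	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
	if (ret)
		return ret;	/* surfaced to userspace as a memory fault exit */

	/* guest_memfd has no huge page support yet, so map a single page. */
	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
						 __pfn_to_phys(pfn), prot,
						 memcache, flags);
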
Reviewed-by: Gavin Shan Reviewed-by: James Houghton Reviewed-by: Marc Zyngier Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- arch/arm64/kvm/mmu.c | 86 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index b3eacb400fab..8c82df80a835 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1512,6 +1512,82 @@ static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, *prot |= kvm_encode_nested_level(nested); } +#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) + +static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, + struct kvm_memory_slot *memslot, bool is_perm) +{ + bool write_fault, exec_fault, writable; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; + unsigned long mmu_seq; + struct page *page; + struct kvm *kvm = vcpu->kvm; + void *memcache; + kvm_pfn_t pfn; + gfn_t gfn; + int ret; + + ret = prepare_mmu_memcache(vcpu, true, &memcache); + if (ret) + return ret; + + if (nested) + gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; + else + gfn = fault_ipa >> PAGE_SHIFT; + + write_fault = kvm_is_write_fault(vcpu); + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); + + VM_WARN_ON_ONCE(write_fault && exec_fault); + + mmu_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, + write_fault, exec_fault, false); + return ret; + } + + writable = !(memslot->flags & KVM_MEM_READONLY); + + if (nested) + adjust_nested_fault_perms(nested, &prot, &writable); + + if (writable) + prot |= KVM_PGTABLE_PROT_W; + + if (exec_fault || + (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && + (!nested || kvm_s2_trans_executable(nested)))) + prot |= KVM_PGTABLE_PROT_X; + + kvm_fault_lock(kvm); + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); + + if (writable && !ret) + mark_page_dirty_in_slot(kvm, memslot, gfn); + + return ret != -EAGAIN ? 
ret : 0; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, @@ -1536,7 +1612,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; - enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1961,8 +2037,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); - ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, - esr_fsc_is_permission_fault(esr)); + if (kvm_slot_has_gmem(memslot)) + ret = gmem_abort(vcpu, fault_ipa, nested, memslot, + esr_fsc_is_permission_fault(esr)); + else + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, + esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Handle faults for memslots backed by guest_memfd in arm64 nested virtualization triggered by VNCR_EL2. * Introduce is_gmem output parameter to kvm_translate_vncr(), indicating whether the faulted memory slot is backed by guest_memfd. * Dispatch faults backed by guest_memfd to kvm_gmem_get_pfn(). * Update kvm_handle_vncr_abort() to handle potential guest_memfd errors. Some of the guest_memfd errors need to be handled by userspace instead of attempting to (implicitly) retry by returning to the guest. Suggested-by: Marc Zyngier Reviewed-by: Marc Zyngier Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- arch/arm64/kvm/nested.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index dc1d26559bfa..b3edd7f7c8cd 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu) return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); } -static int kvm_translate_vncr(struct kvm_vcpu *vcpu) +static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem) { + struct kvm_memory_slot *memslot; bool write_fault, writable; unsigned long mmu_seq; struct vncr_tlb *vt; @@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu) smp_rmb(); gfn = vt->wr.pa >> PAGE_SHIFT; - pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page); - if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + memslot = gfn_to_memslot(vcpu->kvm, gfn); + if (!memslot) return -EFAULT; + *is_gmem = kvm_slot_has_gmem(memslot); + if (!*is_gmem) { + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? 
FOLL_WRITE : 0, + &writable, &page); + if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + return -EFAULT; + } else { + ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL); + if (ret) { + kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE, + write_fault, false, false); + return ret; + } + } + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) return -EAGAIN; @@ -1292,23 +1308,36 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) if (esr_fsc_is_permission_fault(esr)) { inject_vncr_perm(vcpu); } else if (esr_fsc_is_translation_fault(esr)) { - bool valid; + bool valid, is_gmem = false; int ret; scoped_guard(read_lock, &vcpu->kvm->mmu_lock) valid = kvm_vncr_tlb_lookup(vcpu); if (!valid) - ret = kvm_translate_vncr(vcpu); + ret = kvm_translate_vncr(vcpu, &is_gmem); else ret = -EPERM; switch (ret) { case -EAGAIN: - case -ENOMEM: /* Let's try again... */ break; + case -ENOMEM: + /* + * For guest_memfd, this indicates that it failed to + * create a folio to back the memory. Inform userspace. + */ + if (is_gmem) + return 0; + /* Otherwise, let's try again... */ + break; case -EFAULT: + case -EIO: + case -EHWPOISON: + if (is_gmem) + return 0; + fallthrough; case -EINVAL: case -ENOENT: case -EACCES: -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Now that the infrastructure is in place, enable guest_memfd for arm64. * Select CONFIG_KVM_GUEST_MEMFD in KVM/arm64 Kconfig. * Enforce KVM_MEMSLOT_GMEM_ONLY for guest_memfd on arm64: Ensure that guest_memfd-backed memory slots on arm64 are only supported if they are intended for shared memory use cases (i.e., kvm_memslot_is_gmem_only() is true). This design reflects the current arm64 KVM ecosystem where guest_memfd is primarily being introduced for VMs that support shared memory. Reviewed-by: James Houghton Reviewed-by: Gavin Shan Reviewed-by: Marc Zyngier Acked-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- arch/arm64/kvm/Kconfig | 1 + arch/arm64/kvm/mmu.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 713248f240e0..bff62e75d681 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -37,6 +37,7 @@ menuconfig KVM select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS + select KVM_GUEST_MEMFD help Support hosting virtualized guest machines. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8c82df80a835..85559b8a0845 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2276,6 +2276,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) return -EFAULT; + /* + * Only support guest_memfd backed memslots with mappable memory, since + * there aren't any CoCo VMs that support only private memory on arm64. + */ + if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new)) + return -EINVAL; + hva = new->userspace_addr; reg_end = hva + (new->npages << PAGE_SHIFT); -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Now that all the x86 and arm64 plumbing for mmap() on guest_memfd is in place, allow userspace to set GUEST_MEMFD_FLAG_MMAP and advertise support via a new capability, KVM_CAP_GUEST_MEMFD_MMAP. The availability of this capability is determined per architecture, and its enablement for a specific guest_memfd instance is controlled by the GUEST_MEMFD_FLAG_MMAP flag at creation time. 
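For illustration, the intended userspace flow is roughly the following sketch (illustrative only, error handling omitted; vm_fd is assumed to be an existing VM file descriptor and the 2MiB size is arbitrary):

	struct kvm_create_guest_memfd args = {
		.size  = 2 * 1024 * 1024,
		.flags = GUEST_MEMFD_FLAG_MMAP,
	};
	int gmem_fd;
	void *mem;

	/* The capability can be checked per-VM, as support depends on the arch and VM type. */
	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD_MMAP) > 0) {
		gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
		mem = mmap(NULL, args.size, PROT_READ | PROT_WRITE,
			   MAP_SHARED, gmem_fd, 0);
	}

Without GUEST_MEMFD_FLAG_MMAP, mmap() on the guest_memfd continues to fail as before.
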
Update the KVM API documentation to detail the KVM_CAP_GUEST_MEMFD_MMAP capability, the associated GUEST_MEMFD_FLAG_MMAP, and provide essential information regarding support for mmap in guest_memfd. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 9 +++++++++ include/uapi/linux/kvm.h | 2 ++ virt/kvm/guest_memfd.c | 7 ++++++- virt/kvm/kvm_main.c | 2 ++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fcb783735dd1..1e0c4a68876d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). +When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field +supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation +enables mmap() and faulting of guest_memfd memory to host userspace. + +When the KVM MMU performs a PFN lookup to service a guest fault and the backing +guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be +consumed from guest_memfd, regardless of whether it is a shared or a private +fault. + See KVM_SET_USER_MEMORY_REGION2 for additional details. 4.143 KVM_PRE_FAULT_MEMORY diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index aeb2ca10b190..0d96d2ae6e5d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -961,6 +961,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2 240 #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 +#define KVM_CAP_GUEST_MEMFD_MMAP 243 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1597,6 +1598,7 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) struct kvm_create_guest_memfd { __u64 size; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index d5b445548af4..08a6bc7d25b6 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -314,7 +314,9 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) static bool kvm_gmem_supports_mmap(struct inode *inode) { - return false; + const u64 flags = (u64)inode->i_private; + + return flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -522,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 flags = args->flags; u64 valid_flags = 0; + if (kvm_arch_supports_gmem_mmap(kvm)) + valid_flags |= GUEST_MEMFD_FLAG_MMAP; + if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4f57cb92e109..18f29ef93543 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4918,6 +4918,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; + case KVM_CAP_GUEST_MEMFD_MMAP: + return !kvm || kvm_arch_supports_gmem_mmap(kvm); #endif default: break; -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Update the guest_memfd_test selftest to use getpagesize() instead of hardcoded 4KB page size values. 
Using hardcoded page sizes can cause test failures on architectures or systems configured with larger page sizes, such as arm64 with 64KB pages. By dynamically querying the system's page size, the test becomes more portable and robust across different environments. Additionally, build the guest_memfd_test selftest for arm64. Reviewed-by: David Hildenbrand Reviewed-by: Shivank Garg Reviewed-by: Gavin Shan Suggested-by: Gavin Shan Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/guest_memfd_test.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 40920445bfbe..963687892bcb 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -174,6 +174,7 @@ TEST_GEN_PROGS_arm64 += arch_timer TEST_GEN_PROGS_arm64 += coalesced_io_test TEST_GEN_PROGS_arm64 += dirty_log_perf_test TEST_GEN_PROGS_arm64 += get-reg-list +TEST_GEN_PROGS_arm64 += guest_memfd_test TEST_GEN_PROGS_arm64 += memslot_modification_stress_test TEST_GEN_PROGS_arm64 += memslot_perf_test TEST_GEN_PROGS_arm64 += mmu_stress_test diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index ce687f8d248f..341ba616cf55 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -146,24 +146,25 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) { int fd1, fd2, ret; struct stat st1, st2; + size_t page_size = getpagesize(); - fd1 = __vm_create_guest_memfd(vm, 4096, 0); + fd1 = __vm_create_guest_memfd(vm, page_size, 0); TEST_ASSERT(fd1 != -1, "memfd creation should succeed"); ret = fstat(fd1, &st1); TEST_ASSERT(ret != -1, "memfd fstat should succeed"); - TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size"); + TEST_ASSERT(st1.st_size == page_size, "memfd st_size should match requested size"); - fd2 = __vm_create_guest_memfd(vm, 8192, 0); + fd2 = __vm_create_guest_memfd(vm, page_size * 2, 0); TEST_ASSERT(fd2 != -1, "memfd creation should succeed"); ret = fstat(fd2, &st2); TEST_ASSERT(ret != -1, "memfd fstat should succeed"); - TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size"); + TEST_ASSERT(st2.st_size == page_size * 2, "second memfd st_size should match requested size"); ret = fstat(fd1, &st1); TEST_ASSERT(ret != -1, "memfd fstat should succeed"); - TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); + TEST_ASSERT(st1.st_size == page_size, "first memfd st_size should still match requested size"); TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); close(fd2); -- 2.50.1.552.g942d659e1b-goog From: Fuad Tabba Expand the guest_memfd selftests to comprehensively test host userspace mmap functionality for guest_memfd-backed memory when supported by the VM type. Introduce new test cases to verify the following: * Successful mmap operations: Ensure that MAP_SHARED mappings succeed when guest_memfd mmap is enabled. * Data integrity: Validate that data written to the mmap'd region is correctly persistent and readable. * fallocate interaction: Test that fallocate(FALLOC_FL_PUNCH_HOLE) correctly zeros out mapped pages. * Out-of-bounds access: Verify that accessing memory beyond the guest_memfd's size correctly triggers a SIGBUS signal. 
* Unsupported mmap: Confirm that mmap attempts fail as expected when guest_memfd mmap support is not enabled for the specific guest_memfd instance or VM type. * Flag validity: Introduce test_vm_type_gmem_flag_validity() to systematically test that only allowed guest_memfd creation flags are accepted for different VM types (e.g., GUEST_MEMFD_FLAG_MMAP for default VMs, no flags for CoCo VMs). The existing tests for guest_memfd creation (multiple instances, invalid sizes), file read/write, file size, and invalid punch hole operations are integrated into the new test_with_type() framework to allow testing across different VM types. Cc: James Houghton Cc: Gavin Shan Cc: Shivank Garg Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Signed-off-by: Fuad Tabba Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 161 +++++++++++++++--- 1 file changed, 139 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 341ba616cf55..088053d5f0f5 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -13,6 +13,8 @@ #include #include +#include +#include #include #include #include @@ -34,12 +36,83 @@ static void test_file_read_write(int fd) "pwrite on a guest_mem fd should fail"); } -static void test_mmap(int fd, size_t page_size) +static void test_mmap_supported(int fd, size_t page_size, size_t total_size) +{ + const char val = 0xaa; + char *mem; + size_t i; + int ret; + + mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + TEST_ASSERT(mem == MAP_FAILED, "Copy-on-write not allowed by guest_memfd."); + + mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed."); + + memset(mem, val, total_size); + for (i = 0; i < total_size; i++) + TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, + page_size); + TEST_ASSERT(!ret, "fallocate the first page should succeed."); + + for (i = 0; i < page_size; i++) + TEST_ASSERT_EQ(READ_ONCE(mem[i]), 0x00); + for (; i < total_size; i++) + TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); + + memset(mem, val, page_size); + for (i = 0; i < total_size; i++) + TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); + + ret = munmap(mem, total_size); + TEST_ASSERT(!ret, "munmap() should succeed."); +} + +static sigjmp_buf jmpbuf; +void fault_sigbus_handler(int signum) +{ + siglongjmp(jmpbuf, 1); +} + +static void test_fault_overflow(int fd, size_t page_size, size_t total_size) +{ + struct sigaction sa_old, sa_new = { + .sa_handler = fault_sigbus_handler, + }; + size_t map_size = total_size * 4; + const char val = 0xaa; + char *mem; + size_t i; + int ret; + + mem = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() for guest_memfd should succeed."); + + sigaction(SIGBUS, &sa_new, &sa_old); + if (sigsetjmp(jmpbuf, 1) == 0) { + memset(mem, 0xaa, map_size); + TEST_ASSERT(false, "memset() should have triggered SIGBUS."); + } + sigaction(SIGBUS, &sa_old, NULL); + + for (i = 0; i < total_size; i++) + TEST_ASSERT_EQ(READ_ONCE(mem[i]), val); + + ret = munmap(mem, map_size); + TEST_ASSERT(!ret, "munmap() should succeed."); +} + +static void test_mmap_not_supported(int fd, size_t page_size, size_t total_size) { char *mem; mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, 
MAP_SHARED, fd, 0); TEST_ASSERT_EQ(mem, MAP_FAILED); + + mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT_EQ(mem, MAP_FAILED); } static void test_file_size(int fd, size_t page_size, size_t total_size) @@ -120,26 +193,19 @@ static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size) } } -static void test_create_guest_memfd_invalid(struct kvm_vm *vm) +static void test_create_guest_memfd_invalid_sizes(struct kvm_vm *vm, + uint64_t guest_memfd_flags, + size_t page_size) { - size_t page_size = getpagesize(); - uint64_t flag; size_t size; int fd; for (size = 1; size < page_size; size++) { - fd = __vm_create_guest_memfd(vm, size, 0); - TEST_ASSERT(fd == -1 && errno == EINVAL, + fd = __vm_create_guest_memfd(vm, size, guest_memfd_flags); + TEST_ASSERT(fd < 0 && errno == EINVAL, "guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL", size); } - - for (flag = BIT(0); flag; flag <<= 1) { - fd = __vm_create_guest_memfd(vm, page_size, flag); - TEST_ASSERT(fd == -1 && errno == EINVAL, - "guest_memfd() with flag '0x%lx' should fail with EINVAL", - flag); - } } static void test_create_guest_memfd_multiple(struct kvm_vm *vm) @@ -171,30 +237,81 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) close(fd1); } -int main(int argc, char *argv[]) +static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) { - size_t page_size; - size_t total_size; + size_t page_size = getpagesize(); + uint64_t flag; int fd; + + for (flag = BIT(0); flag; flag <<= 1) { + fd = __vm_create_guest_memfd(vm, page_size, flag); + if (flag & valid_flags) { + TEST_ASSERT(fd >= 0, + "guest_memfd() with flag '0x%lx' should succeed", + flag); + close(fd); + } else { + TEST_ASSERT(fd < 0 && errno == EINVAL, + "guest_memfd() with flag '0x%lx' should fail with EINVAL", + flag); + } + } +} + +static void test_guest_memfd(unsigned long vm_type) +{ + uint64_t flags = 0; struct kvm_vm *vm; - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + size_t total_size; + size_t page_size; + int fd; page_size = getpagesize(); total_size = page_size * 4; - vm = vm_create_barebones(); + vm = vm_create_barebones_type(vm_type); + + if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP)) + flags |= GUEST_MEMFD_FLAG_MMAP; - test_create_guest_memfd_invalid(vm); test_create_guest_memfd_multiple(vm); + test_create_guest_memfd_invalid_sizes(vm, flags, page_size); - fd = vm_create_guest_memfd(vm, total_size, 0); + fd = vm_create_guest_memfd(vm, total_size, flags); test_file_read_write(fd); - test_mmap(fd, page_size); + + if (flags & GUEST_MEMFD_FLAG_MMAP) { + test_mmap_supported(fd, page_size, total_size); + test_fault_overflow(fd, page_size, total_size); + } else { + test_mmap_not_supported(fd, page_size, total_size); + } + test_file_size(fd, page_size, total_size); test_fallocate(fd, page_size, total_size); test_invalid_punch_hole(fd, page_size, total_size); + test_guest_memfd_flags(vm, flags); + close(fd); + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + unsigned long vm_types, vm_type; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + + /* + * Not all architectures support KVM_CAP_VM_TYPES. However, those that + * support guest_memfd have that support for the default VM type. 
+ */ + vm_types = kvm_check_cap(KVM_CAP_VM_TYPES); + if (!vm_types) + vm_types = VM_TYPE_DEFAULT; + + for_each_set_bit(vm_type, &vm_types, BITS_PER_TYPE(vm_types)) + test_guest_memfd(vm_type); } -- 2.50.1.552.g942d659e1b-goog Add a guest_memfd testcase to verify that a vCPU can fault-in guest_memfd memory that supports mmap(), but that is not currently mapped into host userspace and/or has a userspace address (in the memslot) that points at something other than the target guest_memfd range. Mapping guest_memfd memory into the guest is supposed to operate completely independently from any userspace mappings. Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 088053d5f0f5..b86bf89a71e0 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include "kvm_util.h" #include "test_util.h" +#include "ucall_common.h" static void test_file_read_write(int fd) { @@ -298,6 +300,66 @@ static void test_guest_memfd(unsigned long vm_type) kvm_vm_free(vm); } +static void guest_code(uint8_t *mem, uint64_t size) +{ + size_t i; + + for (i = 0; i < size; i++) + __GUEST_ASSERT(mem[i] == 0xaa, + "Guest expected 0xaa at offset %lu, got 0x%x", i, mem[i]); + + memset(mem, 0xff, size); + GUEST_DONE(); +} + +static void test_guest_memfd_guest(void) +{ + /* + * Skip the first 4gb and slot0. slot0 maps <1gb and is used to back + * the guest's code, stack, and page tables, and low memory contains + * the PCI hole and other MMIO regions that need to be avoided. + */ + const uint64_t gpa = SZ_4G; + const int slot = 1; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint8_t *mem; + size_t size; + int fd, i; + + if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP)) + return; + + vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code); + + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP), + "Default VM type should always support guest_memfd mmap()"); + + size = vm->page_size; + fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); + vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed"); + memset(mem, 0xaa, size); + munmap(mem, size); + + virt_pg_map(vm, gpa, gpa); + vcpu_args_set(vcpu, 2, gpa, size); + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() on guest_memfd failed"); + for (i = 0; i < size; i++) + TEST_ASSERT_EQ(mem[i], 0xff); + + close(fd); + kvm_vm_free(vm); +} + int main(int argc, char *argv[]) { unsigned long vm_types, vm_type; @@ -314,4 +376,6 @@ int main(int argc, char *argv[]) for_each_set_bit(vm_type, &vm_types, BITS_PER_TYPE(vm_types)) test_guest_memfd(vm_type); + + test_guest_memfd_guest(); } -- 2.50.1.552.g942d659e1b-goog