Widen the error_code field in struct x86_exception from u16 to u64 to accommodate AMD's NPF error code, which defines information bits above bit 31, e.g. PFERR_GUEST_FINAL_MASK (bit 32), and PFERR_GUEST_PAGE_MASK (bit 33). Retain the u16 type for the local errcode variable in walk_addr_generic as the walker synthesizes conventional #PF error codes that are architecturally limited to bits 15:0. Signed-off-by: Kevin Cheng --- arch/x86/kvm/kvm_emulate.h | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index fb3dab4b5a53..ff4f9b0a01ff 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -22,7 +22,7 @@ enum x86_intercept_stage; struct x86_exception { u8 vector; bool error_code_valid; - u16 error_code; + u64 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 901cd2bd40b8..37eba7dafd14 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -317,6 +317,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; + /* + * Note! Track the error_code that's common to legacy shadow paging + * and NPT shadow paging as a u16 to guard against unintentionally + * setting any of bits 63:16. Architecturally, the #PF error code is + * 32 bits, and Intel CPUs don't support setting bits 31:16. + */ u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; -- 2.53.0.851.ga537e3e6e9-goog Fix nested_svm_inject_npf_exit() to correctly set the fault stage bits (PFERR_GUEST_PAGE_MASK vs PFERR_GUEST_FINAL_MASK) in exit_info_1 when injecting an NPF to L1. 
There are two paths into nested_svm_inject_npf_exit(): hardware NPF exits (guest_mmu walker) and emulation-triggered faults (nested_mmu walker). For emulation, the nested_mmu walker knows whether the fault occurred on a page table page or the final translation, and sets the appropriate bit in fault->error_code via paging_tmpl.h. For hardware NPF exits, the guest_mmu walker cannot determine this. Only hardware knows, via exit_info_1 bits 32-33. The old code hardcoded (1ULL << 32) for the emulation path, always setting PFERR_GUEST_FINAL_MASK even for page table walk faults. For the hardware NPF path, it preserved exit_info_1's upper bits and replaced the lower 32 bits with fault->error_code, which was correct but convoluted. Introduce hardware_nested_page_fault in struct x86_exception to distinguish the two paths. For hardware NPF exits, take the fault stage bits from exit_info_1. For emulation faults, take them from fault->error_code. The lower 32 bits always come from fault->error_code, which reflects L1's NPT state (L0's NPT may differ since KVM only populates it when the full translation succeeds). Add a WARN_ON_ONCE if the final exit_info_1 does not have exactly one of PFERR_GUEST_FINAL_MASK and PFERR_GUEST_PAGE_MASK set (i.e. if both or neither are set), as that would indicate a bug in the fault handling code. 
Signed-off-by: Kevin Cheng --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/mmu/paging_tmpl.h | 26 +++++++++++------------ arch/x86/kvm/svm/nested.c | 37 +++++++++++++++++++++++---------- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d3bdc9828133..134394dc09e6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -281,6 +281,8 @@ enum x86_intercept_stage; #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_FAULT_STAGE_MASK \ + (PFERR_GUEST_FINAL_MASK | PFERR_GUEST_PAGE_MASK) #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index ff4f9b0a01ff..e67982f4da40 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -24,6 +24,7 @@ struct x86_exception { bool error_code_valid; u64 error_code; bool nested_page_fault; + bool hardware_nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; unsigned long exit_qualification; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 37eba7dafd14..ea2b7569f8a4 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -385,18 +385,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), nested_access, &walker->fault); - /* - * FIXME: This can happen if emulation (for of an INS/OUTS - * instruction) triggers a nested page fault. The exit - * qualification / exit info field will incorrectly have - * "guest page access" as the nested page fault's cause, - * instead of "guest page structure access". 
To fix this, - * the x86_exception struct should be augmented with enough - * information to fix the exit_qualification or exit_info_1 - * fields. - */ - if (unlikely(real_gpa == INVALID_GPA)) + if (unlikely(real_gpa == INVALID_GPA)) { +#if PTTYPE != PTTYPE_EPT + walker->fault.error_code |= PFERR_GUEST_PAGE_MASK; +#endif return 0; + } slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); if (!kvm_is_visible_memslot(slot)) @@ -452,8 +446,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, #endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == INVALID_GPA) + if (real_gpa == INVALID_GPA) { +#if PTTYPE != PTTYPE_EPT + walker->fault.error_code |= PFERR_GUEST_FINAL_MASK; +#endif return 0; + } walker->gfn = real_gpa >> PAGE_SHIFT; @@ -787,8 +785,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { - if (!fault->prefetch) + if (!fault->prefetch) { + walker.fault.hardware_nested_page_fault = walker.fault.nested_page_fault; kvm_inject_emulated_page_fault(vcpu, &walker.fault); + } return RET_PF_RETRY; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5ff01d2ac85e..62904ec08dda 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -38,19 +38,34 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; + u64 fault_stage; - if (vmcb->control.exit_code != SVM_EXIT_NPF) { - /* - * TODO: track the cause of the nested page fault, and - * correctly fill in the high bits of exit_info_1. 
- */ - vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_info_1 = (1ULL << 32); - vmcb->control.exit_info_2 = fault->address; - } + /* + * For hardware NPF exits, the GUEST_FAULT_STAGE bits are only + * available in the hardware exit_info_1, since the guest_mmu + * walker doesn't know whether the faulting GPA was a page table + * page or final page from L2's perspective. + */ + if (fault->hardware_nested_page_fault) + fault_stage = vmcb->control.exit_info_1 & + PFERR_GUEST_FAULT_STAGE_MASK; + else + fault_stage = fault->error_code & PFERR_GUEST_FAULT_STAGE_MASK; + + vmcb->control.exit_code = SVM_EXIT_NPF; + vmcb->control.exit_info_1 = fault_stage | fault->error_code; + vmcb->control.exit_info_2 = fault->address; - vmcb->control.exit_info_1 &= ~0xffffffffULL; - vmcb->control.exit_info_1 |= fault->error_code; + /* + * All nested page faults should be annotated as occurring on the + * final translation *or* the page walk. Arbitrarily choose "final" + * if KVM is buggy and enumerated both or neither. + */ + if (WARN_ON_ONCE(hweight64(vmcb->control.exit_info_1 & + PFERR_GUEST_FAULT_STAGE_MASK) != 1)) { + vmcb->control.exit_info_1 &= ~PFERR_GUEST_FAULT_STAGE_MASK; + vmcb->control.exit_info_1 |= PFERR_GUEST_FINAL_MASK; + } nested_svm_vmexit(svm); } -- 2.53.0.851.ga537e3e6e9-goog Make the OR of EPT_VIOLATION_GVA_IS_VALID and EPT_VIOLATION_GVA_TRANSLATED from the hardware exit qualification conditional on the fault originating from a hardware EPT violation exit. The hardware exit qualification reflects the original VM exit, which may not be an EPT violation at all, e.g. if KVM is emulating an I/O instruction and the memory operand's translation through L1's EPT fails. In that case, bits 7-8 of the exit qualification have completely different semantics (or are simply zero), and OR'ing them into the injected EPT violation corrupts the GVA_IS_VALID/ GVA_TRANSLATED information. 
Use the hardware_nested_page_fault flag introduced in the previous patch to distinguish hardware EPT violation exits from emulation-triggered faults. For hardware exits, take the GVA_IS_VALID/GVA_TRANSLATED bits from the hardware exit qualification. For emulation faults, take them from fault->exit_qualification, which is populated by the nested_mmu walker in paging_tmpl.h. Replace the #if PTTYPE != PTTYPE_EPT preprocessor guards in paging_tmpl.h with a runtime kvm_nested_fault_is_ept() helper that checks guest_mmu to determine whether the nested fault is EPT vs NPT, and sets the appropriate field (exit_qualification for EPT, error_code for NPF) accordingly. Signed-off-by: Kevin Cheng --- arch/x86/kvm/mmu/mmu.c | 10 ++++++++++ arch/x86/kvm/mmu/paging_tmpl.h | 22 +++++++++++++++------- arch/x86/kvm/vmx/nested.c | 9 +++++---- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3dce38ffee76..aabf4ac39c43 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5272,6 +5272,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } +static bool kvm_nested_fault_is_ept(struct kvm_vcpu *vcpu, + struct x86_exception *exception); + #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" @@ -5285,6 +5288,13 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, #include "paging_tmpl.h" #undef PTTYPE +static bool kvm_nested_fault_is_ept(struct kvm_vcpu *vcpu, + struct x86_exception *exception) +{ + WARN_ON_ONCE(!exception->nested_page_fault); + return vcpu->arch.guest_mmu.page_fault == ept_page_fault; +} + static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ea2b7569f8a4..15be93d735ab 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ 
b/arch/x86/kvm/mmu/paging_tmpl.h @@ -386,9 +386,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, nested_access, &walker->fault); if (unlikely(real_gpa == INVALID_GPA)) { -#if PTTYPE != PTTYPE_EPT - walker->fault.error_code |= PFERR_GUEST_PAGE_MASK; -#endif + /* + * Set EPT Violation flags even if the fault is an + * EPT Misconfig, fault.exit_qualification is ignored + * for EPT Misconfigs. + */ + if (kvm_nested_fault_is_ept(vcpu, &walker->fault)) + walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID; + else + walker->fault.error_code |= PFERR_GUEST_PAGE_MASK; return 0; } @@ -447,9 +453,11 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == INVALID_GPA) { -#if PTTYPE != PTTYPE_EPT - walker->fault.error_code |= PFERR_GUEST_FINAL_MASK; -#endif + if (kvm_nested_fault_is_ept(vcpu, &walker->fault)) + walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED; + else + walker->fault.error_code |= PFERR_GUEST_FINAL_MASK; return 0; } @@ -496,7 +504,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, * [2:0] - Derive from the access bits. The exit_qualification might be * out of date if it is serving an EPT misconfiguration. * [5:3] - Calculated by the page walk of the guest EPT page tables - * [7:8] - Derived from [7:8] of real exit_qualification + * [7:8] - Set at the kvm_translate_gpa() call sites above * * The other bits are set to 0. 
*/ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 937aeb474af7..39f8504f5cf2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -443,11 +443,12 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; exit_qualification = 0; } else { - exit_qualification = fault->exit_qualification; - exit_qualification |= vmx_get_exit_qual(vcpu) & - (EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + exit_qualification = fault->exit_qualification; + if (fault->hardware_nested_page_fault) + exit_qualification |= vmx_get_exit_qual(vcpu) & + (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); } /* -- 2.53.0.851.ga537e3e6e9-goog Add a test that exercises nested page fault injection during L2 execution. L2 executes I/O string instructions (OUTSB/INSB) that access memory restricted in L1's nested page tables (NPT/EPT), triggering a nested page fault that L0 must inject to L1. 
The test supports both AMD SVM (NPF) and Intel VMX (EPT violation) and verifies that: - The exit reason is an NPF/EPT violation - The access type and permission bits are correct - The faulting GPA is correct Four test cases are implemented: - Unmap the final data page (final translation fault, OUTSB read) - Unmap a PT page (page walk fault, OUTSB read) - Write-protect the final data page (protection violation, INSB write) - Write-protect a PT page (protection violation on A/D update, OUTSB read) Signed-off-by: Kevin Cheng --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/nested_npf_test.c | 374 ++++++++++++++++++ 2 files changed, 375 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/nested_npf_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 3d372d78a275..9308e6100f27 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -94,6 +94,7 @@ TEST_GEN_PROGS_x86 += x86/nested_dirty_log_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_npf_test TEST_GEN_PROGS_x86 += x86/nested_set_state_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test diff --git a/tools/testing/selftests/kvm/x86/nested_npf_test.c b/tools/testing/selftests/kvm/x86/nested_npf_test.c new file mode 100644 index 000000000000..7725e5dc3a38 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_npf_test.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google, Inc. 
+ */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" + +#define L2_GUEST_STACK_SIZE 64 + +#define EPT_VIOLATION_ACC_READ BIT(0) +#define EPT_VIOLATION_ACC_WRITE BIT(1) +#define EPT_VIOLATION_ACC_INSTR BIT(2) +#define EPT_VIOLATION_PROT_READ BIT(3) +#define EPT_VIOLATION_PROT_WRITE BIT(4) +#define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_GVA_IS_VALID BIT(7) +#define EPT_VIOLATION_GVA_TRANSLATED BIT(8) + +enum test_type { + TEST_FINAL_PAGE_UNMAPPED, /* Final data page not present */ + TEST_PT_PAGE_UNMAPPED, /* Page table page not present */ + TEST_FINAL_PAGE_WRITE_PROTECTED, /* Final data page read-only */ + TEST_PT_PAGE_WRITE_PROTECTED, /* Page table page read-only */ +}; + +static vm_vaddr_t l2_test_page; +static void (*l2_entry)(void); + +#define TEST_IO_PORT 0x80 +#define TEST1_VADDR 0x8000000ULL +#define TEST2_VADDR 0x10000000ULL +#define TEST3_VADDR 0x18000000ULL +#define TEST4_VADDR 0x20000000ULL + +/* + * L2 executes OUTS reading from l2_test_page, triggering a nested page + * fault on the read access. + */ +static void l2_guest_code_outs(void) +{ + asm volatile("outsb" ::"S"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_FAIL("L2 should not reach here"); +} + +/* + * L2 executes INS writing to l2_test_page, triggering a nested page + * fault on the write access. 
+ */ +static void l2_guest_code_ins(void) +{ + asm volatile("insb" ::"D"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_FAIL("L2 should not reach here"); +} + +static void l1_vmx_code(struct vmx_pages *vmx, uint64_t expected_fault_gpa, + uint64_t test_type) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint64_t exit_qual; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_entry, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + + /* Verify we got an EPT violation exit */ + __GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION, + "Expected EPT violation (0x%x), got 0x%lx", + EXIT_REASON_EPT_VIOLATION, + vmreadz(VM_EXIT_REASON)); + + exit_qual = vmreadz(EXIT_QUALIFICATION); + + switch (test_type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* Read access, final translation, page not present */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_READ, + "Expected ACC_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_TRANSLATED, + "Expected GVA_TRANSLATED set, exit_qual 0x%lx", + exit_qual); + break; + case TEST_PT_PAGE_UNMAPPED: + /* Read access, page walk fault, page not present */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_READ, + "Expected ACC_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_GVA_TRANSLATED), + "Expected GVA_TRANSLATED clear, exit_qual 0x%lx", + exit_qual); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* Write access, final translation, page present but read-only */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_WRITE, + "Expected ACC_WRITE set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual 
& EPT_VIOLATION_PROT_READ, + "Expected PROT_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_PROT_WRITE), + "Expected PROT_WRITE clear, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_TRANSLATED, + "Expected GVA_TRANSLATED set, exit_qual 0x%lx", + exit_qual); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* Write access (A/D update), page walk, page present but read-only */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_WRITE, + "Expected ACC_WRITE set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_PROT_READ, + "Expected PROT_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_PROT_WRITE), + "Expected PROT_WRITE clear, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_GVA_TRANSLATED), + "Expected GVA_TRANSLATED clear, exit_qual 0x%lx", + exit_qual); + break; + } + + __GUEST_ASSERT(vmreadz(GUEST_PHYSICAL_ADDRESS) == expected_fault_gpa, + "Expected guest_physical_address = 0x%lx, got 0x%lx", + expected_fault_gpa, + vmreadz(GUEST_PHYSICAL_ADDRESS)); + + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm, uint64_t expected_fault_gpa, + uint64_t test_type) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + uint64_t exit_info_1; + + generic_svm_setup(svm, l2_entry, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + /* Verify we got an NPF exit */ + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_NPF, + "Expected NPF exit (0x%x), got 0x%lx", SVM_EXIT_NPF, + vmcb->control.exit_code); + + exit_info_1 = vmcb->control.exit_info_1; + + switch (test_type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* 
Read access, final translation, page not present */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_FINAL_MASK, + "Expected GUEST_FINAL set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_PAGE_MASK), + "Expected GUEST_PAGE clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_PRESENT_MASK), + "Expected PRESENT clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + case TEST_PT_PAGE_UNMAPPED: + /* Read access, page walk fault, page not present */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_PAGE_MASK, + "Expected GUEST_PAGE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_FINAL_MASK), + "Expected GUEST_FINAL clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_PRESENT_MASK), + "Expected PRESENT clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* Write access, final translation, page present but read-only */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_FINAL_MASK, + "Expected GUEST_FINAL set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_PAGE_MASK), + "Expected GUEST_PAGE clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(exit_info_1 & PFERR_PRESENT_MASK, + "Expected PRESENT set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(exit_info_1 & PFERR_WRITE_MASK, + "Expected WRITE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* Write access (A/D update), page walk, page present but read-only */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_PAGE_MASK, + "Expected GUEST_PAGE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_FINAL_MASK), + "Expected GUEST_FINAL clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + 
__GUEST_ASSERT(exit_info_1 & PFERR_PRESENT_MASK, + "Expected PRESENT set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(exit_info_1 & PFERR_WRITE_MASK, + "Expected WRITE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + } + + __GUEST_ASSERT(vmcb->control.exit_info_2 == expected_fault_gpa, + "Expected exit_info_2 = 0x%lx, got 0x%lx", + expected_fault_gpa, + vmcb->control.exit_info_2); + + GUEST_DONE(); +} + +static void l1_guest_code(void *data, uint64_t expected_fault_gpa, + uint64_t test_type) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data, expected_fault_gpa, test_type); + else + l1_svm_code(data, expected_fault_gpa, test_type); +} + +/* Returns the GPA of the PT page that maps @vaddr. */ +static uint64_t get_pt_gpa_for_vaddr(struct kvm_vm *vm, uint64_t vaddr) +{ + uint64_t *pte; + + pte = vm_get_pte(vm, vaddr); + TEST_ASSERT(pte && (*pte & 0x1), "PTE not present for vaddr 0x%lx", + (unsigned long)vaddr); + + return addr_hva2gpa(vm, (void *)((uint64_t)pte & ~0xFFFULL)); +} + +static void run_test(enum test_type type) +{ + vm_paddr_t expected_fault_gpa; + vm_vaddr_t nested_gva; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_tdp(vm); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); + + switch (type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* + * Unmap the final data page from NPT/EPT. The guest page + * table walk succeeds, but the final GPA->HPA translation + * fails. L2 reads from the page via OUTS. + */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST1_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, l2_test_page); + break; + case TEST_PT_PAGE_UNMAPPED: + /* + * Unmap a page table page from NPT/EPT. The hardware page + * table walk fails when translating the PT page's GPA + * through NPT/EPT. L2 reads from the page via OUTS. 
+ */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST2_VADDR); + expected_fault_gpa = get_pt_gpa_for_vaddr(vm, l2_test_page); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* + * Write-protect the final data page in NPT/EPT. The page + * is present and readable, but not writable. L2 writes to + * the page via INS, triggering a protection violation. + */ + l2_entry = l2_guest_code_ins; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST3_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, l2_test_page); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* + * Write-protect a page table page in NPT/EPT. The page is + * present and readable, but not writable. The guest page + * table walk needs write access to set A/D bits, so it + * triggers a protection violation on the PT page. + * L2 reads from the page via OUTS. + */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST4_VADDR); + expected_fault_gpa = get_pt_gpa_for_vaddr(vm, l2_test_page); + break; + } + + tdp_identity_map_default_memslots(vm); + + if (type == TEST_FINAL_PAGE_WRITE_PROTECTED || + type == TEST_PT_PAGE_WRITE_PROTECTED) + *tdp_get_pte(vm, expected_fault_gpa) &= ~PTE_WRITABLE_MASK(&vm->stage2_mmu); + else + *tdp_get_pte(vm, expected_fault_gpa) &= ~(PTE_PRESENT_MASK(&vm->stage2_mmu) | + PTE_READABLE_MASK(&vm->stage2_mmu) | + PTE_WRITABLE_MASK(&vm->stage2_mmu) | + PTE_EXECUTABLE_MASK(&vm->stage2_mmu)); + + sync_global_to_guest(vm, l2_entry); + sync_global_to_guest(vm, l2_test_page); + vcpu_args_set(vcpu, 3, nested_gva, expected_fault_gpa, (uint64_t)type); + + /* + * For the INS-based write test, KVM emulates the instruction and + * first reads from the I/O port, which exits to userspace. + * Re-enter the guest so emulation can proceed to the memory + * write, where the nested page fault is triggered. 
+ */ + for (;;) { + vcpu_run(vcpu); + + if (vcpu->run->exit_reason == KVM_EXIT_IO && + vcpu->run->io.port == TEST_IO_PORT && + vcpu->run->io.direction == KVM_EXIT_IO_IN) { + continue; + } + break; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected exit reason: %d", vcpu->run->exit_reason); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has_tdp()); + + run_test(TEST_FINAL_PAGE_UNMAPPED); + run_test(TEST_PT_PAGE_UNMAPPED); + run_test(TEST_FINAL_PAGE_WRITE_PROTECTED); + run_test(TEST_PT_PAGE_WRITE_PROTECTED); + + return 0; +} -- 2.53.0.851.ga537e3e6e9-goog