Widen the error_code field in struct x86_exception from u16 to u64 to accommodate AMD's NPF error code, which defines information bits above bit 31, e.g. PFERR_GUEST_FINAL_MASK (bit 32), and PFERR_GUEST_PAGE_MASK (bit 33). Retain the u16 type for the local errcode variable in walk_addr_generic as the walker synthesizes conventional #PF error codes that are architecturally limited to bits 15:0. Signed-off-by: Kevin Cheng --- arch/x86/kvm/kvm_emulate.h | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index fb3dab4b5a53e..ff4f9b0a01ff7 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -22,7 +22,7 @@ enum x86_intercept_stage; struct x86_exception { u8 vector; bool error_code_valid; - u16 error_code; + u64 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 901cd2bd40b84..37eba7dafd14f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -317,6 +317,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; + /* + * Note! Track the error_code that's common to legacy shadow paging + * and NPT shadow paging as a u16 to guard against unintentionally + * setting any of bits 63:16. Architecturally, the #PF error code is + * 32 bits, and Intel CPUs don't support setting bits 31:16. + */ u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; -- 2.53.0.414.gf7e9f6c205-goog When KVM emulates an instruction for L2 and encounters a nested page fault (e.g., during string I/O emulation), nested_svm_inject_npf_exit() injects an NPF to L1. 
However, the code incorrectly hardcodes (1ULL << 32) for exit_info_1's upper bits when the original exit was not an NPF. This always sets PFERR_GUEST_FINAL_MASK even when the fault occurred on a page table page, preventing L1 from correctly identifying the cause of the fault. Set PFERR_GUEST_PAGE_MASK in the error code when a nested page fault occurs during a guest page table walk, and PFERR_GUEST_FINAL_MASK when the fault occurs on the final GPA-to-HPA translation. The error_code field in struct x86_exception was widened from u16 to u64 in the previous patch to accommodate the PFERR_GUEST_* bits (bits 32 and 33). Update nested_svm_inject_npf_exit() to use fault->error_code directly instead of hardcoding the upper bits. Also add a WARN_ON_ONCE if the error code sets neither or both of PFERR_GUEST_FINAL_MASK and PFERR_GUEST_PAGE_MASK, as either case would indicate a bug in the page fault handling code; arbitrarily fall back to PFERR_GUEST_FINAL_MASK when the WARN fires. Signed-off-by: Kevin Cheng --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu/paging_tmpl.h | 22 ++++++++++------------ arch/x86/kvm/svm/nested.c | 19 +++++++++++++------ 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff07c45e3c731..454f84660edfc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -280,6 +280,8 @@ enum x86_intercept_stage; #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) #define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_FAULT_STAGE_MASK \ (PFERR_GUEST_FINAL_MASK | PFERR_GUEST_PAGE_MASK) #define PFERR_GUEST_ENC_MASK BIT_ULL(34) #define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) #define PFERR_GUEST_VMPL_MASK BIT_ULL(36) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 37eba7dafd14f..f148c92b606ba 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -385,18 +385,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, real_gpa = kvm_translate_gpa(vcpu, mmu, 
gfn_to_gpa(table_gfn), nested_access, &walker->fault); - /* - * FIXME: This can happen if emulation (for of an INS/OUTS - * instruction) triggers a nested page fault. The exit - * qualification / exit info field will incorrectly have - * "guest page access" as the nested page fault's cause, - * instead of "guest page structure access". To fix this, - * the x86_exception struct should be augmented with enough - * information to fix the exit_qualification or exit_info_1 - * fields. - */ - if (unlikely(real_gpa == INVALID_GPA)) + if (unlikely(real_gpa == INVALID_GPA)) { +#if PTTYPE != PTTYPE_EPT + walker->fault.error_code |= PFERR_GUEST_PAGE_MASK; +#endif return 0; + } slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); if (!kvm_is_visible_memslot(slot)) @@ -452,8 +446,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, #endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == INVALID_GPA) + if (real_gpa == INVALID_GPA) { +#if PTTYPE != PTTYPE_EPT + walker->fault.error_code |= PFERR_GUEST_FINAL_MASK; +#endif return 0; + } walker->gfn = real_gpa >> PAGE_SHIFT; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index de90b104a0dd5..1013e814168b5 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -40,18 +40,25 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct vmcb *vmcb = svm->vmcb; if (vmcb->control.exit_code != SVM_EXIT_NPF) { - /* - * TODO: track the cause of the nested page fault, and - * correctly fill in the high bits of exit_info_1. 
- */ - vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_info_1 = (1ULL << 32); + vmcb->control.exit_info_1 = fault->error_code; vmcb->control.exit_info_2 = fault->address; } + vmcb->control.exit_code = SVM_EXIT_NPF; vmcb->control.exit_info_1 &= ~0xffffffffULL; vmcb->control.exit_info_1 |= fault->error_code; + /* + * All nested page faults should be annotated as occurring on the + * final translation *or* the page walk. Arbitrarily choose "final" + * if KVM is buggy and enumerated both or neither. + */ + if (WARN_ON_ONCE(hweight64(vmcb->control.exit_info_1 & + PFERR_GUEST_FAULT_STAGE_MASK) != 1)) { + vmcb->control.exit_info_1 &= ~PFERR_GUEST_FAULT_STAGE_MASK; + vmcb->control.exit_info_1 |= PFERR_GUEST_FINAL_MASK; + } + nested_svm_vmexit(svm); } -- 2.53.0.414.gf7e9f6c205-goog Remove the OR of EPT_VIOLATION_GVA_IS_VALID and EPT_VIOLATION_GVA_TRANSLATED from the hardware exit qualification when injecting a synthesized EPT violation to L1. The hardware exit qualification reflects the original VM exit, which may not be an EPT violation at all, e.g. if KVM is emulating an I/O instruction and the memory operand's translation through L1's EPT fails. In that case, bits 7-8 of the exit qualification have completely different semantics (or are simply zero), and OR'ing them into the injected EPT violation corrupts the GVA_IS_VALID/GVA_TRANSLATED information. Even when the original exit is an EPT violation, the hardware bits may not match the current fault. For example, if an EPT violation happened while walking L2's page tables, it's possible that the EPT violation injected by KVM into L1 is for the final address translation, if L1 already had the mappings for L2's page tables in its EPTs but KVM did not have shadow EPTs for them. Populate EPT_VIOLATION_GVA_IS_VALID and EPT_VIOLATION_GVA_TRANSLATED directly in the page table walker at the kvm_translate_gpa() failure sites, mirroring the existing PFERR_GUEST_PAGE_MASK and PFERR_GUEST_FINAL_MASK population for NPT. 
Signed-off-by: Kevin Cheng --- arch/x86/kvm/mmu/paging_tmpl.h | 16 +++++++++++++++- arch/x86/kvm/vmx/nested.c | 3 --- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f148c92b606ba..a084b5e50effc 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -386,8 +386,19 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, nested_access, &walker->fault); if (unlikely(real_gpa == INVALID_GPA)) { + /* + * Unconditionally set the NPF error_code bits and + * EPT exit_qualification bits for nested page + * faults. The walker doesn't know whether L1 uses + * NPT or EPT, and each injection handler consumes + * only the field it cares about (error_code for + * NPF, exit_qualification for EPT violations), so + * setting both is harmless. + */ #if PTTYPE != PTTYPE_EPT walker->fault.error_code |= PFERR_GUEST_PAGE_MASK; + walker->fault.exit_qualification |= + EPT_VIOLATION_GVA_IS_VALID; #endif return 0; } @@ -449,6 +460,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, if (real_gpa == INVALID_GPA) { #if PTTYPE != PTTYPE_EPT walker->fault.error_code |= PFERR_GUEST_FINAL_MASK; + walker->fault.exit_qualification |= + EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED; #endif return 0; } @@ -496,7 +510,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, * [2:0] - Derive from the access bits. The exit_qualification might be * out of date if it is serving an EPT misconfiguration. * [5:3] - Calculated by the page walk of the guest EPT page tables - * [7:8] - Derived from [7:8] of real exit_qualification + * [7:8] - Set at the kvm_translate_gpa() call sites above * * The other bits are set to 0. 
*/ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 248635da67661..6a167b1d51595 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -444,9 +444,6 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, exit_qualification = 0; } else { exit_qualification = fault->exit_qualification; - exit_qualification |= vmx_get_exit_qual(vcpu) & - (EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; } -- 2.53.0.414.gf7e9f6c205-goog Add a test that exercises nested page fault injection during L2 execution. L2 executes I/O string instructions (OUTSB/INSB) that access memory restricted in L1's nested page tables (NPT/EPT), triggering a nested page fault that L0 must inject to L1. The test supports both AMD SVM (NPF) and Intel VMX (EPT violation) and verifies that: - The exit reason is an NPF/EPT violation - The access type and permission bits are correct - The faulting GPA is correct Three test cases are implemented: - Unmap the final data page (final translation fault, OUTSB read) - Unmap a PT page (page walk fault, OUTSB read) - Write-protect the final data page (protection violation, INSB write) - Write-protect a PT page (protection violation on A/D update, OUTSB read) Signed-off-by: Kevin Cheng --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/nested_npf_test.c | 374 ++++++++++++++++++ 2 files changed, 375 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/nested_npf_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index fdec90e854671..55703d6be5e7a 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -93,6 +93,7 @@ TEST_GEN_PROGS_x86 += x86/nested_dirty_log_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 
+= x86/nested_npf_test TEST_GEN_PROGS_x86 += x86/nested_set_state_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test diff --git a/tools/testing/selftests/kvm/x86/nested_npf_test.c b/tools/testing/selftests/kvm/x86/nested_npf_test.c new file mode 100644 index 0000000000000..7725e5dc3a386 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_npf_test.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google, Inc. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" + +#define L2_GUEST_STACK_SIZE 64 + +#define EPT_VIOLATION_ACC_READ BIT(0) +#define EPT_VIOLATION_ACC_WRITE BIT(1) +#define EPT_VIOLATION_ACC_INSTR BIT(2) +#define EPT_VIOLATION_PROT_READ BIT(3) +#define EPT_VIOLATION_PROT_WRITE BIT(4) +#define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_GVA_IS_VALID BIT(7) +#define EPT_VIOLATION_GVA_TRANSLATED BIT(8) + +enum test_type { + TEST_FINAL_PAGE_UNMAPPED, /* Final data page not present */ + TEST_PT_PAGE_UNMAPPED, /* Page table page not present */ + TEST_FINAL_PAGE_WRITE_PROTECTED, /* Final data page read-only */ + TEST_PT_PAGE_WRITE_PROTECTED, /* Page table page read-only */ +}; + +static vm_vaddr_t l2_test_page; +static void (*l2_entry)(void); + +#define TEST_IO_PORT 0x80 +#define TEST1_VADDR 0x8000000ULL +#define TEST2_VADDR 0x10000000ULL +#define TEST3_VADDR 0x18000000ULL +#define TEST4_VADDR 0x20000000ULL + +/* + * L2 executes OUTS reading from l2_test_page, triggering a nested page + * fault on the read access. + */ +static void l2_guest_code_outs(void) +{ + asm volatile("outsb" ::"S"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_FAIL("L2 should not reach here"); +} + +/* + * L2 executes INS writing to l2_test_page, triggering a nested page + * fault on the write access. 
+ */ +static void l2_guest_code_ins(void) +{ + asm volatile("insb" ::"D"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_FAIL("L2 should not reach here"); +} + +static void l1_vmx_code(struct vmx_pages *vmx, uint64_t expected_fault_gpa, + uint64_t test_type) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint64_t exit_qual; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_entry, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + + /* Verify we got an EPT violation exit */ + __GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION, + "Expected EPT violation (0x%x), got 0x%lx", + EXIT_REASON_EPT_VIOLATION, + vmreadz(VM_EXIT_REASON)); + + exit_qual = vmreadz(EXIT_QUALIFICATION); + + switch (test_type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* Read access, final translation, page not present */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_READ, + "Expected ACC_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_TRANSLATED, + "Expected GVA_TRANSLATED set, exit_qual 0x%lx", + exit_qual); + break; + case TEST_PT_PAGE_UNMAPPED: + /* Read access, page walk fault, page not present */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_READ, + "Expected ACC_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_GVA_TRANSLATED), + "Expected GVA_TRANSLATED clear, exit_qual 0x%lx", + exit_qual); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* Write access, final translation, page present but read-only */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_WRITE, + "Expected ACC_WRITE set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual 
& EPT_VIOLATION_PROT_READ, + "Expected PROT_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_PROT_WRITE), + "Expected PROT_WRITE clear, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_TRANSLATED, + "Expected GVA_TRANSLATED set, exit_qual 0x%lx", + exit_qual); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* Write access (A/D update), page walk, page present but read-only */ + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_ACC_WRITE, + "Expected ACC_WRITE set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_PROT_READ, + "Expected PROT_READ set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_PROT_WRITE), + "Expected PROT_WRITE clear, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(exit_qual & EPT_VIOLATION_GVA_IS_VALID, + "Expected GVA_IS_VALID set, exit_qual 0x%lx", + exit_qual); + __GUEST_ASSERT(!(exit_qual & EPT_VIOLATION_GVA_TRANSLATED), + "Expected GVA_TRANSLATED clear, exit_qual 0x%lx", + exit_qual); + break; + } + + __GUEST_ASSERT(vmreadz(GUEST_PHYSICAL_ADDRESS) == expected_fault_gpa, + "Expected guest_physical_address = 0x%lx, got 0x%lx", + expected_fault_gpa, + vmreadz(GUEST_PHYSICAL_ADDRESS)); + + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm, uint64_t expected_fault_gpa, + uint64_t test_type) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + uint64_t exit_info_1; + + generic_svm_setup(svm, l2_entry, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + /* Verify we got an NPF exit */ + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_NPF, + "Expected NPF exit (0x%x), got 0x%lx", SVM_EXIT_NPF, + vmcb->control.exit_code); + + exit_info_1 = vmcb->control.exit_info_1; + + switch (test_type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* 
Read access, final translation, page not present */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_FINAL_MASK, + "Expected GUEST_FINAL set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_PAGE_MASK), + "Expected GUEST_PAGE clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_PRESENT_MASK), + "Expected PRESENT clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + case TEST_PT_PAGE_UNMAPPED: + /* Read access, page walk fault, page not present */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_PAGE_MASK, + "Expected GUEST_PAGE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_FINAL_MASK), + "Expected GUEST_FINAL clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_PRESENT_MASK), + "Expected PRESENT clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* Write access, final translation, page present but read-only */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_FINAL_MASK, + "Expected GUEST_FINAL set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_PAGE_MASK), + "Expected GUEST_PAGE clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(exit_info_1 & PFERR_PRESENT_MASK, + "Expected PRESENT set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(exit_info_1 & PFERR_WRITE_MASK, + "Expected WRITE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* Write access (A/D update), page walk, page present but read-only */ + __GUEST_ASSERT(exit_info_1 & PFERR_GUEST_PAGE_MASK, + "Expected GUEST_PAGE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(!(exit_info_1 & PFERR_GUEST_FINAL_MASK), + "Expected GUEST_FINAL clear, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + 
__GUEST_ASSERT(exit_info_1 & PFERR_PRESENT_MASK, + "Expected PRESENT set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + __GUEST_ASSERT(exit_info_1 & PFERR_WRITE_MASK, + "Expected WRITE set, exit_info_1 0x%lx", + (unsigned long)exit_info_1); + break; + } + + __GUEST_ASSERT(vmcb->control.exit_info_2 == expected_fault_gpa, + "Expected exit_info_2 = 0x%lx, got 0x%lx", + expected_fault_gpa, + vmcb->control.exit_info_2); + + GUEST_DONE(); +} + +static void l1_guest_code(void *data, uint64_t expected_fault_gpa, + uint64_t test_type) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data, expected_fault_gpa, test_type); + else + l1_svm_code(data, expected_fault_gpa, test_type); +} + +/* Returns the GPA of the PT page that maps @vaddr. */ +static uint64_t get_pt_gpa_for_vaddr(struct kvm_vm *vm, uint64_t vaddr) +{ + uint64_t *pte; + + pte = vm_get_pte(vm, vaddr); + TEST_ASSERT(pte && (*pte & 0x1), "PTE not present for vaddr 0x%lx", + (unsigned long)vaddr); + + return addr_hva2gpa(vm, (void *)((uint64_t)pte & ~0xFFFULL)); +} + +static void run_test(enum test_type type) +{ + vm_paddr_t expected_fault_gpa; + vm_vaddr_t nested_gva; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_tdp(vm); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); + + switch (type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* + * Unmap the final data page from NPT/EPT. The guest page + * table walk succeeds, but the final GPA->HPA translation + * fails. L2 reads from the page via OUTS. + */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST1_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, l2_test_page); + break; + case TEST_PT_PAGE_UNMAPPED: + /* + * Unmap a page table page from NPT/EPT. The hardware page + * table walk fails when translating the PT page's GPA + * through NPT/EPT. L2 reads from the page via OUTS. 
+ */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST2_VADDR); + expected_fault_gpa = get_pt_gpa_for_vaddr(vm, l2_test_page); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* + * Write-protect the final data page in NPT/EPT. The page + * is present and readable, but not writable. L2 writes to + * the page via INS, triggering a protection violation. + */ + l2_entry = l2_guest_code_ins; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST3_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, l2_test_page); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* + * Write-protect a page table page in NPT/EPT. The page is + * present and readable, but not writable. The guest page + * table walk needs write access to set A/D bits, so it + * triggers a protection violation on the PT page. + * L2 reads from the page via OUTS. + */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_vaddr_alloc(vm, vm->page_size, TEST4_VADDR); + expected_fault_gpa = get_pt_gpa_for_vaddr(vm, l2_test_page); + break; + } + + tdp_identity_map_default_memslots(vm); + + if (type == TEST_FINAL_PAGE_WRITE_PROTECTED || + type == TEST_PT_PAGE_WRITE_PROTECTED) + *tdp_get_pte(vm, expected_fault_gpa) &= ~PTE_WRITABLE_MASK(&vm->stage2_mmu); + else + *tdp_get_pte(vm, expected_fault_gpa) &= ~(PTE_PRESENT_MASK(&vm->stage2_mmu) | + PTE_READABLE_MASK(&vm->stage2_mmu) | + PTE_WRITABLE_MASK(&vm->stage2_mmu) | + PTE_EXECUTABLE_MASK(&vm->stage2_mmu)); + + sync_global_to_guest(vm, l2_entry); + sync_global_to_guest(vm, l2_test_page); + vcpu_args_set(vcpu, 3, nested_gva, expected_fault_gpa, (uint64_t)type); + + /* + * For the INS-based write test, KVM emulates the instruction and + * first reads from the I/O port, which exits to userspace. + * Re-enter the guest so emulation can proceed to the memory + * write, where the nested page fault is triggered. 
+ */ + for (;;) { + vcpu_run(vcpu); + + if (vcpu->run->exit_reason == KVM_EXIT_IO && + vcpu->run->io.port == TEST_IO_PORT && + vcpu->run->io.direction == KVM_EXIT_IO_IN) { + continue; + } + break; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected exit reason: %d", vcpu->run->exit_reason); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has_tdp()); + + run_test(TEST_FINAL_PAGE_UNMAPPED); + run_test(TEST_PT_PAGE_UNMAPPED); + run_test(TEST_FINAL_PAGE_WRITE_PROTECTED); + run_test(TEST_PT_PAGE_WRITE_PROTECTED); + + return 0; +} -- 2.53.0.414.gf7e9f6c205-goog