When KVM emulates an instruction for L2 and encounters a nested page fault (e.g., during string I/O emulation), nested_svm_inject_npf_exit() injects an NPF into L1. However, when the original exit was not an NPF, the code hardcodes (1ULL << 32) for the upper bits of exit_info_1. This always sets PFERR_GUEST_FINAL_MASK, even when the fault occurred on a page table page, which prevents L1 from correctly identifying the cause of the fault. Set PFERR_GUEST_PAGE_MASK in the error code when a nested page fault occurs during a guest page table walk, and PFERR_GUEST_FINAL_MASK when the fault occurs on the final GPA-to-HPA translation. Widen error_code in struct x86_exception from u16 to u64 to accommodate the PFERR_GUEST_* bits (bits 32 and 33). Update nested_svm_inject_npf_exit() to use fault->error_code directly instead of hardcoding the upper bits, and set exit_code to SVM_EXIT_NPF unconditionally (no functional change: the assignment previously ran only when exit_code was not already SVM_EXIT_NPF). Also add a WARN_ON_ONCE() that fires if neither PFERR_GUEST_FINAL_MASK nor PFERR_GUEST_PAGE_MASK is set, as this would indicate a bug in the page fault handling code. 
Signed-off-by: Kevin Cheng --- arch/x86/kvm/kvm_emulate.h | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 22 ++++++++++------------ arch/x86/kvm/svm/nested.c | 11 +++++------ 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index fb3dab4b5a53e..ff4f9b0a01ff7 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -22,7 +22,7 @@ enum x86_intercept_stage; struct x86_exception { u8 vector; bool error_code_valid; - u16 error_code; + u64 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 901cd2bd40b84..923179bfd5c74 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -379,18 +379,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), nested_access, &walker->fault); - /* - * FIXME: This can happen if emulation (for of an INS/OUTS - * instruction) triggers a nested page fault. The exit - * qualification / exit info field will incorrectly have - * "guest page access" as the nested page fault's cause, - * instead of "guest page structure access". To fix this, - * the x86_exception struct should be augmented with enough - * information to fix the exit_qualification or exit_info_1 - * fields. 
- */ - if (unlikely(real_gpa == INVALID_GPA)) + if (unlikely(real_gpa == INVALID_GPA)) { +#if PTTYPE != PTTYPE_EPT + walker->fault.error_code |= PFERR_GUEST_PAGE_MASK; +#endif return 0; + } slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); if (!kvm_is_visible_memslot(slot)) @@ -446,8 +440,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, #endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == INVALID_GPA) + if (real_gpa == INVALID_GPA) { +#if PTTYPE != PTTYPE_EPT + walker->fault.error_code |= PFERR_GUEST_FINAL_MASK; +#endif return 0; + } walker->gfn = real_gpa >> PAGE_SHIFT; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index de90b104a0dd5..f8dfd5c333023 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -40,18 +40,17 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct vmcb *vmcb = svm->vmcb; if (vmcb->control.exit_code != SVM_EXIT_NPF) { - /* - * TODO: track the cause of the nested page fault, and - * correctly fill in the high bits of exit_info_1. - */ - vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_info_1 = (1ULL << 32); + vmcb->control.exit_info_1 = fault->error_code; vmcb->control.exit_info_2 = fault->address; } + vmcb->control.exit_code = SVM_EXIT_NPF; vmcb->control.exit_info_1 &= ~0xffffffffULL; vmcb->control.exit_info_1 |= fault->error_code; + WARN_ON_ONCE(!(vmcb->control.exit_info_1 & + (PFERR_GUEST_FINAL_MASK | PFERR_GUEST_PAGE_MASK))); + nested_svm_vmexit(svm); } -- 2.52.0.457.g6b5491de43-goog Add __virt_pg_unmap(), __tdp_unmap(), and tdp_unmap() as counterparts to the existing __virt_pg_map(), __tdp_map(), and tdp_map() functions. These helpers allow tests to selectively unmap pages from the TDP/NPT, enabling testing of NPT faults for unmapped pages. 
Signed-off-by: Kevin Cheng --- .../selftests/kvm/include/x86/processor.h | 6 +++ .../testing/selftests/kvm/lib/x86/processor.c | 53 +++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 6bfffc3b0a332..23ec5030a1d1f 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1487,6 +1487,12 @@ void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); +void __virt_pg_unmap(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + int level); +void __tdp_unmap(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t size, + int level); +void tdp_unmap(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t size); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index ab869a98bbdce..8cb0d74aaa41e 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -338,6 +338,40 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } +void __virt_pg_unmap(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + int level) +{ + uint64_t *pte = &mmu->pgd; + int current_level; + + TEST_ASSERT(level >= PG_LEVEL_4K && level <= mmu->pgtable_levels, + "Invalid level %d", level); + + /* Walk down to target level */ + for (current_level = mmu->pgtable_levels; + current_level > level; + current_level--) { + pte = virt_get_pte(vm, mmu, pte, vaddr, current_level); + + TEST_ASSERT(is_present_pte(mmu, pte), + "Entry not present at level %d for vaddr 0x%lx", + current_level, vaddr); + TEST_ASSERT(!is_huge_pte(mmu, pte), + "Unexpected huge page at level %d for vaddr 0x%lx", + 
current_level, vaddr); + } + + /* Get the PTE at target level */ + pte = virt_get_pte(vm, mmu, pte, vaddr, level); + + TEST_ASSERT(is_present_pte(mmu, pte), + "Entry not present at level %d for vaddr 0x%lx", + level, vaddr); + + /* Clear the PTE */ + *pte = 0; +} + static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, int *level, int current_level) { @@ -541,6 +575,25 @@ void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); } +void __tdp_unmap(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t size, + int level) +{ + size_t page_size = PG_LEVEL_SIZE(level); + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Address overflow"); + + while (npages--) { + __virt_pg_unmap(vm, &vm->stage2_mmu, nested_paddr, level); + nested_paddr += page_size; + } +} + +void tdp_unmap(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t size) +{ + __tdp_unmap(vm, nested_paddr, size, PG_LEVEL_4K); +} + /* * Set Unusable Segment * -- 2.52.0.457.g6b5491de43-goog Add a test that exercises nested NPF injection when the original VM exit was not an NPF. This tests the code path in nested_svm_inject_npf_exit() where exit_code != SVM_EXIT_NPF. L2 executes an OUTS instruction with the source address mapped in L2's page tables but not in L1's NPT. KVM emulates the string I/O, and when it tries to read the source operand, the GPA->HPA translation fails. KVM then injects an NPF to L1 even though the original exit was IOIO. 
The test verifies that: - The exit code is converted to SVM_EXIT_NPF - exit_info_1 has the appropriate PFERR_GUEST_* bit set - exit_info_2 contains the correct faulting GPA Two test cases are implemented: - Test 1: Unmap the final data page from NPT (PFERR_GUEST_FINAL_MASK) - Test 2: Unmap a PT page from NPT (PFERR_GUEST_PAGE_MASK) Signed-off-by: Kevin Cheng --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/svm_nested_npf_test.c | 154 ++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/svm_nested_npf_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index e88699e227ddf..8babe6e228e11 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -112,6 +112,7 @@ TEST_GEN_PROGS_x86 += x86/svm_vmcall_test TEST_GEN_PROGS_x86 += x86/svm_int_ctl_test TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test +TEST_GEN_PROGS_x86 += x86/svm_nested_npf_test TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync TEST_GEN_PROGS_x86 += x86/sync_regs_test TEST_GEN_PROGS_x86 += x86/ucna_injection_test diff --git a/tools/testing/selftests/kvm/x86/svm_nested_npf_test.c b/tools/testing/selftests/kvm/x86/svm_nested_npf_test.c new file mode 100644 index 0000000000000..c0a894acbc483 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_nested_npf_test.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_nested_npf_test + * + * Test nested NPF injection when the original VM exit was not an NPF. + * This exercises nested_svm_inject_npf_exit() with exit_code != SVM_EXIT_NPF. + * + * L2 executes OUTS with the source address mapped in L2's page tables but + * not in L1's NPT. KVM emulates the string I/O instruction, and when it + * tries to read the source operand, the GPA->HPA translation fails. 
KVM + * then injects an NPF to L1 even though the original exit was IOIO. + * + * Test 1: Final data page GPA not in NPT (PFERR_GUEST_FINAL_MASK) + * Test 2: Page table page GPA not in NPT (PFERR_GUEST_PAGE_MASK) + * + * Copyright (C) 2025, Google, Inc. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 64 + +enum test_type { + TEST_FINAL_PAGE_UNMAPPED, /* Final data page GPA not in NPT */ + TEST_PT_PAGE_UNMAPPED, /* Page table page GPA not in NPT */ +}; + +static void *l2_test_page; + +#define TEST_IO_PORT 0x80 +#define TEST1_VADDR 0x8000000ULL +#define TEST2_VADDR 0x10000000ULL + +/* + * L2 executes OUTS with source at l2_test_page, triggering a nested NPF. + * The address is mapped in L2's page tables, but either the data page or + * a PT page is unmapped from L1's NPT, causing the fault. + */ +static void l2_guest_code(void *unused) +{ + asm volatile("outsb" ::"S"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_ASSERT(0); +} + +static void l1_guest_code(struct svm_test_data *svm, void *expected_fault_gpa, + uint64_t exit_info_1_mask) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + /* Verify we got an NPF exit (converted from IOIO by KVM) */ + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_NPF, + "Expected NPF exit (0x%x), got 0x%lx", SVM_EXIT_NPF, + vmcb->control.exit_code); + + /* Check for PFERR_GUEST_FINAL_MASK or PFERR_GUEST_PAGE_MASK */ + __GUEST_ASSERT(vmcb->control.exit_info_1 & exit_info_1_mask, + "Expected exit_info_1 to have 0x%lx set, got 0x%lx", + (unsigned long)exit_info_1_mask, + (unsigned long)vmcb->control.exit_info_1); + + __GUEST_ASSERT(vmcb->control.exit_info_2 == (u64)expected_fault_gpa, + "Expected exit_info_2 = 0x%lx, got 0x%lx", + (unsigned long)expected_fault_gpa, + (unsigned 
long)vmcb->control.exit_info_2); + + GUEST_DONE(); +} + +/* Returns the GPA of the PT page that maps @vaddr. */ +static uint64_t get_pt_gpa_for_vaddr(struct kvm_vm *vm, uint64_t vaddr) +{ + uint64_t *pte; + + pte = vm_get_pte(vm, vaddr); + TEST_ASSERT(pte && (*pte & 0x1), "PTE not present for vaddr 0x%lx", + (unsigned long)vaddr); + + return addr_hva2gpa(vm, (void *)((uint64_t)pte & ~0xFFFULL)); +} + +static void run_test(enum test_type type) +{ + vm_paddr_t expected_fault_gpa; + uint64_t exit_info_1_mask; + vm_vaddr_t svm_gva; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_npt(vm); + vcpu_alloc_svm(vm, &svm_gva); + + if (type == TEST_FINAL_PAGE_UNMAPPED) { + /* + * Test 1: Unmap the final data page from NPT. The page table + * walk succeeds, but the final GPA->HPA translation fails. + */ + l2_test_page = + (void *)vm_vaddr_alloc(vm, vm->page_size, TEST1_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, (vm_vaddr_t)l2_test_page); + exit_info_1_mask = PFERR_GUEST_FINAL_MASK; + } else { + /* + * Test 2: Unmap a PT page from NPT. The hardware page table + * walk fails when translating the PT page's GPA through NPT. 
+ */ + l2_test_page = + (void *)vm_vaddr_alloc(vm, vm->page_size, TEST2_VADDR); + expected_fault_gpa = + get_pt_gpa_for_vaddr(vm, (vm_vaddr_t)l2_test_page); + exit_info_1_mask = PFERR_GUEST_PAGE_MASK; + } + + tdp_identity_map_default_memslots(vm); + tdp_unmap(vm, expected_fault_gpa, vm->page_size); + + sync_global_to_guest(vm, l2_test_page); + vcpu_args_set(vcpu, 3, svm_gva, expected_fault_gpa, exit_info_1_mask); + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected exit reason: %d", vcpu->run->exit_reason); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has_npt()); + + run_test(TEST_FINAL_PAGE_UNMAPPED); + run_test(TEST_PT_PAGE_UNMAPPED); + + return 0; +} -- 2.52.0.457.g6b5491de43-goog