From: David Woodhouse ICEBP (INT1, opcode 0xF1) generates a #DB that is architecturally a trap, but on SVM it was not always intercepted. Unconditionally intercept ICEBP on SVM to match VMX behaviour and ensure correct event delivery semantics. Add two selftests exercising ICEBP: - int1_ept_test: verifies that ICEBP works correctly when the exception stack page is not present (EPT/NPT fault during #DB delivery). The IST stack is evicted via MADV_DONTNEED before executing INT1. - int1_task_gate_test: verifies ICEBP delivery through a 32-bit task gate, exercising the legacy task-switch path for #DB. Tested on Intel Sapphire Rapids and AMD Genoa. Without the SVM fix, int1_task_gate_test fails on AMD with EIP pointing at ICEBP instead of after it. With the fix, both tests pass on both platforms. Signed-off-by: David Woodhouse --- arch/x86/kvm/svm/svm.c | 21 ++ tools/testing/selftests/kvm/Makefile.kvm | 2 + .../testing/selftests/kvm/x86/int1_ept_test.c | 116 +++++++ .../selftests/kvm/x86/int1_task_gate_test.c | 298 ++++++++++++++++++ 4 files changed, 437 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/int1_ept_test.c create mode 100644 tools/testing/selftests/kvm/x86/int1_task_gate_test.c diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e7fdd7a9c280..a820d47a4bca 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1159,6 +1159,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) svm_set_intercept(svm, INTERCEPT_SKINIT); svm_set_intercept(svm, INTERCEPT_WBINVD); svm_set_intercept(svm, INTERCEPT_XSETBV); + svm_set_intercept(svm, INTERCEPT_ICEBP); svm_set_intercept(svm, INTERCEPT_RDPRU); svm_set_intercept(svm, INTERCEPT_RSM); @@ -2045,6 +2046,25 @@ static int bp_interception(struct kvm_vcpu *vcpu) return 0; } +static int icebp_interception(struct kvm_vcpu *vcpu) +{ + /* + * ICEBP (INT1, opcode 0xF1) generates #DB as a trap. Intercept it + * unconditionally so that RIP is advanced past the instruction + * *before* #DB is injected. This is necessary because SVM reports + * the wrong RIP for ICEBP-induced #DB when delivered via a task + * gate: RIP points at the ICEBP instruction instead of after it. + * + * By intercepting ICEBP, NRIP (next_rip) gives us the correct + * post-instruction RIP. Advancing RIP first and then injecting #DB + * ensures consistent behaviour regardless of how the guest's IDT + * delivers the exception. + */ + svm_skip_emulated_instruction(vcpu); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; +} + static int ud_interception(struct kvm_vcpu *vcpu) { return handle_ud(vcpu); @@ -3333,6 +3353,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_MONITOR] = kvm_emulate_monitor, [SVM_EXIT_MWAIT] = kvm_emulate_mwait, [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, + [SVM_EXIT_ICEBP] = icebp_interception, [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 9118a5a51b89..1b89bf0e5cf1 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -86,6 +86,8 @@ TEST_GEN_PROGS_x86 += x86/hyperv_features TEST_GEN_PROGS_x86 += x86/hyperv_ipi TEST_GEN_PROGS_x86 += x86/hyperv_svm_test TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush +TEST_GEN_PROGS_x86 += x86/int1_ept_test +TEST_GEN_PROGS_x86 += x86/int1_task_gate_test TEST_GEN_PROGS_x86 += x86/kvm_clock_test TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/kvm_buslock_test diff --git a/tools/testing/selftests/kvm/x86/int1_ept_test.c b/tools/testing/selftests/kvm/x86/int1_ept_test.c new file mode 100644 index 000000000000..1ced77ffeeb9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/int1_ept_test.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test that KVM correctly handles an EPT/NPT fault during delivery of + * a #DB exception caused by INT1 (ICEBP, opcode 0xF1). + * + * The guest executes INT1, which generates #DB. The IDT entry for #DB + * uses IST1, pointing to a stack page whose backing has been evicted + * via MADV_DONTNEED. The CPU takes an EPT/NPT violation when pushing + * the exception frame. KVM must handle the fault and reinject #DB. + */ +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define IST_STACK_GPA 0xd0000000ull +#define IST_STACK_SIZE PAGE_SIZE +/* IST1 offset in the 64-bit TSS */ +#define TSS64_IST1_OFFSET 0x24 + +static volatile bool db_handler_called; +static volatile u64 db_handler_rip; + +static void guest_db_handler(struct ex_regs *regs) +{ + db_handler_called = true; + db_handler_rip = regs->rip; +} + +static void guest_code(void) +{ + unsigned long expected_rip; + + db_handler_called = false; + + /* + * Execute INT1 (ICEBP). This generates #DB as a trap, so RIP + * in the exception frame points after the 0xF1 byte. + */ + asm volatile("lea 1f(%%rip), %0\n\t" + ".byte 0xf1\n\t" + "1:" + : "=r"(expected_rip) :: "memory"); + + GUEST_ASSERT(db_handler_called); + GUEST_ASSERT_EQ(db_handler_rip, expected_rip); + GUEST_SYNC(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + void *ist_hva; + unsigned int nr_pages; + struct idt_entry *idt; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Install #DB handler using the normal exception handler table */ + vm_install_exception_handler(vm, DB_VECTOR, guest_db_handler); + + /* + * Allocate a separate stack page for IST1. Map it into the guest + * so the IDT entry can reference it. + */ + nr_pages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, IST_STACK_SIZE); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + IST_STACK_GPA, 1, nr_pages, 0); + virt_map(vm, IST_STACK_GPA, IST_STACK_GPA, nr_pages); + ist_hva = addr_gpa2hva(vm, IST_STACK_GPA); + + /* + * Write IST1 in the TSS to point to the top of the IST stack. + * TSS64 layout: offset 0x24 = IST1 (8 bytes). + */ + { + void *tss_hva = addr_gva2hva(vm, vm->arch.tss); + u64 ist1_val = IST_STACK_GPA + IST_STACK_SIZE; + + memcpy(tss_hva + TSS64_IST1_OFFSET, &ist1_val, sizeof(ist1_val)); + } + + /* + * Set the #DB IDT entry to use IST1. The handler address stays + * the same (the framework's idt_handlers stub for vector 1). + */ + idt = (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); + idt[DB_VECTOR].ist = 1; + + /* + * Evict the IST stack page from the EPT/NPT so that when the CPU + * tries to push the #DB exception frame, it takes an EPT violation. + */ + madvise(ist_hva, IST_STACK_SIZE, MADV_DONTNEED); + + /* Run the guest — it should execute INT1, fault, get reinjected, succeed */ + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_SYNC: + pr_info("INT1 with EPT fault on IST stack: PASSED\n"); + break; + default: + TEST_FAIL("Unexpected ucall"); + } + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/int1_task_gate_test.c b/tools/testing/selftests/kvm/x86/int1_task_gate_test.c new file mode 100644 index 000000000000..4485445ee39d --- /dev/null +++ b/tools/testing/selftests/kvm/x86/int1_task_gate_test.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test ICEBP (INT1) with a 32-bit task gate for #DB delivery. + * + * On SVM, when ICEBP causes #DB and #DB is delivered via a task gate, + * the saved RIP points at the ICEBP instruction rather than after it. + * This is a hardware quirk that the hypervisor must work around by + * intercepting ICEBP and advancing RIP before the task switch. + * + * This test sets up a 32-bit protected mode guest with: + * - A #DB IDT entry configured as a task gate (type 5) + * - A TSS for the #DB handler task + * - Guest code that executes ICEBP (0xF1) + * + * The #DB handler task records the EIP from the back-link TSS and + * writes it to a known memory location. The test verifies that EIP + * points after the ICEBP instruction, not at it. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define KVM_DEV "/dev/kvm" +#define PAGE_SIZE 4096 + +/* Guest physical memory layout (all below 1MB for simplicity) */ +#define GDT_BASE 0x1000 +#define IDT_BASE 0x2000 +#define MAIN_TSS_BASE 0x3000 +#define DB_TSS_BASE 0x4000 +#define CODE_BASE 0x5000 +#define RESULT_BASE 0x6000 +#define MAIN_STACK 0x8000 /* top of main task stack */ +#define DB_STACK 0x9000 /* top of #DB task stack */ + +/* GDT selectors */ +#define SEL_CODE32 0x08 +#define SEL_DATA32 0x10 +#define SEL_MAIN_TSS 0x18 +#define SEL_DB_TSS 0x20 + +/* 32-bit GDT entry */ +struct gdt_entry { + u16 limit_lo; + u16 base_lo; + u8 base_mid; + u8 access; + u8 limit_hi_flags; + u8 base_hi; +} __attribute__((packed)); + +/* 32-bit IDT task gate entry */ +struct idt_task_gate { + u16 reserved0; + u16 tss_selector; + u8 reserved1; + u8 type_attr; /* P=1, DPL=0, type=0x5 (task gate) */ + u16 reserved2; +} __attribute__((packed)); + +/* 32-bit TSS */ +struct tss32 { + u16 back_link, __blh; + u32 esp0; + u16 ss0, __ss0h; + u32 esp1; + u16 ss1, __ss1h; + u32 esp2; + u16 ss2, __ss2h; + u32 cr3; + u32 eip; + u32 eflags; + u32 eax, ecx, edx, ebx; + u32 esp, ebp, esi, edi; + u16 es, __esh; + u16 cs, __csh; + u16 ss, __ssh; + u16 ds, __dsh; + u16 fs, __fsh; + u16 gs, __gsh; + u16 ldt, __ldth; + u16 trace; + u16 iomap_base; +} __attribute__((packed)); + +static void set_gdt_entry(struct gdt_entry *e, u32 base, u32 limit, + u8 access, u8 flags) +{ + e->limit_lo = limit & 0xffff; + e->base_lo = base & 0xffff; + e->base_mid = (base >> 16) & 0xff; + e->access = access; + e->limit_hi_flags = ((limit >> 16) & 0xf) | (flags << 4); + e->base_hi = (base >> 24) & 0xff; +} + +/* + * Guest code (32-bit protected mode, assembled as bytes). + * + * The main task executes ICEBP. The #DB task gate switches to the + * DB handler TSS. The DB handler reads the back-link TSS to get the + * saved EIP, writes it to RESULT_BASE, then halts. + */ + +/* Main task code: just execute ICEBP then halt */ +static const u8 main_code[] = { + 0xf1, /* ICEBP (INT1) */ + 0xf4, /* HLT - should not reach here if task gate works */ +}; + +/* + * #DB handler task code: + * - Read back_link from our TSS (at DB_TSS_BASE offset 0) to get main TSS selector + * - The main TSS (at MAIN_TSS_BASE) has saved EIP at offset 0x20 + * - Read that EIP and store at RESULT_BASE + * - Store 0xCAFE at RESULT_BASE+4 as "handler ran" marker + * - HLT + */ +static const u8 db_handler_code[] = { + /* mov eax, [MAIN_TSS_BASE + 0x20] -- saved EIP in main TSS */ + 0xa1, (MAIN_TSS_BASE + 0x20) & 0xff, ((MAIN_TSS_BASE + 0x20) >> 8) & 0xff, 0x00, 0x00, + /* mov [RESULT_BASE], eax */ + 0xa3, RESULT_BASE & 0xff, (RESULT_BASE >> 8) & 0xff, 0x00, 0x00, + /* mov dword [RESULT_BASE+4], 0xCAFE */ + 0xc7, 0x05, (RESULT_BASE + 4) & 0xff, ((RESULT_BASE + 4) >> 8) & 0xff, 0x00, 0x00, + 0xfe, 0xca, 0x00, 0x00, + /* hlt */ + 0xf4, +}; + +#define DB_HANDLER_OFFSET 0x100 /* offset within CODE_BASE page */ + +static int kvm_ioctl(int fd, unsigned long cmd, void *arg) +{ + int ret = ioctl(fd, cmd, arg); + if (ret < 0) { + perror("KVM ioctl"); + exit(1); + } + return ret; +} + +int main(void) +{ + int kvm_fd, vm_fd, vcpu_fd; + struct kvm_userspace_memory_region region; + struct kvm_sregs sregs; + struct kvm_regs regs; + struct kvm_run *run; + void *mem; + size_t mmap_size; + struct gdt_entry *gdt; + struct idt_task_gate *idt; + struct tss32 *main_tss, *db_tss; + u32 *result; + u32 expected_eip; + + kvm_fd = open(KVM_DEV, O_RDWR); + if (kvm_fd < 0) { perror("open /dev/kvm"); return 1; } + + vm_fd = kvm_ioctl(kvm_fd, KVM_CREATE_VM, 0); + vcpu_fd = kvm_ioctl(vm_fd, KVM_CREATE_VCPU, 0); + + /* Allocate guest memory — single 1MB region */ + mem = mmap(NULL, 0x100000, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { perror("mmap"); return 1; } + memset(mem, 0, 0x100000); + + region = (struct kvm_userspace_memory_region){ + .slot = 0, + .guest_phys_addr = 0, + .memory_size = 0x100000, + .userspace_addr = (u64)mem, + }; + kvm_ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, ®ion); + + /* Set up GDT */ + gdt = (struct gdt_entry *)(mem + GDT_BASE); + /* Null descriptor */ + memset(&gdt[0], 0, sizeof(gdt[0])); + /* Code32: base=0, limit=0xfffff, 32-bit, present, code r/x */ + set_gdt_entry(&gdt[1], 0, 0xfffff, 0x9a, 0xc); + /* Data32: base=0, limit=0xfffff, 32-bit, present, data r/w */ + set_gdt_entry(&gdt[2], 0, 0xfffff, 0x92, 0xc); + /* Main TSS: base=MAIN_TSS_BASE, limit=sizeof(tss32)-1, present, TSS32 available */ + set_gdt_entry(&gdt[3], MAIN_TSS_BASE, sizeof(struct tss32) - 1, 0x89, 0x0); + /* DB TSS: base=DB_TSS_BASE, limit=sizeof(tss32)-1, present, TSS32 available */ + set_gdt_entry(&gdt[4], DB_TSS_BASE, sizeof(struct tss32) - 1, 0x89, 0x0); + + /* Set up IDT — task gate for #DB (vector 1) */ + idt = (struct idt_task_gate *)(mem + IDT_BASE); + idt[1].tss_selector = SEL_DB_TSS; + idt[1].type_attr = 0x85; /* P=1, DPL=0, type=5 (32-bit task gate) */ + + /* Set up main TSS */ + main_tss = (struct tss32 *)(mem + MAIN_TSS_BASE); + main_tss->cr3 = 0; /* no paging */ + main_tss->eip = CODE_BASE; + main_tss->eflags = 0x02; + main_tss->esp = MAIN_STACK; + main_tss->cs = SEL_CODE32; + main_tss->ds = SEL_DATA32; + main_tss->es = SEL_DATA32; + main_tss->ss = SEL_DATA32; + main_tss->fs = SEL_DATA32; + main_tss->gs = SEL_DATA32; + + /* Set up #DB handler TSS */ + db_tss = (struct tss32 *)(mem + DB_TSS_BASE); + db_tss->cr3 = 0; + db_tss->eip = CODE_BASE + DB_HANDLER_OFFSET; + db_tss->eflags = 0x02; + db_tss->esp = DB_STACK; + db_tss->cs = SEL_CODE32; + db_tss->ds = SEL_DATA32; + db_tss->es = SEL_DATA32; + db_tss->ss = SEL_DATA32; + db_tss->fs = SEL_DATA32; + db_tss->gs = SEL_DATA32; + + /* Copy guest code */ + memcpy(mem + CODE_BASE, main_code, sizeof(main_code)); + memcpy(mem + CODE_BASE + DB_HANDLER_OFFSET, db_handler_code, sizeof(db_handler_code)); + + /* Expected EIP: after the 0xF1 byte */ + expected_eip = CODE_BASE + 1; + + /* Set up vCPU registers for 32-bit protected mode (no paging) */ + kvm_ioctl(vcpu_fd, KVM_GET_SREGS, &sregs); + sregs.cr0 = 0x11; /* PE + ET, no PG */ + sregs.cr3 = 0; + sregs.cr4 = 0; + + sregs.gdt.base = GDT_BASE; + sregs.gdt.limit = 5 * 8 - 1; + sregs.idt.base = IDT_BASE; + sregs.idt.limit = 256 * 8 - 1; + + sregs.cs.base = 0; sregs.cs.limit = 0xfffff; sregs.cs.selector = SEL_CODE32; + sregs.cs.type = 0xb; sregs.cs.present = 1; sregs.cs.db = 1; sregs.cs.s = 1; sregs.cs.g = 1; + sregs.ds.base = 0; sregs.ds.limit = 0xfffff; sregs.ds.selector = SEL_DATA32; + sregs.ds.type = 3; sregs.ds.present = 1; sregs.ds.db = 1; sregs.ds.s = 1; sregs.ds.g = 1; + sregs.es = sregs.ss = sregs.fs = sregs.gs = sregs.ds; + + sregs.tr.base = MAIN_TSS_BASE; sregs.tr.limit = sizeof(struct tss32) - 1; + sregs.tr.selector = SEL_MAIN_TSS; sregs.tr.type = 0xb; /* 32-bit TSS busy */ + sregs.tr.present = 1; + + kvm_ioctl(vcpu_fd, KVM_SET_SREGS, &sregs); + + memset(®s, 0, sizeof(regs)); + regs.rip = CODE_BASE; + regs.rsp = MAIN_STACK; + regs.rflags = 0x02; + kvm_ioctl(vcpu_fd, KVM_SET_REGS, ®s); + + /* Run the guest */ + mmap_size = kvm_ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0); + + for (int i = 0; i < 100; i++) { + kvm_ioctl(vcpu_fd, KVM_RUN, 0); + + if (run->exit_reason == KVM_EXIT_HLT) + break; + + if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR && + run->exit_reason != KVM_EXIT_DEBUG) { + fprintf(stderr, "Unexpected exit reason: %d\n", run->exit_reason); + return 1; + } + } + + /* Check results */ + result = (u32 *)(mem + RESULT_BASE); + if (result[1] != 0xCAFE) { + fprintf(stderr, "FAIL: #DB handler did not run (marker=0x%x)\n", result[1]); + return 1; + } + + printf("Saved EIP: 0x%x, expected: 0x%x\n", result[0], expected_eip); + if (result[0] == expected_eip) { + printf("PASS: EIP points after ICEBP\n"); + return 0; + } else if (result[0] == expected_eip - 1) { + printf("FAIL: EIP points AT ICEBP (known SVM bug with task gates)\n"); + return 1; + } else { + printf("FAIL: EIP is unexpected\n"); + return 1; + } +} -- 2.43.0