From: Cong Wang

This patch introduces a dedicated trampoline mechanism for booting
secondary CPUs with different kernel instances in multikernel mode.

The implementation provides:

- New trampoline_64_bsp.S assembly code for the real-mode to long-mode
  transition when launching kernels on secondary CPUs
- Trampoline memory allocation and setup in low memory (<1MB) for
  real-mode execution compatibility
- Page table construction for identity mapping during CPU bootstrap
- Integration with the existing multikernel kexec infrastructure

The trampoline handles the complete CPU initialization sequence from
16-bit real mode through 32-bit protected mode to 64-bit long mode,
setting up the appropriate GDT, page tables, and control registers
before jumping to the target kernel's entry point, without resetting
the whole system or disturbing the running kernel.

Note: This implementation uses legacy assembly-based trampoline code
and should be migrated to the C-based x86 trampoline in a future
update.

Signed-off-by: Cong Wang
---
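Note for reviewers: the TRAMPOLINE_SYM_BSP() and trampoline_bsp_address()
helpers used in the smpboot.c hunks below are not defined anywhere in this
diff. A minimal sketch of what they could look like, modeled on the old
TRAMPOLINE_SYM() helper from the pre-realmode trampoline era; everything
in this sketch is an assumption for illustration, not part of the patch:

	/* Linker symbols from vmlinux.lds.S; base set by setup_trampolines_bsp() */
	extern unsigned char x86_trampoline_bsp_start[];
	extern unsigned char x86_trampoline_bsp_end[];
	extern unsigned char *x86_trampoline_bsp_base;

	/*
	 * Translate the link-time address of a symbol in the
	 * .x86_trampoline_bsp section into its address within the
	 * low-memory copy made by setup_trampolines_bsp().
	 */
	#define TRAMPOLINE_SYM_BSP(x)					\
		((void *)(x86_trampoline_bsp_base +			\
			  ((unsigned long)(x) -				\
			   (unsigned long)x86_trampoline_bsp_start)))

	/* Physical address of the copied trampoline entry point;
	 * virt_to_phys() from <asm/io.h>. */
	static inline unsigned long trampoline_bsp_address(void)
	{
		return virt_to_phys(x86_trampoline_bsp_base);
	}
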
 arch/x86/kernel/Makefile            |   1 +
 arch/x86/kernel/head64.c            |   5 +
 arch/x86/kernel/setup.c             |   3 +
 arch/x86/kernel/smpboot.c           |  87 +++++++--
 arch/x86/kernel/trampoline_64_bsp.S | 288 ++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux.lds.S       |   6 +
 6 files changed, 375 insertions(+), 15 deletions(-)
 create mode 100644 arch/x86/kernel/trampoline_64_bsp.S

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0d2a6d953be9..ac89d82bf25b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -50,6 +50,7 @@ CFLAGS_irq.o := -I $(src)/../include/asm/trace
 
 obj-y			+= head_$(BITS).o
 obj-y			+= head$(BITS).o
+obj-y			+= trampoline_64_bsp.o
 obj-y			+= ebda.o
 obj-y			+= platform-quirks.o
 obj-y			+= process_$(BITS).o signal.o signal_$(BITS).o
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 533fcf5636fc..4097101011d2 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -216,6 +216,9 @@ static void __init copy_bootdata(char *real_mode_data)
 	sme_unmap_bootdata(real_mode_data);
 }
 
+unsigned long orig_boot_params;
+EXPORT_SYMBOL(orig_boot_params);
+
 asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data)
 {
 	/*
@@ -285,6 +288,8 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode
 	/* set init_top_pgt kernel high mapping*/
 	init_top_pgt[511] = early_top_pgt[511];
 
+	orig_boot_params = (unsigned long)real_mode_data;
+
 	x86_64_start_reservations(real_mode_data);
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1b2edd07a3e1..8342c4e46bad 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -877,6 +877,8 @@ static void __init x86_report_nx(void)
  * Note: On x86_64, fixmaps are ready for use even before this is called.
  */
 
+extern void __init setup_trampolines_bsp(void);
+
 void __init setup_arch(char **cmdline_p)
 {
 #ifdef CONFIG_X86_32
@@ -1103,6 +1105,7 @@ void __init setup_arch(char **cmdline_p)
 			(max_pfn_mapped<<PAGE_SHIFT) - 1);
 
+	setup_trampolines_bsp();
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -…,… +…,… @@ static int do_multikernel_boot_cpu(u32 apicid, int cpu, unsigned long kernel_sta
 {
-	unsigned long start_ip = real_mode_header->trampoline_start;
+	unsigned long start_ip;
 	int ret;
 
-	pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address);
-#ifdef CONFIG_X86_64
-	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
-	if (apic->wakeup_secondary_cpu_64)
-		start_ip = real_mode_header->trampoline_start64;
-#endif
-	//initial_code = (unsigned long)start_secondary;
-	initial_code = (unsigned long)kernel_start_address;
+	/*
+	 * Multikernel -- set the physical address where the kernel has
+	 * been copied.  Note that this needs to be written to the
+	 * location where the trampoline was copied, not to the location
+	 * within the original kernel image itself.
+	 */
+	unsigned long *kernel_virt_addr = TRAMPOLINE_SYM_BSP(&kernel_phys_addr);
+	unsigned long *boot_params_virt_addr = TRAMPOLINE_SYM_BSP(&boot_params_phys_addr);
 
-	if (IS_ENABLED(CONFIG_X86_32)) {
-		early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
-		//initial_stack = idle->thread.sp;
-	} else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
-		smpboot_control = cpu;
-	}
+	*kernel_virt_addr = kernel_start_address;
+	*boot_params_virt_addr = orig_boot_params;
+
+	/* start_ip had better be page-aligned! */
+	start_ip = trampoline_bsp_address();
+
+	pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address);
 
 	/* Skip init_espfix_ap(cpu); */
 
@@ -897,6 +918,9 @@ static int do_multikernel_boot_cpu(u32 apicid, int cpu, unsigned long kernel_sta
 	/* If the wakeup mechanism failed, cleanup the warm reset vector */
 	if (ret)
 		arch_cpuhp_cleanup_kick_cpu(cpu);
+
+	/* mark "stuck" area as not stuck */
+	*(volatile u32 *)TRAMPOLINE_SYM_BSP(trampoline_status_bsp) = 0;
 
 	return ret;
 }
 
 /*
@@ -1008,6 +1032,39 @@ int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address)
 	return err;
 }
 
+void __init setup_trampolines_bsp(void)
+{
+	phys_addr_t mem;
+	size_t size = PAGE_ALIGN(x86_trampoline_bsp_end - x86_trampoline_bsp_start);
+
+	/* Has to be in very low memory so we can execute real-mode AP code. */
+	mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20);
+	if (!mem)
+		panic("Cannot allocate trampoline\n");
+
+	x86_trampoline_bsp_base = __va(mem);
+	memblock_reserve(mem, size);
+
+	printk(KERN_DEBUG "Base memory trampoline BSP at [%p] %llx size %zu\n",
+	       x86_trampoline_bsp_base, (unsigned long long)mem, size);
+
+	/*
+	 * On a multikernel (mklinux) boot the BSP trampoline would not
+	 * need to be copied; unconditionally copy it here for now.
+	 */
+	memcpy(x86_trampoline_bsp_base, trampoline_data_bsp, size);
+}
+
+static int __init configure_trampolines_bsp(void)
+{
+	size_t size = PAGE_ALIGN(x86_trampoline_bsp_end - x86_trampoline_bsp_start);
+
+	set_memory_x((unsigned long)x86_trampoline_bsp_base, size >> PAGE_SHIFT);
+	return 0;
+}
+
+arch_initcall(configure_trampolines_bsp);
 
 int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
 {
diff --git a/arch/x86/kernel/trampoline_64_bsp.S b/arch/x86/kernel/trampoline_64_bsp.S
new file mode 100644
index 000000000000..0bd2a971a973
--- /dev/null
+++ b/arch/x86/kernel/trampoline_64_bsp.S
@@ -0,0 +1,288 @@
+/*
+ *	Derived from Setup.S by Linus Torvalds, then derived from
+ *	Popcorn Linux.
+ *
+ *	4 Jan 1997 Michael Chastain: changed to gnu as.
+ *	15 Sept 2005 Eric Biederman: 64bit PIC support
+ *
+ *	Entry: CS:IP point to the start of our code, we are
+ *	in real mode with no stack, but the rest of the
+ *	trampoline page to make our stack and everything else
+ *	is a mystery.
+ *
+ *	On entry to trampoline_data_bsp, the processor is in real mode
+ *	with 16-bit addressing and 16-bit data.  CS has some value
+ *	and IP is zero.  Thus, data addresses need to be absolute
+ *	(no relocation) and are taken with regard to bsp_base.
+ *
+ *	With the addition of the boot page tables (pgtable_bsp) this
+ *	code can now enter a 64bit kernel that lives at arbitrary
+ *	64bit physical addresses.
+ *
+ *	If you work on this file, check the object module with objdump
+ *	--full-contents --reloc to make sure there are no relocation
+ *	entries.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
+#include <asm/msr.h>
+#include <asm/segment.h>
+#include <asm/processor-flags.h>
+
+	.section ".x86_trampoline_bsp","a"
+	.balign PAGE_SIZE
+	.code16
+
+SYM_CODE_START(trampoline_data_bsp)
+bsp_base = .
+	cli				# We should be safe anyway
+	wbinvd
+	mov	%cs, %ax		# Code and data in the same place
+	mov	%ax, %ds
+	mov	%ax, %es
+	mov	%ax, %ss
+
+	movl	$0xA5A5A5A5, trampoline_status_bsp - bsp_base
+					# write marker so the master knows we're running
+
+	# Setup stack
+	movw	$(trampoline_stack_bsp_end - bsp_base), %sp
+
+	# call	verify_cpu		# Verify the cpu supports long mode
+	# testl	%eax, %eax		# Check for return code
+	# jnz	no_longmode_bsp
+
+	mov	%cs, %ax
+	movzx	%ax, %esi		# Find the 32bit trampoline location
+	shll	$4, %esi
+
+	# Fixup the absolute vectors
+	leal	(startup_32_bsp - bsp_base)(%esi), %eax
+	movl	%eax, startup_32_vector_bsp - bsp_base
+	leal	(startup_64_bsp - bsp_base)(%esi), %eax
+	movl	%eax, startup_64_vector_bsp - bsp_base
+	leal	(tgdt_bsp - bsp_base)(%esi), %eax
+	movl	%eax, (tgdt_bsp + 2 - bsp_base)
+
+	/*
+	 * With the kernel at a non-default location the GDT can live
+	 * beyond 16MB, and lgdt cannot load such an address because the
+	 * default operand size in real mode is 16 bit.  Use lgdtl
+	 * instead to force a 32-bit operand size.
+	 */
+	lidtl	tidt_bsp - bsp_base	# load idt with 0, 0
+	lgdtl	tgdt_bsp - bsp_base	# load gdt with whatever is appropriate
+
+	mov	$X86_CR0_PE, %ax	# protected mode (PE) bit
+	lmsw	%ax			# into protected mode
+
+	# flush prefetch and jump to startup_32_bsp
+	ljmpl	*(startup_32_vector_bsp - bsp_base)
+SYM_CODE_END(trampoline_data_bsp)
+
+	.code32
+	.balign 4
+startup_32_bsp:
+	cli
+	movl	$(__KERNEL_DS), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+	/*
+	 * Load the new GDT with the 64bit segments using a 32bit
+	 * descriptor.  The new GDT labels the entire address space as
+	 * 64-bit, so we can switch into long mode later.
+	 */
+	leal	(gdt_bsp_64 - bsp_base)(%esi), %eax
+	movl	%eax, (gdt_bsp_64 - bsp_base + 2)(%esi)
+	lgdt	(gdt_bsp_64 - bsp_base)(%esi)
+
+	/*
+	 * Enable PAE mode.  Note that this does not actually take
+	 * effect until paging is enabled.
+	 */
+	movl	%cr4, %eax
+	orl	$(X86_CR4_PAE), %eax
+	movl	%eax, %cr4
+
+	/* Initialize the six page-table pages to 0 */
+	leal	(pgtable_bsp - bsp_base)(%esi), %edi
+	xorl	%eax, %eax
+	movl	$((4096*6)/4), %ecx
+	rep	stosl
+
+	/* Build Level 4: one PML4 entry pointing at the PDPT (+0x1000),
+	 * flags 0x7 = present | writable | user */
+	leal	(pgtable_bsp - bsp_base)(%esi), %edi
+	leal	0x1007(%edi), %eax
+	movl	%eax, 0(%edi)
+
+	/* Build Level 3: four PDPT entries -> the four page directories */
+	leal	(pgtable_bsp - bsp_base + 0x1000)(%esi), %edi
+	leal	0x1007(%edi), %eax
+	movl	$4, %ecx
+1:	movl	%eax, 0x00(%edi)
+	addl	$0x00001000, %eax
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/* Build Level 2: 2048 x 2MB pages = 4GB identity-mapped,
+	 * flags 0x183 = present | writable | PSE (2MB) | global */
+	leal	(pgtable_bsp - bsp_base + 0x2000)(%esi), %edi
+	movl	$0x00000183, %eax
+	movl	$2048, %ecx
+1:	movl	%eax, 0(%edi)
+	addl	$0x00200000, %eax
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/* Enable the boot page tables */
+	leal	(pgtable_bsp - bsp_base)(%esi), %eax
+	movl	%eax, %cr3
+
+	/* Enable Long mode in EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_LME, %eax
+	wrmsr
+
+	/*
+	 * Setup for the jump to 64bit mode
+	 *
+	 * When the jump is performed we will be in long mode but
+	 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
+	 * (and in turn EFER.LMA = 1).  To jump into 64bit mode we use
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 * We place all of the values on our mini stack so lret can be
+	 * used to perform that far jump.
+	 */
+	pushl	$__KERNEL_CS
+	leal	(startup_64_bsp - bsp_base)(%esi), %eax
+	pushl	%eax
+
+	/* Enter paged protected mode, activating long mode */
+	movl	$(X86_CR0_PG | X86_CR0_PE), %eax /* Enable paging and protected mode */
+	movl	%eax, %cr0
+
+	/* Jump from 32bit compatibility mode into 64bit mode. */
+	lret
+
+	.code64
+	.balign 4
+startup_64_bsp:
+	/* Get physical address of boot_params structure */
+	movq	(boot_params_phys_addr - bsp_base)(%rsi), %r15
+
+	/* Load kernel address into register */
+	movq	(kernel_phys_addr - bsp_base)(%rsi), %r14
+
+	/*
+	 * Check whether the kernel is within the 4GB we mapped already,
+	 * and if not, add an additional mapping.
+	 */
+	movq	$0xffffffff00000000, %r8
+	testq	%r8, %r14
+	je	2f
+
+	/* If we got here, we need to identity-map an additional 1GB */
+
+	/* Mask off to figure out what our directory pointer should be */
+	movq	%r14, %r13
+	movq	$0xffffffffc0000000, %r12
+	andq	%r12, %r13
+
+	/* Set our PDPTE: byte offset into the PDPT is (addr >> 30) * 8 */
+	movq	%r13, %r11
+	shrq	$(30-3), %r11
+	leaq	(pgtable_bsp - bsp_base + 0x1000)(%rsi), %rdi
+	addq	%r11, %rdi
+	leaq	(pgtable_extra_bsp - bsp_base + 0x7)(%rsi), %rax
+	movq	%rax, 0(%rdi)
+
+	/* Populate the page directory: 512 x 2MB = 1GB */
+	leaq	(pgtable_extra_bsp - bsp_base)(%rsi), %rdi
+	movq	$0x00000183, %rax
+	addq	%r13, %rax
+	movq	$512, %rcx
+1:	movq	%rax, 0(%rdi)
+	addq	$0x00200000, %rax
+	addq	$8, %rdi
+	decq	%rcx
+	jnz	1b
+
+	/* Set rsi to point to the boot_params structure */
+2:	movq	%r15, %rsi
+	jmp	*%r14
+
+	.align 8
+SYM_DATA(boot_params_phys_addr, .quad 0)
+
+	.align 8
+SYM_DATA(kernel_phys_addr, .quad 0)
+
+	.code16
+	.balign 4
+	# Careful: these need to be in the same 64K segment as the above
+tidt_bsp:
+	.word	0			# idt limit = 0
+	.word	0, 0			# idt base = 0L
+
+	# Duplicate the global descriptor table
+	# so the kernel can live anywhere
+	.balign 4
+tgdt_bsp:
+	.short	tgdt_bsp_end - tgdt_bsp	# gdt limit
+	.long	tgdt_bsp - bsp_base
+	.short	0
+	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
+	.quad	0x00af9b000000ffff	# __KERNEL_CS
+	.quad	0x00cf93000000ffff	# __KERNEL_DS
+tgdt_bsp_end:
+
+	.code64
+	.balign 4
+gdt_bsp_64:
+	.word	gdt_bsp_64_end - gdt_bsp_64
+	.long	gdt_bsp_64 - bsp_base
+	.word	0
+	.quad	0x0000000000000000	/* NULL descriptor */
+	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
+	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
+	.quad	0x0080890000000000	/* TS descriptor */
+	.quad	0x0000000000000000	/* TS continued */
+gdt_bsp_64_end:
+
+	.code16
+	.balign 4
+startup_32_vector_bsp:
+	.long	startup_32_bsp - bsp_base
+	.word	__KERNEL32_CS, 0
+
+	.balign 4
+startup_64_vector_bsp:
+	.long	startup_64_bsp - bsp_base
+	.word	__KERNEL_CS, 0
+
+	.balign 4
+SYM_DATA(trampoline_status_bsp, .long 0)
+
+	.balign 4
+SYM_DATA(trampoline_location, .quad 0)
+
+trampoline_stack_bsp:
+	.fill	512, 8, 0
+trampoline_stack_bsp_end:
+
+SYM_DATA(trampoline_bsp_end)
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+	.balign 4096
+pgtable_bsp:
+	.fill	6*4096, 1, 0
+pgtable_extra_bsp:
+	.fill	1*4096, 1, 0
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4fa0be732af1..86f4fd37dc18 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -231,6 +231,12 @@ SECTIONS
 
 	INIT_DATA_SECTION(16)
 
+	.x86_trampoline_bsp : AT(ADDR(.x86_trampoline_bsp) - LOAD_OFFSET) {
+		x86_trampoline_bsp_start = .;
+		*(.x86_trampoline_bsp)
+		x86_trampoline_bsp_end = .;
+	}
+
 	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
 		__x86_cpu_dev_start = .;
 		*(.x86_cpu_dev.init)
-- 
2.34.1
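
P.S. For readers following the page-table construction above: startup_32_bsp
builds a six-page identity mapping of the first 4GB (one PML4 page, one PDPT
page with four entries, and four page-directory pages of 512 2MB entries
each), and startup_64_bsp patches in one extra 1GB mapping backed by
pgtable_extra_bsp when the target kernel lies above 4GB. The same arithmetic
rendered in C, as a minimal illustration only -- none of these names are part
of the patch:

	/* Illustration only -- mirrors the trampoline's identity mapping. */
	#define PMD_2M_FLAGS	0x183ULL  /* present | writable | 2MB page | global */

	/*
	 * Byte offset of the PDPT entry covering physical address 'phys'
	 * (cf. "shrq $(30-3)" above): index = phys >> 30, offset = index * 8.
	 */
	static unsigned long long pdpte_byte_offset(unsigned long long phys)
	{
		return (phys >> 30) << 3;
	}

	/*
	 * Fill one page directory so it identity-maps the 1GB region
	 * containing 'phys' with 512 2MB entries (cf. the "Populate the
	 * page directory" loop in startup_64_bsp).
	 */
	static void fill_pd_identity_1g(unsigned long long pd[512],
					unsigned long long phys)
	{
		/* 1GB-align, as "andq $0xffffffffc0000000" does above */
		unsigned long long base = phys & ~((1ULL << 30) - 1);
		int i;

		for (i = 0; i < 512; i++)
			pd[i] = (base + i * 0x200000ULL) | PMD_2M_FLAGS;
	}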