From: Cong Wang This patch extends the kexec subsystem to support multikernel functionality, allowing different kernel instances to be loaded and executed on specific CPUs. The implementation introduces: - New KEXEC_TYPE_MULTIKERNEL type and KEXEC_MULTIKERNEL flag - multikernel_kick_ap() function for CPU-specific kernel booting - LINUX_REBOOT_CMD_MULTIKERNEL reboot command with CPU parameter - Specialized segment loading for multikernel images using memremap - Integration with existing kexec infrastructure, bypassing the standard machine_kexec_prepare() to avoid resetting the running system The multikernel_kexec() function validates CPU availability and uses the existing kexec image start address to boot the target CPU with a different kernel instance. This enables heterogeneous computing scenarios where different CPUs can run specialized kernel variants. Signed-off-by: Cong Wang --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 104 ++++++++++++++++++++++++++++++++++ include/linux/kexec.h | 6 +- include/uapi/linux/kexec.h | 1 + include/uapi/linux/reboot.h | 2 +- kernel/kexec_core.c | 109 ++++++++++++++++++++++++++++++++++++ kernel/reboot.c | 10 ++++ 7 files changed, 230 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 22bfebe6776d..1a59fd0de759 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -107,6 +107,7 @@ void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); +int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address); int native_cpu_disable(void); void __noreturn hlt_play_dead(void); void native_play_dead(void); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index eb289abece23..7b27fdc1d169 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -833,6 +833,72 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } +/* Must be called with cpus_read_lock() held. */ +static int do_multikernel_boot_cpu(u32 apicid, int cpu, unsigned long kernel_start_address) +{ + unsigned long start_ip = real_mode_header->trampoline_start; + int ret; + + pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address); +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif + /* Unlike native AP bringup, enter the new kernel image, not start_secondary(). */ + initial_code = (unsigned long)kernel_start_address; + + if (IS_ENABLED(CONFIG_X86_32)) { + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); + /* There is no idle task for this CPU, so initial_stack is left unset. */ + } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) { + smpboot_control = cpu; + } + + /* Skip init_espfix_ap(cpu); */ + + /* Skip announce_cpu(cpu, apicid); */ + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + if (x86_platform.legacy.warm_reset) { + + pr_debug("Setting warm reset code and vector.\n"); + + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors.
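+ * (Descriptive note: on integrated APICs the ESR is write-before-read, so the write below refreshes the register and stale error bits from before this wakeup are cleared rather than misread.)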
+ */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + } + + smp_mb(); + + /* + * Wake up a CPU in different cases: + * - Use a method from the APIC driver if one is defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. + * Otherwise, + * - Use an INIT boot APIC message + */ + if (apic->wakeup_secondary_cpu_64) + ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu); + else if (apic->wakeup_secondary_cpu) + ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu); + else + ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu); + + pr_debug("do_multikernel_boot_cpu() done\n"); + /* If the wakeup mechanism failed, cleanup the warm reset vector */ + if (ret) + arch_cpuhp_cleanup_kick_cpu(cpu); + return ret; +} /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -905,6 +971,44 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle) return ret; } +/* Must be called with cpus_read_lock() held. */ +int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address) +{ + u32 apicid = apic->cpu_present_to_apicid(cpu); + int err; + + lockdep_assert_irqs_enabled(); + + pr_info("multikernel: bringing up CPU %u\n", cpu); + + if (apicid == BAD_APICID || !apic_id_valid(apicid)) { + pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid); + return -EINVAL; + } + + if (!test_bit(apicid, phys_cpu_present_map)) { + pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid); + return -EINVAL; + } + + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + /* the FPU context is blank, nobody can own it */ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + /* skip common_cpu_up(cpu, tidle); */ + + err = do_multikernel_boot_cpu(apicid, cpu, kernel_start_address); + if (err) + pr_err("do_multikernel_boot_cpu failed (%d) to wake up CPU#%u\n", err, cpu); + + return err; +} + + int native_kick_ap(unsigned int cpu, struct task_struct *tidle) { u32 apicid = apic->cpu_present_to_apicid(cpu); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ff7e231b0485..edf64bc98ed5 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -358,9 +358,10 @@ struct kimage { unsigned long control_page; /* Flags to indicate special processing */ - unsigned int type : 1; + unsigned int type : 2; #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 +#define KEXEC_TYPE_MULTIKERNEL 2 unsigned int preserve_context : 1; /* If set, we are using file mode kexec syscall */ unsigned int file_mode:1; @@ -437,6 +438,7 @@ extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); +extern int multikernel_kexec(int cpu); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); @@ -458,7 +460,7 @@ bool kexec_load_permitted(int kexec_image_type); #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT) #else #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \ - KEXEC_CRASH_HOTPLUG_SUPPORT) + KEXEC_CRASH_HOTPLUG_SUPPORT | KEXEC_MULTIKERNEL) #endif /* List of defined/legal kexec file flags */ diff --git a/include/uapi/linux/kexec.h
b/include/uapi/linux/kexec.h index 55749cb0b81d..346e0ff4e663 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -14,6 +14,7 @@ #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_UPDATE_ELFCOREHDR 0x00000004 #define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008 +#define KEXEC_MULTIKERNEL 0x00000010 #define KEXEC_ARCH_MASK 0xffff0000 /* diff --git a/include/uapi/linux/reboot.h b/include/uapi/linux/reboot.h index 58e64398efc5..aac2f2f94a98 100644 --- a/include/uapi/linux/reboot.h +++ b/include/uapi/linux/reboot.h @@ -34,7 +34,7 @@ #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 #define LINUX_REBOOT_CMD_KEXEC 0x45584543 - +#define LINUX_REBOOT_CMD_MULTIKERNEL 0x4D4B4C49 #endif /* _UAPI_LINUX_REBOOT_H */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fa00b239c5d9..7d89d00e2cde 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -942,6 +943,84 @@ static int kimage_load_crash_segment(struct kimage *image, int idx) } #endif +static int kimage_load_multikernel_segment(struct kimage *image, int idx) +{ + /* For multikernel we simply copy the data from + * user space to its destination. + * We do things a page at a time for the sake of the + * per-page memremap. + */ + struct kexec_segment *segment = &image->segment[idx]; + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + pr_info("Loading multikernel segment: mem=0x%lx, memsz=0x%zx, buf=0x%px, bufsz=0x%zx\n", + maddr, mbytes, buf, ubytes); + while (mbytes) { + char *ptr; + size_t uchunk, mchunk; + unsigned long page_addr = maddr & PAGE_MASK; + unsigned long page_offset = maddr & ~PAGE_MASK; + + /* Use memremap to map the physical address */ + ptr = memremap(page_addr, PAGE_SIZE, MEMREMAP_WB); + if (!ptr) { + pr_err("Failed to memremap memory at 0x%lx\n", page_addr); + result = -ENOMEM; + goto out; + } + + /* Adjust pointer to the offset within the page */ + ptr += page_offset; + + /* Calculate chunk sizes */ + mchunk = min_t(size_t, mbytes, PAGE_SIZE - page_offset); + uchunk = min(ubytes, mchunk); + + /* Zero the trailing part of the page if needed */ + if (mchunk > uchunk) { + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + /* Clean up */ + memunmap(ptr - page_offset); + if (result) { + result = -EFAULT; + goto out; + } + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } +out: + return result; +} int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; @@ -955,6 +1034,9 @@ int kimage_load_segment(struct kimage *image, int idx) result = kimage_load_crash_segment(image, idx); break; #endif + case KEXEC_TYPE_MULTIKERNEL: + result = kimage_load_multikernel_segment(image, idx); + break; } return result; @@ -1229,3 +1311,30 @@ int kernel_kexec(void) kexec_unlock(); return error; } + +int multikernel_kexec(int cpu) +{ + int rc; + + pr_info("multikernel kexec: cpu %d\n", cpu); + + if (cpu_online(cpu)) { + pr_err("CPU %d is already running this kernel instance\n", cpu);
+ return -EBUSY; + } + + if (!kexec_trylock()) + return -EBUSY; + if (!kexec_image) { + rc = -EINVAL; + goto unlock; + } + + cpus_read_lock(); + rc = multikernel_kick_ap(cpu, kexec_image->start); + cpus_read_unlock(); + +unlock: + kexec_unlock(); + return rc; +} diff --git a/kernel/reboot.c b/kernel/reboot.c index ec087827c85c..f3ac703c4695 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -717,6 +717,10 @@ EXPORT_SYMBOL_GPL(kernel_power_off); DEFINE_MUTEX(system_transition_mutex); +struct multikernel_boot_args { + int cpu; +}; + /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -729,6 +733,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { struct pid_namespace *pid_ns = task_active_pid_ns(current); + struct multikernel_boot_args boot_args; char buffer[256]; int ret = 0; @@ -799,6 +804,11 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, case LINUX_REBOOT_CMD_KEXEC: ret = kernel_kexec(); break; + case LINUX_REBOOT_CMD_MULTIKERNEL: + if (copy_from_user(&boot_args, arg, sizeof(boot_args))) + return -EFAULT; + ret = multikernel_kexec(boot_args.cpu); + break; #endif #ifdef CONFIG_HIBERNATION -- 2.34.1 From: Cong Wang This patch introduces a dedicated trampoline mechanism for booting secondary CPUs with different kernel instances in multikernel mode. The implementation provides: - New trampoline_64_bsp.S assembly code for real-mode to long-mode transition when launching kernels on secondary CPUs - Trampoline memory allocation and setup in low memory (<1MB) for real-mode execution compatibility - Page table construction for identity mapping during CPU bootstrap - Integration with existing multikernel kexec infrastructure The trampoline handles the complete CPU initialization sequence from 16-bit real mode through 32-bit protected mode to 64-bit long mode, setting up an appropriate GDT, page tables, and control registers before jumping to the target kernel entry point, without resetting the whole system or the running kernel. Note: This implementation uses legacy assembly-based trampoline code and should be migrated to the C-based x86 trampoline in future updates.
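For context, here is a rough sketch of how user space is expected to drive the interface added in the previous patch (not part of the series; segment setup is elided, and struct multikernel_boot_args simply mirrors the definition added to kernel/reboot.c there):

	/* Sketch: load an image with KEXEC_MULTIKERNEL, then boot it on an
	 * offline CPU via the new reboot command. Error handling omitted. */
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/reboot.h>	/* patched: LINUX_REBOOT_CMD_MULTIKERNEL */

	struct multikernel_boot_args { int cpu; };	/* mirrors kernel/reboot.c */

	int main(void)
	{
		struct multikernel_boot_args args = { .cpu = 3 };

		/* Segments are prepared beforehand, as for a regular
		 * kexec_load() call, passing KEXEC_MULTIKERNEL in the flags. */
		return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
			       LINUX_REBOOT_MAGIC2,
			       LINUX_REBOOT_CMD_MULTIKERNEL, &args);
	}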
Signed-off-by: Cong Wang --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head64.c | 5 + arch/x86/kernel/setup.c | 3 + arch/x86/kernel/smpboot.c | 85 ++++++-- arch/x86/kernel/trampoline_64_bsp.S | 288 ++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 6 + 6 files changed, 373 insertions(+), 15 deletions(-) create mode 100644 arch/x86/kernel/trampoline_64_bsp.S diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bc184dd38d99..49ff8272c888 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -50,6 +50,7 @@ CFLAGS_irq.o := -I $(src)/../include/asm/trace obj-y += head_$(BITS).o obj-y += head$(BITS).o +obj-y += trampoline_64_bsp.o obj-y += ebda.o obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fd28b53dbac5..60005472facb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -219,6 +219,9 @@ static void __init copy_bootdata(char *real_mode_data) sme_unmap_bootdata(real_mode_data); } +unsigned long orig_boot_params; +EXPORT_SYMBOL(orig_boot_params); + asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data) { /* @@ -288,6 +291,8 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode /* set init_top_pgt kernel high mapping*/ init_top_pgt[511] = early_top_pgt[511]; + orig_boot_params = (unsigned long) real_mode_data; + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1b2edd07a3e1..8342c4e46bad 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -877,6 +877,8 @@ static void __init x86_report_nx(void) * Note: On x86_64, fixmaps are ready for use even before this is called. */ +extern void __init setup_trampolines_bsp(void); + void __init setup_arch(char **cmdline_p) { #ifdef CONFIG_X86_32 @@ -1103,6 +1105,7 @@ void __init setup_arch(char **cmdline_p) (max_pfn_mapped<trampoline_start; + unsigned long start_ip; int ret; - pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address); -#ifdef CONFIG_X86_64 - /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ - if (apic->wakeup_secondary_cpu_64) - start_ip = real_mode_header->trampoline_start64; -#endif - //initial_code = (unsigned long)start_secondary; - initial_code = (unsigned long)kernel_start_address; + /* Multikernel -- set physical address where kernel has been copied. + Note that this needs to be written to the location where the + trampoline was copied, not to the location within the original + kernel itself. */ + unsigned long *kernel_virt_addr = TRAMPOLINE_SYM_BSP(&kernel_phys_addr); - if (IS_ENABLED(CONFIG_X86_32)) { - early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); - //initial_stack = idle->thread.sp; - } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) { - smpboot_control = cpu; - } + *kernel_virt_addr = kernel_start_address; + + /* start_ip had better be page-aligned! 
*/ + start_ip = trampoline_bsp_address(); + + pr_info("do_multikernel_boot_cpu(apicid=%u, cpu=%u, kernel_start_address=%lx)\n", apicid, cpu, kernel_start_address); /* Skip init_espfix_ap(cpu); */ @@ -897,6 +916,9 @@ static int do_multikernel_boot_cpu(u32 apicid, int cpu, unsigned long kernel_sta /* If the wakeup mechanism failed, cleanup the warm reset vector */ if (ret) arch_cpuhp_cleanup_kick_cpu(cpu); + + /* mark "stuck" area as not stuck */ + *(volatile u32 *)TRAMPOLINE_SYM_BSP(trampoline_status_bsp) = 0; return ret; } /* @@ -1008,6 +1030,39 @@ int multikernel_kick_ap(unsigned int cpu, unsigned long kernel_start_address) return err; } +void __init setup_trampolines_bsp(void) +{ + phys_addr_t mem; + size_t size = PAGE_ALIGN(x86_trampoline_bsp_end - x86_trampoline_bsp_start); + + /* Has to be in very low memory so we can execute real-mode AP code. */ + mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20); + if (!mem) + panic("Cannot allocate trampoline\n"); + + x86_trampoline_bsp_base = __va(mem); + memblock_reserve(mem, size); + + printk(KERN_DEBUG "Base memory trampoline BSP at [%p] %llx size %zu\n", + x86_trampoline_bsp_base, (unsigned long long)mem, size); + + memcpy(x86_trampoline_bsp_base, trampoline_data_bsp, size); +} + +static int __init configure_trampolines_bsp(void) +{ + size_t size = PAGE_ALIGN(x86_trampoline_bsp_end - x86_trampoline_bsp_start); + + set_memory_x((unsigned long)x86_trampoline_bsp_base, size >> PAGE_SHIFT); + return 0; +} + +arch_initcall(configure_trampolines_bsp); int native_kick_ap(unsigned int cpu, struct task_struct *tidle) { diff --git a/arch/x86/kernel/trampoline_64_bsp.S b/arch/x86/kernel/trampoline_64_bsp.S new file mode 100644 index 000000000000..0bd2a971a973 --- /dev/null +++ b/arch/x86/kernel/trampoline_64_bsp.S @@ -0,0 +1,288 @@ +/* + * + * Derived from Setup.S by Linus Torvalds, then derived from Popcorn Linux + * + * 4 Jan 1997 Michael Chastain: changed to gnu as. + * 15 Sept 2005 Eric Biederman: 64bit PIC support + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page is available to make our stack and + * everything else is a mystery. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data. CS has some value + * and IP is zero. Thus, data addresses need to be absolute + * (no relocation) and are taken with regard to bsp_base. + * + * With the addition of trampoline_level4_pgt this code can + * now enter a 64bit kernel that lives at arbitrary 64bit + * physical addresses. + * + * If you work on this file, check the object module with objdump + * --full-contents --reloc to make sure there are no relocation + * entries. + */ + +#include +#include +#include +#include +#include +#include +#include + + .section ".x86_trampoline_bsp","a" + .balign PAGE_SIZE + .code16 + +SYM_CODE_START(trampoline_data_bsp) +bsp_base = .
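+ # Descriptive overview of the flow below: stage 1 (.code16) sets up
+ # segments, a stack and a GDT, then enters protected mode; stage 2
+ # (startup_32_bsp) enables PAE, builds identity-mapped page tables and
+ # sets EFER.LME; stage 3 (startup_64_bsp) optionally maps the target
+ # kernel's 1GB region and jumps to kernel_phys_addr with boot_params
+ # in %rsi.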
+ cli # We should be safe anyway + wbinvd + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + mov %ax, %es + mov %ax, %ss + + + movl $0xA5A5A5A5, trampoline_status_bsp - bsp_base + # write marker so the master knows we're running + + # Setup stack + movw $(trampoline_stack_bsp_end - bsp_base), %sp + + # call verify_cpu # Verify the cpu supports long mode + # testl %eax, %eax # Check for return code + # jnz no_longmode_bsp + + mov %cs, %ax + movzx %ax, %esi # Find the 32bit trampoline location + shll $4, %esi + + # Fixup the absolute vectors + leal (startup_32_bsp - bsp_base)(%esi), %eax + movl %eax, startup_32_vector_bsp - bsp_base + leal (startup_64_bsp - bsp_base)(%esi), %eax + movl %eax, startup_64_vector_bsp - bsp_base + leal (tgdt_bsp - bsp_base)(%esi), %eax + movl %eax, (tgdt_bsp + 2 - bsp_base) + + /* + * A GDT in a non-default location can be beyond 16MB, and lgdt cannot + * load such an address because the default operand size in real mode + * is 16 bit. Use lgdtl instead to force a 32-bit operand size. + */ + + lidtl tidt_bsp - bsp_base # load idt with 0, 0 + lgdtl tgdt_bsp - bsp_base # load gdt with whatever is appropriate + + mov $X86_CR0_PE, %ax # protected mode (PE) bit + lmsw %ax # into protected mode + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector_bsp - bsp_base) +SYM_CODE_END(trampoline_data_bsp) + + .code32 + .balign 4 +startup_32_bsp: + + cli + movl $(__KERNEL_DS), %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Load new GDT with the 64bit segments using 32bit descriptor. + * The new GDT labels the entire address space as 64-bit, so we + * can switch into long mode later. */ + leal (gdt_bsp_64 - bsp_base)(%esi), %eax + movl %eax, (gdt_bsp_64 - bsp_base + 2)(%esi) + lgdt (gdt_bsp_64 - bsp_base)(%esi) + + /* Enable PAE mode. Note that this does not actually take effect + * until paging is enabled */ + movl %cr4, %eax + orl $(X86_CR4_PAE), %eax + movl %eax, %cr4 + + /* Initialize Page tables to 0 */ + leal (pgtable_bsp - bsp_base)(%esi), %edi + xorl %eax, %eax + movl $((4096*6)/4), %ecx + rep stosl + + /* Build Level 4 */ + leal (pgtable_bsp - bsp_base)(%esi), %edi + leal 0x1007 (%edi), %eax + movl %eax, 0(%edi) + + /* Build Level 3 */ + leal (pgtable_bsp - bsp_base + 0x1000)(%esi), %edi + leal 0x1007(%edi), %eax + movl $4, %ecx +1: movl %eax, 0x00(%edi) + addl $0x00001000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Build Level 2 */ + leal (pgtable_bsp - bsp_base + 0x2000)(%esi), %edi + movl $0x00000183, %eax + movl $2048, %ecx +1: movl %eax, 0(%edi) + addl $0x00200000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Enable the boot page tables */ + leal (pgtable_bsp - bsp_base)(%esi), %eax + movl %eax, %cr3 + + /* Enable Long mode in EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* + * Setup for the jump to 64bit mode + * + * When the jump is performed we will be in long mode but + * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 + * (and in turn EFER.LMA = 1). To jump into 64bit mode we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + * We place all of the values on our mini stack so lret can + * be used to perform that far jump.
+ */ + pushl $__KERNEL_CS + leal (startup_64_bsp - bsp_base)(%esi), %eax + pushl %eax + + /* Enter paged protected Mode, activating Long Mode */ + movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */ + movl %eax, %cr0 + + /* Jump from 32bit compatibility mode into 64bit mode. */ + lret + + .code64 + .balign 4 +startup_64_bsp: + + /* Get physical address of boot_params structure */ + movq (boot_params_phys_addr - bsp_base)(%rsi), %r15 + + /* Load kernel address into register */ + movq (kernel_phys_addr - bsp_base)(%rsi), %r14 + + /* Check whether the kernel is in the 4 GB we mapped already, + * and if not, add an additional mapping */ + movq $0xffffffff00000000, %r8 + testq %r8, %r14 + je 2f + + /* If we got here, we need to identity-map an additional 1 GB */ + + /* Mask off to figure out what our directory pointer should be */ + movq %r14, %r13 + movq $0xffffffffc0000000, %r12 + andq %r12, %r13 + + /* Set our PDPTE */ + movq %r13, %r11 + shrq $(30-3), %r11 + leaq (pgtable_bsp - bsp_base + 0x1000)(%rsi), %rdi + addq %r11, %rdi + leaq (pgtable_extra_bsp - bsp_base + 0x7)(%rsi), %rax + movq %rax, 0(%rdi) + + /* Populate the page directory */ + leaq (pgtable_extra_bsp - bsp_base)(%rsi), %rdi + movq $0x00000183, %rax + addq %r13, %rax + movq $512, %rcx +1: movq %rax, 0(%rdi) + addq $0x00200000, %rax + addq $8, %rdi + decq %rcx + jnz 1b + + /* Set esi to point to the boot_params structure */ +2: movq %r15, %rsi + jmp *%r14 + + .align 8 +SYM_DATA(boot_params_phys_addr, .quad 0) + + .align 8 +SYM_DATA(kernel_phys_addr, .quad 0) + + .code16 + .balign 4 + # Careful these need to be in the same 64K segment as the above; +tidt_bsp: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 4 +tgdt_bsp: + .short tgdt_bsp_end - tgdt_bsp # gdt limit + .long tgdt_bsp - bsp_base + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tgdt_bsp_end: + + .code64 + .balign 4 +gdt_bsp_64: + .word gdt_bsp_64_end - gdt_bsp_64 + .long gdt_bsp_64 - bsp_base + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x0080890000000000 /* TS descriptor */ + .quad 0x0000000000000000 /* TS continued */ +gdt_bsp_64_end: + + .code16 + .balign 4 +startup_32_vector_bsp: + .long startup_32_bsp - bsp_base + .word __KERNEL32_CS, 0 + + .balign 4 +startup_64_vector_bsp: + .long startup_64_bsp - bsp_base + .word __KERNEL_CS, 0 + + .balign 4 +SYM_DATA(trampoline_status_bsp, .long 0) + + .balign 4 +SYM_DATA(trampoline_location, .quad 0) + +trampoline_stack_bsp: + .fill 512,8,0 +trampoline_stack_bsp_end: + +SYM_DATA(trampoline_bsp_end) + +/* + * Space for page tables (not in .bss so not zeroed) + */ + .balign 4096 +pgtable_bsp: + .fill 6*4096, 1, 0 +pgtable_extra_bsp: + .fill 1*4096, 1, 0 + diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d7af4a64c211..38e16ed2b3fb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -228,6 +228,12 @@ SECTIONS INIT_DATA_SECTION(16) + .x86_trampoline_bsp : AT(ADDR(.x86_trampoline_bsp) - LOAD_OFFSET) { + x86_trampoline_bsp_start = .; + *(.x86_trampoline_bsp) + x86_trampoline_bsp_end = .; + } + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) -- 2.34.1 From: Cong Wang This commit introduces: * 
Configuration infrastructure (kernel/multikernel/Kconfig) that adds a CONFIG_MULTIKERNEL option depending on KEXEC_CORE. It will provide a kernfs interface for multikernel instance management, device-tree-based resource management, physical memory pool allocation, and kexec integration. * Core initialization module (kernel/multikernel/core.c) that provides basic subsystem initialization using subsys_initcall() to ensure multikernel support is initialized after core kernel subsystems. This foundational commit establishes the basic framework that subsequent patches will build upon to implement the full multikernel functionality. Signed-off-by: Cong Wang --- kernel/Kconfig.kexec | 2 ++ kernel/Makefile | 1 + kernel/multikernel/Kconfig | 20 ++++++++++++++++++++ kernel/multikernel/Makefile | 6 ++++++ kernel/multikernel/core.c | 17 +++++++++++++++++ 5 files changed, 46 insertions(+) create mode 100644 kernel/multikernel/Kconfig create mode 100644 kernel/multikernel/Makefile create mode 100644 kernel/multikernel/core.c diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..e0fbd7e9af43 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -194,4 +194,6 @@ config CRASH_MAX_MEMORY_RANGES the computation behind the value provided through the /sys/kernel/crash_elfcorehdr_size attribute. +source "kernel/multikernel/Kconfig" + endmenu diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..017ed567f86a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -56,6 +56,7 @@ obj-y += dma/ obj-y += entry/ obj-y += unwind/ obj-$(CONFIG_MODULES) += module/ +obj-$(CONFIG_MULTIKERNEL) += multikernel/ obj-$(CONFIG_KCMP) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/multikernel/Kconfig b/kernel/multikernel/Kconfig new file mode 100644 index 000000000000..0e61fd2e505a --- /dev/null +++ b/kernel/multikernel/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Multikernel configuration +# + +config MULTIKERNEL + bool "Multikernel support" + depends on KEXEC_CORE + help + Enable multikernel support, which allows running multiple kernel + instances simultaneously with resource isolation and inter-kernel + communication capabilities. + + This feature provides: + - Kernfs interface for multikernel instance management + - Device tree based resource specification + - Memory pool management for kernel instances + - Integration with kexec for kernel loading + + If unsure, say N. diff --git a/kernel/multikernel/Makefile b/kernel/multikernel/Makefile new file mode 100644 index 000000000000..950bace927a0 --- /dev/null +++ b/kernel/multikernel/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for multikernel support +# + +obj-y += core.o diff --git a/kernel/multikernel/core.c b/kernel/multikernel/core.c new file mode 100644 index 000000000000..218424d59cc3 --- /dev/null +++ b/kernel/multikernel/core.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Multikernel Technologies, Inc. All rights reserved + */ +#include +#include +#include +#include + +static int __init multikernel_init(void) +{ + pr_info("Multikernel support initialized\n"); + return 0; +} + +/* Initialize multikernel after core kernel subsystems are ready */ +subsys_initcall(multikernel_init); -- 2.34.1 From: Cong Wang This patch adds a dedicated IPI vector (0xea) for multikernel communication, enabling different kernel instances running on separate CPUs to send interrupts to each other.
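As a rough illustration (not part of this patch), a sender built on this vector could look like the sketch below; apic->send_IPI() is the existing x86 APIC callback, mk_signal_cpu() is a hypothetical helper, and a real cross-kernel sender would also need a globally unique CPU identifier, which a later patch in this series introduces.

	/* Sketch: raise the shared multikernel vector on another CPU. */
	#include <asm/apic.h>
	#include <asm/irq_vectors.h>

	static void mk_signal_cpu(int cpu)
	{
		/* 'cpu' is a Linux CPU number as seen by *this* kernel. */
		apic->send_IPI(cpu, MULTIKERNEL_VECTOR);
	}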
The implementation includes: - MULTIKERNEL_VECTOR definition at interrupt vector 0xea - IDT entry declaration and registration for sysvec_multikernel - Interrupt handler sysvec_multikernel() with proper APIC EOI and IRQ statistics tracking - A placeholder handler body, to be extended with generic multikernel interrupt dispatch in later patches This vector provides the foundational interrupt mechanism required for implementing inter-kernel communication protocols in multikernel environments, where heterogeneous kernel instances coordinate while maintaining CPU-level isolation. Signed-off-by: Cong Wang --- arch/x86/include/asm/idtentry.h | 3 +++ arch/x86/include/asm/irq_vectors.h | 1 + arch/x86/kernel/idt.c | 3 +++ arch/x86/kernel/smp.c | 8 ++++++++ 4 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index abd637e54e94..d2c3f1ca481a 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -703,6 +703,9 @@ DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +# ifdef CONFIG_MULTIKERNEL +DECLARE_IDTENTRY_SYSVEC(MULTIKERNEL_VECTOR, sysvec_multikernel); +# endif #else # define fred_sysvec_reschedule_ipi NULL # define fred_sysvec_reboot NULL diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 47051871b436..478e2e2d188a 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,7 @@ * the host kernel. */ #define POSTED_MSI_NOTIFICATION_VECTOR 0xeb +#define MULTIKERNEL_VECTOR 0xea #define NR_VECTORS 256 diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f445bec516a0..5e6d03bb18b5 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -135,6 +135,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), +#ifdef CONFIG_MULTIKERNEL + INTG(MULTIKERNEL_VECTOR, asm_sysvec_multikernel), +#endif INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b014e6d229f9..59658fcd9037 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -272,6 +272,14 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); } +#ifdef CONFIG_MULTIKERNEL +DEFINE_IDTENTRY_SYSVEC(sysvec_multikernel) +{ + apic_eoi(); + inc_irq_stat(irq_call_count); +} +#endif /* CONFIG_MULTIKERNEL */ + static int __init nonmi_ipi_setup(char *str) { smp_no_nmi_ipi = true; -- 2.34.1 From: Cong Wang The traditional smp_processor_id() is a software-defined CPU ID which is only unique within the same kernel. With the multikernel architecture, we run multiple Linux kernels on different CPUs, hence the host kernel needs a globally unique CPU ID to manage them. The physical CPU ID is perfect for this case. This API will be used to globally distinguish CPUs across different multikernel instances.
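As a sketch of the intended use (mk_global_cpu_id() is a hypothetical helper, not part of this patch):

	/* Tag inter-kernel messages with an ID that is unique machine-wide:
	 * smp_processor_id() is only unique within one kernel instance,
	 * while the physical (APIC) ID is unique across all of them. */
	static int mk_global_cpu_id(void)
	{
		return arch_cpu_physical_id(smp_processor_id());
	}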
Signed-off-by: Cong Wang --- arch/x86/include/asm/smp.h | 6 ++++++ arch/x86/kernel/smp.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1a59fd0de759..378be65ceafa 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -40,6 +40,7 @@ struct smp_ops { void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); + int (*cpu_physical_id)(int cpu); }; /* Globals due to paravirt */ @@ -100,6 +101,11 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) smp_ops.send_call_func_ipi(mask); } +static inline int arch_cpu_physical_id(int cpu) +{ + return smp_ops.cpu_physical_id(cpu); +} + void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 59658fcd9037..e2eba09da7fc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -288,6 +288,11 @@ static int __init nonmi_ipi_setup(char *str) __setup("nonmi_ipi", nonmi_ipi_setup); +static int native_cpu_physical_id(int cpu) +{ + return cpu_physical_id(cpu); +} + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, @@ -305,6 +310,7 @@ struct smp_ops smp_ops = { .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, + .cpu_physical_id = native_cpu_physical_id, }; EXPORT_SYMBOL_GPL(smp_ops); -- 2.34.1 From: Cong Wang This commit introduces: * Memory pool reservation via mkkernel_pool= kernel parameter that supports both fixed address reservation (mkkernel_pool=size@addr) and dynamic allocation (mkkernel_pool=size). Uses memblock reservation during early boot to ensure memory availability. * Global memory pool management using gen_pool for runtime allocation and deallocation of physical memory chunks from the reserved pool. Provides thread-safe operations with mutex protection. * Per-instance memory pool management that allows creating dedicated memory pools for individual kernel instances. Each instance pool is carved out from the main multikernel pool and provides fine-grained allocation capabilities for IPI data, buffers, and other per-instance resources. * Integration with /proc/iomem resource hierarchy to provide visibility into multikernel memory usage and prevent conflicts with other kernel subsystems. This memory management system uses a two-tier approach: a main pool reserved during boot handles large allocations and spawning operations, while per-instance pools provide efficient small allocation services for runtime inter-kernel communication and instance-specific data structures such as kernel image and initramfs. Signed-off-by: Cong Wang --- include/linux/multikernel.h | 22 +++ kernel/multikernel/Makefile | 2 +- kernel/multikernel/mem.c | 376 ++++++++++++++++++++++++++++++++++++ 3 files changed, 399 insertions(+), 1 deletion(-) create mode 100644 include/linux/multikernel.h create mode 100644 kernel/multikernel/mem.c diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h new file mode 100644 index 000000000000..51c989139a75 --- /dev/null +++ b/include/linux/multikernel.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Multikernel Technologies, Inc. 
All rights reserved + */ +#ifndef _LINUX_MULTIKERNEL_H +#define _LINUX_MULTIKERNEL_H + +struct resource; + +extern phys_addr_t multikernel_alloc(size_t size); +extern void multikernel_free(phys_addr_t addr, size_t size); +extern struct resource *multikernel_get_pool_resource(void); +extern bool multikernel_pool_available(void); + +/* Per-instance memory pool management */ +extern void *multikernel_create_instance_pool(int instance_id, size_t pool_size, int min_alloc_order); +extern void multikernel_destroy_instance_pool(void *pool_handle); +extern phys_addr_t multikernel_instance_alloc(void *pool_handle, size_t size, size_t align); +extern void multikernel_instance_free(void *pool_handle, phys_addr_t addr, size_t size); +extern size_t multikernel_instance_pool_avail(void *pool_handle); + +#endif /* _LINUX_MULTIKERNEL_H */ diff --git a/kernel/multikernel/Makefile b/kernel/multikernel/Makefile index 950bace927a0..0dad7f2267f9 100644 --- a/kernel/multikernel/Makefile +++ b/kernel/multikernel/Makefile @@ -3,4 +3,4 @@ # Makefile for multikernel support # -obj-y += core.o +obj-y += core.o mem.o diff --git a/kernel/multikernel/mem.c b/kernel/multikernel/mem.c new file mode 100644 index 000000000000..dbc3363764d7 --- /dev/null +++ b/kernel/multikernel/mem.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Multikernel memory management + * + * Memory pool management for multikernel spawn kernels using gen_pool + * with mkkernel_pool= command line parameter + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global multikernel memory pool resource */ +struct resource multikernel_res = { + .name = "Multikernel Memory Pool", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .desc = IORES_DESC_RESERVED +}; + +/* Generic pool for runtime memory allocation */ +static struct gen_pool *multikernel_pool; + +static DEFINE_MUTEX(multikernel_mem_mutex); + +/** + * multikernel_alloc() - Allocate memory from multikernel pool + * @size: size to allocate + * + * Returns physical address of allocated memory, or 0 on failure + */ +phys_addr_t multikernel_alloc(size_t size) +{ + unsigned long addr; + + if (!multikernel_pool) + return 0; + + mutex_lock(&multikernel_mem_mutex); + addr = gen_pool_alloc(multikernel_pool, size); + mutex_unlock(&multikernel_mem_mutex); + + return (phys_addr_t)addr; +} + +/** + * multikernel_free() - Free memory back to multikernel pool + * @addr: physical address to free + * @size: size to free + */ +void multikernel_free(phys_addr_t addr, size_t size) +{ + if (!multikernel_pool || !addr) + return; + + mutex_lock(&multikernel_mem_mutex); + gen_pool_free(multikernel_pool, (unsigned long)addr, size); + mutex_unlock(&multikernel_mem_mutex); + + pr_debug("Multikernel freed %zu bytes at %pa\n", size, &addr); +} + +/** + * multikernel_get_pool_resource() - Get the multikernel pool resource + * + * Returns pointer to the multikernel pool resource for memory walking + */ +struct resource *multikernel_get_pool_resource(void) +{ + if (!multikernel_res.start) + return NULL; + + return &multikernel_res; +} + +/** + * multikernel_pool_available() - Check if multikernel pool is available + * + * Returns true if multikernel pool is configured and available + */ +bool multikernel_pool_available(void) +{ + return multikernel_pool != NULL; +} + +/** + * Per-instance memory pool management + * + * Each kernel instance gets its own gen_pool for fine-grained allocations + * (IPI data, small buffers, etc.) 
carved out from the main multikernel pool. + */ + +/** + * multikernel_create_instance_pool() - Create a memory pool for a kernel instance + * @instance_id: Unique identifier for the instance + * @pool_size: Total size of memory to allocate for this instance's pool + * @min_alloc_order: Minimum allocation order (at least PAGE_SHIFT) + * + * Allocates multiple chunks from the main multikernel pool to reach the target + * pool_size and creates a gen_pool for the instance to manage smaller allocations. + * + * Returns opaque handle to the instance pool, or NULL on failure + */ +void *multikernel_create_instance_pool(int instance_id, size_t pool_size, int min_alloc_order) +{ + struct gen_pool *instance_pool; + size_t remaining_size = pool_size; + size_t chunk_size; + phys_addr_t chunk_base; + int chunks_added = 0; + + if (!multikernel_pool_available()) { + pr_err("Multikernel main pool not available for instance %d\n", instance_id); + return NULL; + } + + if (min_alloc_order < PAGE_SHIFT) { + pr_err("Invalid min_alloc_order %d for instance %d (must be >= PAGE_SHIFT %d)\n", + min_alloc_order, instance_id, PAGE_SHIFT); + return NULL; + } + + instance_pool = gen_pool_create(min_alloc_order, -1); + if (!instance_pool) { + pr_err("Failed to create gen_pool for instance %d\n", instance_id); + return NULL; + } + + /* Allocate memory in chunks and add to the pool */ + while (remaining_size > 0) { + /* Try to allocate the remaining size, but be flexible */ + chunk_size = remaining_size; + chunk_base = multikernel_alloc(chunk_size); + + if (!chunk_base) { + /* If we can't get the full remaining size, try smaller chunks */ + if (chunk_size > (1024 * 1024)) { + /* Try 1MB chunks */ + chunk_size = 1024 * 1024; + chunk_base = multikernel_alloc(chunk_size); + } + + if (!chunk_base && chunk_size > (256 * 1024)) { + /* Try 256KB chunks */ + chunk_size = 256 * 1024; + chunk_base = multikernel_alloc(chunk_size); + } + + if (!chunk_base && chunk_size > (1 << min_alloc_order)) { + /* Try minimum allocation size */ + chunk_size = 1 << min_alloc_order; + chunk_base = multikernel_alloc(chunk_size); + } + + if (!chunk_base) { + pr_err("Failed to allocate chunk %d for instance %d (remaining: %zu bytes)\n", + chunks_added + 1, instance_id, remaining_size); + goto cleanup; + } + } + + /* Add the allocated chunk to the instance pool */ + if (gen_pool_add(instance_pool, chunk_base, chunk_size, -1)) { + pr_err("Failed to add chunk %d to instance pool %d\n", + chunks_added + 1, instance_id); + multikernel_free(chunk_base, chunk_size); + goto cleanup; + } + + chunks_added++; + remaining_size -= chunk_size; + + pr_debug("Added chunk %d to instance pool %d: base=0x%llx, size=%zu bytes (remaining: %zu)\n", + chunks_added, instance_id, (unsigned long long)chunk_base, + chunk_size, remaining_size); + } + + pr_info("Created instance pool %d: %d chunks, total size=%zu bytes\n", + instance_id, chunks_added, pool_size); + + return instance_pool; + +cleanup: + /* Free all chunks that were successfully added */ + multikernel_destroy_instance_pool(instance_pool); + return NULL; +} + +/** + * multikernel_destroy_instance_pool() - Destroy an instance memory pool + * @pool_handle: Handle returned by multikernel_create_instance_pool() + * + * Frees all memory associated with the instance pool back to the main pool + */ +void multikernel_destroy_instance_pool(void *pool_handle) +{ + struct gen_pool *instance_pool = (struct gen_pool *)pool_handle; + struct gen_pool_chunk *chunk; + + if (!instance_pool) + return; + + /* Free all chunks back to 
main pool */ + list_for_each_entry(chunk, &instance_pool->chunks, next_chunk) { + multikernel_free(chunk->start_addr, chunk->end_addr - chunk->start_addr + 1); + pr_debug("Freed instance pool chunk: 0x%lx-0x%lx\n", + chunk->start_addr, chunk->end_addr); + } + + gen_pool_destroy(instance_pool); +} + +/** + * multikernel_instance_alloc() - Allocate from an instance pool + * @pool_handle: Handle returned by multikernel_create_instance_pool() + * @size: Size to allocate + * @align: Alignment requirement (must be power of 2) + * + * Returns physical address of allocated memory, or 0 on failure + */ +phys_addr_t multikernel_instance_alloc(void *pool_handle, size_t size, size_t align) +{ + struct gen_pool *instance_pool = (struct gen_pool *)pool_handle; + unsigned long addr; + + if (!instance_pool) + return 0; + + if (align <= 1) { + addr = gen_pool_alloc(instance_pool, size); + } else { + /* Ensure alignment is at least the pool's minimum allocation order */ + size_t a = max_t(size_t, align, BIT(instance_pool->min_alloc_order)); + struct genpool_data_align data = { .align = a }; + addr = gen_pool_alloc_algo(instance_pool, size, gen_pool_first_fit_align, &data); + } + + return (phys_addr_t)addr; +} + +/** + * multikernel_instance_free() - Free memory back to instance pool + * @pool_handle: Handle returned by multikernel_create_instance_pool() + * @addr: Physical address to free + * @size: Size to free + */ +void multikernel_instance_free(void *pool_handle, phys_addr_t addr, size_t size) +{ + struct gen_pool *instance_pool = (struct gen_pool *)pool_handle; + + if (!instance_pool || !addr) + return; + + gen_pool_free(instance_pool, (unsigned long)addr, size); + pr_debug("Instance pool freed %zu bytes at 0x%llx\n", size, (unsigned long long)addr); +} + +/** + * multikernel_instance_pool_avail() - Get available space in instance pool + * @pool_handle: Handle returned by multikernel_create_instance_pool() + * + * Returns available bytes in the instance pool + */ +size_t multikernel_instance_pool_avail(void *pool_handle) +{ + struct gen_pool *instance_pool = (struct gen_pool *)pool_handle; + + if (!instance_pool) + return 0; + + return gen_pool_avail(instance_pool); +} + +static int __init mkkernel_pool_setup(char *str) +{ + char *cur = str; + unsigned long long size, start; + + if (!str) + return -EINVAL; + + size = memparse(cur, &cur); + if (size == 0) { + pr_err("mkkernel_pool: invalid size\n"); + return -EINVAL; + } + + /* Expect '@' separator, or end of string for dynamic allocation */ + if (*cur == '@') { + cur++; + /* Parse start address */ + start = memparse(cur, &cur); + if (start == 0) { + pr_err("mkkernel_pool: invalid start address\n"); + return -EINVAL; + } + } else if (*cur == '\0') { + /* No address specified, use dynamic allocation */ + start = 0; + } else { + pr_err("mkkernel_pool: expected '@' or end of string after size\n"); + return -EINVAL; + } + + /* Reserve the memory using the proper memblock reservation approach */ + phys_addr_t reserved_addr; + if (start != 0) { + /* Reserve at the user-specified address */ + pr_info("mkkernel_pool: trying to reserve at specific address %llx\n", start); + if (memblock_reserve(start, size)) { + pr_err("mkkernel_pool: failed to reserve at specified address %llx\n", start); + return -ENOMEM; + } + reserved_addr = start; + pr_info("mkkernel_pool: successfully reserved at requested address %llx\n", start); + } else { + /* Dynamic allocation */ + pr_info("mkkernel_pool: trying dynamic allocation\n"); + reserved_addr = memblock_phys_alloc(size, 
PAGE_SIZE); + if (!reserved_addr) { + pr_err("mkkernel_pool: failed to allocate %llu bytes\n", size); + return -ENOMEM; + } + pr_info("mkkernel_pool: dynamic allocation succeeded at %pa\n", &reserved_addr); + } + + multikernel_res.start = reserved_addr; + multikernel_res.end = reserved_addr + size - 1; + + pr_info("Multikernel pool: %pa-%pa (%lluMB) allocated\n", + &multikernel_res.start, &multikernel_res.end, (unsigned long long)size >> 20); + + return 0; +} +early_param("mkkernel_pool", mkkernel_pool_setup); + +static int __init multikernel_mem_init(void) +{ + if (multikernel_res.start) { + /* Create the generic pool */ + multikernel_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!multikernel_pool) { + pr_err("Failed to create multikernel memory pool\n"); + return -ENOMEM; + } + + /* Add the reserved memory to the pool */ + if (gen_pool_add(multikernel_pool, multikernel_res.start, + multikernel_res.end - multikernel_res.start + 1, -1)) { + pr_err("Failed to add memory to multikernel pool\n"); + gen_pool_destroy(multikernel_pool); + multikernel_pool = NULL; + return -ENOMEM; + } + + if (insert_resource(&iomem_resource, &multikernel_res)) { + pr_warn("mkkernel_pool: failed to register in /proc/iomem\n"); + } else { + pr_info("mkkernel_pool: successfully registered in /proc/iomem\n"); + } + + pr_info("Multikernel memory pool initialized: %pa-%pa\n", + &multikernel_res.start, &multikernel_res.end); + } else { + pr_info("No multikernel pool found - multikernel support disabled\n"); + } + + return 0; +} +core_initcall(multikernel_mem_init); -- 2.34.1 From: Cong Wang Replace static kexec_image and kexec_crash_image globals with a dynamic linked list infrastructure to support multiple kernel images. This change enables multikernel functionality while maintaining backward compatibility. Key changes: - Add list_head member to kimage structure for chaining - Implement thread-safe linked list management with global mutex - Update kexec load/unload logic to use list-based APIs for multikernel - Add helper functions for finding and managing multiple kimages - Preserve existing kexec_image/kexec_crash_image pointers for compatibility - Update architecture-specific crash handling to use new APIs The multikernel case now properly uses list-based management instead of overwriting compatibility pointers, allowing multiple multikernel images to coexist in the system. 
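For illustration, a caller such as multikernel_kexec() could eventually select a loaded image through the list API rather than the single kexec_image pointer; a hedged sketch using the helpers added below (mk_boot_first_loaded() is hypothetical):

	/* Sketch: boot the first loaded multikernel image on 'cpu'. */
	static int mk_boot_first_loaded(int cpu)
	{
		struct kimage *image = kimage_find_by_type(KEXEC_TYPE_MULTIKERNEL);

		if (!image)
			return -ENOENT;
		return multikernel_kick_ap(cpu, image->start);
	}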
Signed-off-by: Cong Wang --- arch/powerpc/kexec/crash.c | 8 ++-- arch/x86/kernel/crash.c | 4 +- include/linux/kexec.h | 13 ++++++ kernel/kexec.c | 60 ++++++++++++++++++++++++++-- kernel/kexec_core.c | 81 ++++++++++++++++++++++++++++++++++++++ kernel/kexec_file.c | 33 ++++++++++++++-- 6 files changed, 187 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index a325c1c02f96..af190fad4f22 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -477,13 +477,13 @@ static void update_crash_elfcorehdr(struct kimage *image, struct memory_notify * ptr = __va(mem); if (ptr) { /* Temporarily invalidate the crash image while it is replaced */ - xchg(&kexec_crash_image, NULL); + kimage_update_compat_pointers(NULL, KEXEC_TYPE_CRASH); /* Replace the old elfcorehdr with newly prepared elfcorehdr */ memcpy((void *)ptr, elfbuf, elfsz); /* The crash image is now valid once again */ - xchg(&kexec_crash_image, image); + kimage_update_compat_pointers(image, KEXEC_TYPE_CRASH); } out: kvfree(cmem); @@ -537,14 +537,14 @@ static void update_crash_fdt(struct kimage *image) fdt = __va((void *)image->segment[fdt_index].mem); /* Temporarily invalidate the crash image while it is replaced */ - xchg(&kexec_crash_image, NULL); + kimage_update_compat_pointers(NULL, KEXEC_TYPE_CRASH); /* update FDT to reflect changes in CPU resources */ if (update_cpus_node(fdt)) pr_err("Failed to update crash FDT"); /* The crash image is now valid once again */ - xchg(&kexec_crash_image, image); + kimage_update_compat_pointers(image, KEXEC_TYPE_CRASH); } int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 335fd2ee9766..a81f91d4352d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -559,9 +559,9 @@ void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) * Temporarily invalidate the crash image while the * elfcorehdr is updated. 
*/ - xchg(&kexec_crash_image, NULL); + kimage_update_compat_pointers(NULL, KEXEC_TYPE_CRASH); memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz); - xchg(&kexec_crash_image, image); + kimage_update_compat_pointers(image, KEXEC_TYPE_CRASH); kunmap_local(old_elfcorehdr); pr_debug("updated elfcorehdr\n"); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index edf64bc98ed5..69877db5360b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -431,6 +431,9 @@ struct kimage { /* dm crypt keys buffer */ unsigned long dm_crypt_keys_addr; unsigned long dm_crypt_keys_sz; + + /* For multikernel support: linked list node */ + struct list_head list; }; /* kexec interface functions */ @@ -534,6 +537,16 @@ extern bool kexec_file_dbg_print; extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size); extern void kimage_unmap_segment(void *buffer); + +/* Multikernel support functions */ +extern struct kimage *kimage_find_by_type(int type); +extern void kimage_add_to_list(struct kimage *image); +extern void kimage_remove_from_list(struct kimage *image); +extern void kimage_update_compat_pointers(struct kimage *new_image, int type); +extern int kimage_get_all_by_type(int type, struct kimage **images, int max_count); +extern void kimage_list_lock(void); +extern void kimage_list_unlock(void); +extern void kimage_list_multikernel_images(void); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/kernel/kexec.c b/kernel/kexec.c index 28008e3d462e..74b8df4670e0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -114,7 +114,31 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (nr_segments == 0) { /* Uninstall image */ - kimage_free(xchg(dest_image, NULL)); + if (flags & KEXEC_ON_CRASH) { + struct kimage *old_image = xchg(&kexec_crash_image, NULL); + if (old_image) { + kimage_remove_from_list(old_image); + kimage_free(old_image); + } + } else if (flags & KEXEC_MULTIKERNEL) { + /* For multikernel unload, we need to specify which image to remove */ + /* For now, remove all multikernel images - this could be enhanced */ + struct kimage *images[10]; + int count, i; + + count = kimage_get_all_by_type(KEXEC_TYPE_MULTIKERNEL, images, 10); + for (i = 0; i < count; i++) { + kimage_remove_from_list(images[i]); + kimage_free(images[i]); + } + pr_info("Unloaded %d multikernel images\n", count); + } else { + struct kimage *old_image = xchg(&kexec_image, NULL); + if (old_image) { + kimage_remove_from_list(old_image); + kimage_free(old_image); + } + } ret = 0; goto out_unlock; } @@ -124,7 +148,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, * crashes. Free any current crash dump kernel before * we corrupt it. 
*/ - kimage_free(xchg(&kexec_crash_image, NULL)); + struct kimage *old_crash_image = xchg(&kexec_crash_image, NULL); + if (old_crash_image) { + kimage_remove_from_list(old_crash_image); + kimage_free(old_crash_image); + } } ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); @@ -164,7 +192,33 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; /* Install the new kernel and uninstall the old */ - image = xchg(dest_image, image); + if (flags & KEXEC_ON_CRASH) { + struct kimage *old_image = xchg(&kexec_crash_image, image); + if (old_image) { + kimage_remove_from_list(old_image); + kimage_free(old_image); + } + if (image) { + kimage_add_to_list(image); + kimage_update_compat_pointers(image, KEXEC_TYPE_CRASH); + } + image = NULL; /* Don't free the new image */ + } else if (flags & KEXEC_MULTIKERNEL) { + if (image) + kimage_add_to_list(image); + image = NULL; /* Don't free the new image */ + } else { + struct kimage *old_image = xchg(&kexec_image, image); + if (old_image) { + kimage_remove_from_list(old_image); + kimage_free(old_image); + } + if (image) { + kimage_add_to_list(image); + kimage_update_compat_pointers(image, KEXEC_TYPE_DEFAULT); + } + image = NULL; /* Don't free the new image */ + } out: #ifdef CONFIG_CRASH_DUMP diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 7d89d00e2cde..449096060fe8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -56,6 +56,10 @@ bool kexec_in_progress = false; bool kexec_file_dbg_print; +/* Linked list of dynamically allocated kimages */ +static LIST_HEAD(kexec_image_list); +static DEFINE_MUTEX(kexec_image_mutex); + /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -248,6 +252,9 @@ struct kimage *do_kimage_alloc_init(void) /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); + /* Initialize the list node for multikernel support */ + INIT_LIST_HEAD(&image->list); + #ifdef CONFIG_CRASH_HOTPLUG image->hp_action = KEXEC_CRASH_HP_NONE; image->elfcorehdr_index = -1; @@ -580,6 +587,13 @@ void kimage_free(struct kimage *image) if (!image) return; + /* Remove from linked list and update compatibility pointers */ + kimage_remove_from_list(image); + if (image == kexec_image) + kimage_update_compat_pointers(NULL, KEXEC_TYPE_DEFAULT); + else if (image == kexec_crash_image) + kimage_update_compat_pointers(NULL, KEXEC_TYPE_CRASH); + #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); @@ -1096,6 +1110,72 @@ void kimage_unmap_segment(void *segment_buffer) vunmap(segment_buffer); } +void kimage_add_to_list(struct kimage *image) +{ + mutex_lock(&kexec_image_mutex); + list_add_tail(&image->list, &kexec_image_list); + mutex_unlock(&kexec_image_mutex); +} + +void kimage_remove_from_list(struct kimage *image) +{ + mutex_lock(&kexec_image_mutex); + if (!list_empty(&image->list)) + list_del_init(&image->list); + mutex_unlock(&kexec_image_mutex); +} + +struct kimage *kimage_find_by_type(int type) +{ + struct kimage *image; + + mutex_lock(&kexec_image_mutex); + list_for_each_entry(image, &kexec_image_list, list) { + if (image->type == type) { + mutex_unlock(&kexec_image_mutex); + return image; + } + } + mutex_unlock(&kexec_image_mutex); + return NULL; +} + +void kimage_update_compat_pointers(struct kimage *new_image, int type) +{ + mutex_lock(&kexec_image_mutex); + if (type == KEXEC_TYPE_CRASH) { + kexec_crash_image = new_image; + } else if (type == 
KEXEC_TYPE_DEFAULT) { + kexec_image = new_image; + } + mutex_unlock(&kexec_image_mutex); +} + +int kimage_get_all_by_type(int type, struct kimage **images, int max_count) +{ + struct kimage *image; + int count = 0; + + mutex_lock(&kexec_image_mutex); + list_for_each_entry(image, &kexec_image_list, list) { + if (image->type == type && count < max_count) { + images[count++] = image; + } + } + mutex_unlock(&kexec_image_mutex); + return count; +} + +void kimage_list_lock(void) +{ + mutex_lock(&kexec_image_mutex); +} + +void kimage_list_unlock(void) +{ + mutex_unlock(&kexec_image_mutex); +} + struct kexec_load_limit { /* Mutex protects the limit count. */ struct mutex mutex; @@ -1112,6 +1192,7 @@ static struct kexec_load_limit load_limit_panic = { .limit = -1, }; +/* Compatibility: maintain pointers to current default and crash images */ struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index eb62a9794242..2d9d5626c8da 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -400,8 +400,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, * same memory where old crash kernel might be loaded. Free any * current crash dump kernel before we corrupt it. */ - if (flags & KEXEC_FILE_ON_CRASH) - kimage_free(xchg(&kexec_crash_image, NULL)); + if (flags & KEXEC_FILE_ON_CRASH) { + struct kimage *old_crash_image = xchg(&kexec_crash_image, NULL); + if (old_crash_image) { + kimage_remove_from_list(old_crash_image); + kimage_free(old_crash_image); + } + } ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -457,7 +462,29 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, */ kimage_file_post_load_cleanup(image); exchange: - image = xchg(dest_image, image); + if (image_type == KEXEC_TYPE_CRASH) { + struct kimage *old_image = xchg(&kexec_crash_image, image); + if (old_image) { + kimage_remove_from_list(old_image); + kimage_free(old_image); + } + if (image) { + kimage_add_to_list(image); + kimage_update_compat_pointers(image, KEXEC_TYPE_CRASH); + } + image = NULL; /* Don't free the new image */ + } else { + struct kimage *old_image = xchg(&kexec_image, image); + if (old_image) { + kimage_remove_from_list(old_image); + kimage_free(old_image); + } + if (image) { + kimage_add_to_list(image); + kimage_update_compat_pointers(image, KEXEC_TYPE_DEFAULT); + } + image = NULL; /* Don't free the new image */ + } out: #ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) -- 2.34.1 From: Cong Wang This commit introduces: * Device tree parsing infrastructure (kernel/multikernel/dts.c) that supports parsing multikernel-specific device tree blobs with resource specifications including memory size requirements and CPU assignments. Provides validation, configuration management, and resource availability checking with extensible property parsing for future enhancements. * Kernfs-based filesystem interface (kernel/multikernel/kernfs.c) that creates a dedicated multikernel filesystem mountable at /sys/fs/multikernel/ with full instance lifecycle management through device tree uploads rather than manual directory creation. * Instance management infrastructure with comprehensive state tracking (empty, ready, loading, active, failed), reference counting, resource reservation, and integration with the kexec multikernel system for seamless kernel loading operations. 
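For illustration, a minimal device tree source for such a DTB might look like the following sketch (the instance name "spike" and all resource values are hypothetical; the /instances layout with an "id" property and a resources subnode carrying memory-bytes/cpus follows the format described below):

	/dts-v1/;

	/ {
		compatible = "multikernel-v1";

		instances {
			spike {
				id = <1>;

				resources {
					/* single page-aligned u32: 256 MB */
					memory-bytes = <0x10000000>;
					/* one u32 per assigned CPU ID */
					cpus = <2 3>;
				};
			};
		};
	};

Compiled with dtc and written to the /device_tree file, such a blob creates and provisions one instance directory per /instances node.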
Key features of the interface: - Device tree upload via /device_tree file that automatically parses multikernel DTBs and creates instance directories with proper resource allocation and CPU assignment including CPU hotplug operations. - Per-instance attribute files (id, status, device_tree_source) that provide read-only access to instance information and configuration with human-readable device tree source reconstruction. - Resource management integration that reserves memory from multikernel pools, creates proper resource hierarchy entries, and handles CPU offline operations for dedicated CPU assignment. - State synchronization with kexec operations through mk_instance_find(), mk_instance_set_kexec_active(), and mk_instance_set_kexec_loading() functions for proper lifecycle tracking. The device tree format supports /instances/ nodes with resources subnodes containing memory-bytes and cpus properties, providing a structured approach to multikernel configuration with proper validation and resource conflict detection. See also the user-space kerf project for more details about device-tree specification: https://github.com/multikernel/kerf Signed-off-by: Cong Wang --- include/linux/multikernel.h | 328 +++++++++++++++ kernel/multikernel/Kconfig | 1 + kernel/multikernel/Makefile | 5 +- kernel/multikernel/core.c | 404 ++++++++++++++++++ kernel/multikernel/dts.c | 466 ++++++++++++++++++++ kernel/multikernel/internal.h | 4 + kernel/multikernel/kernfs.c | 772 ++++++++++++++++++++++++++++++++++ 7 files changed, 1979 insertions(+), 1 deletion(-) create mode 100644 kernel/multikernel/dts.c create mode 100644 kernel/multikernel/internal.h create mode 100644 kernel/multikernel/kernfs.c diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h index 51c989139a75..75cbb316d565 100644 --- a/include/linux/multikernel.h +++ b/include/linux/multikernel.h @@ -5,6 +5,15 @@ #ifndef _LINUX_MULTIKERNEL_H #define _LINUX_MULTIKERNEL_H +#include +#include +#include +#include +#include +#include +#include +#include + struct resource; extern phys_addr_t multikernel_alloc(size_t size); @@ -19,4 +28,323 @@ extern phys_addr_t multikernel_instance_alloc(void *pool_handle, size_t size, si extern void multikernel_instance_free(void *pool_handle, phys_addr_t addr, size_t size); extern size_t multikernel_instance_pool_avail(void *pool_handle); +/** + * Multikernel Instance States + */ +enum mk_instance_state { + MK_STATE_EMPTY = 0, /* Instance directory exists but no DTB */ + MK_STATE_READY, /* DTB loaded, resources reserved */ + MK_STATE_LOADING, /* Kernel being loaded */ + MK_STATE_ACTIVE, /* Kernel running */ + MK_STATE_FAILED, /* Error occurred */ +}; + +/** + * Memory region wrapper + * + * This wraps a struct resource with gen_pool_chunk for memory pool management. + * Used by instance management to track both resource hierarchy and pool chunks. + */ +struct mk_memory_region { + struct resource res; /* The actual resource */ + struct gen_pool_chunk *chunk; /* Associated gen_pool chunk */ + struct list_head list; /* List entry for management */ +}; + +/** + * Complete multikernel device tree configuration + * + * This structure handles memory size requirements and CPU assignment + * parsed from device tree blobs. 
+ */
+struct mk_dt_config {
+	/* Version for compatibility checking */
+	u32 version;
+
+	/* Memory requirements */
+	size_t memory_size;		/* Total memory size required */
+
+	/* CPU resources */
+	cpumask_var_t cpus;		/* Assigned CPU mask */
+	bool cpus_valid;		/* Whether CPU assignment is valid */
+
+	/* Extensibility: Reserved fields for future use */
+	u32 reserved[12];		/* Increased due to removed fields */
+
+	/* Raw device tree data */
+	void *dtb_data;
+	size_t dtb_size;
+};
+
+/**
+ * Multikernel Instance Structure
+ *
+ * Each instance represents a potential or active multikernel with
+ * its own resource allocation and state management.
+ */
+struct mk_instance {
+	int id;				/* Kernel-assigned instance ID */
+	char *name;			/* User-provided instance name */
+	enum mk_instance_state state;	/* Current state */
+
+	/* Resource management - list of reserved memory regions */
+	struct list_head memory_regions; /* List of struct mk_memory_region */
+	int region_count;		/* Number of memory regions */
+	/* Memory pool for this instance */
+	void *instance_pool;		/* Handle for instance-specific memory pool */
+	size_t pool_size;		/* Size of the instance pool */
+
+	/* CPU resources */
+	cpumask_var_t cpus;		/* Assigned CPU mask */
+	bool cpus_valid;		/* Whether CPU assignment is valid */
+
+	/* Device tree information */
+	void *dtb_data;			/* Device tree blob data */
+	size_t dtb_size;		/* Size of DTB */
+
+	/* Sysfs representation */
+	struct kernfs_node *kn;		/* Kernfs node for this instance */
+
+	/* List management */
+	struct list_head list;		/* Link to global instance list */
+
+	/* Reference counting */
+	struct kref refcount;		/* Reference count for cleanup */
+};
+
+/**
+ * Device Tree Parsing Functions
+ */
+
+/**
+ * mk_dt_parse() - Parse multikernel device tree blob
+ * @dtb_data: Device tree blob data
+ * @dtb_size: Size of DTB data
+ * @config: Output configuration structure
+ *
+ * Parses a device tree blob and extracts the multikernel resource
+ * properties from the instance's resources node:
+ * - memory-bytes: a single page-aligned u32 byte count
+ * - cpus: one u32 per assigned CPU ID
+ *
+ * The reserved memory later becomes struct resource entries that are
+ * inserted as children of the main multikernel_res.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int mk_dt_parse(const void *dtb_data, size_t dtb_size,
+		struct mk_dt_config *config);
+
+/**
+ * mk_dt_validate() - Validate parsed device tree configuration
+ * @config: Configuration to validate
+ *
+ * Validates that the parsed memory size and CPU assignment are
+ * reasonable and compatible with the current system state. Checks for:
+ * - Sanity of the requested memory size
+ * - Fit within the multikernel pool
+ * - CPU availability on this system
+ *
+ * Returns 0 if valid, negative error code on validation failure.
+ */
+int mk_dt_validate(const struct mk_dt_config *config);
+
+/**
+ * mk_dt_config_init() - Initialize a device tree configuration
+ * @config: Configuration structure to initialize
+ *
+ * Initializes all fields to safe defaults.
+ */
+void mk_dt_config_init(struct mk_dt_config *config);
+
+/**
+ * mk_dt_config_free() - Free device tree configuration resources
+ * @config: Configuration to free
+ *
+ * Frees any dynamically allocated resources in the configuration.
+ */
+void mk_dt_config_free(struct mk_dt_config *config);
+
+/**
+ * mk_dt_resources_available() - Check if memory and CPU resources are available
+ * @config: Configuration with resources to check
+ *
+ * Checks if the specified memory size is available in the
+ * multikernel memory pool and all CPUs are possible on the system.
+ *
+ * Returns true if all resources are available, false otherwise.
+ */
+bool mk_dt_resources_available(const struct mk_dt_config *config);
+
+/**
+ * mk_dt_get_property_size() - Get size of a specific property
+ * @dtb_data: Device tree blob
+ * @dtb_size: Size of DTB
+ * @property: Property name (e.g., "linux,multikernel-memory")
+ *
+ * Helper function to determine the size of a property before parsing.
+ * Useful for validation and memory allocation.
+ *
+ * Returns property size in bytes, or -ENOENT if not found.
+ */
+int mk_dt_get_property_size(const void *dtb_data, size_t dtb_size,
+			    const char *property);
+
+/**
+ * mk_dt_print_config() - Print configuration for debugging
+ * @config: Configuration to print
+ *
+ * Prints the parsed configuration in a human-readable format
+ * for debugging purposes.
+ */
+void mk_dt_print_config(const struct mk_dt_config *config);
+
+/**
+ * Sysfs Interface Functions
+ */
+
+/**
+ * mk_kernfs_init() - Initialize multikernel kernfs interface
+ *
+ * Creates the multikernel kernfs root, mounted at /sys/fs/multikernel/,
+ * and sets up the kernfs infrastructure for multikernel instances.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int mk_kernfs_init(void);
+
+/**
+ * mk_kernfs_cleanup() - Cleanup multikernel kernfs interface
+ *
+ * Removes all kernfs entries and cleans up resources.
+ */
+void mk_kernfs_cleanup(void);
+
+/**
+ * mk_instance_find_by_name() - Find an existing instance by name
+ * @name: Instance name to find
+ *
+ * Returns pointer to mk_instance if found, NULL otherwise.
+ * Caller must hold mk_instance_mutex.
+ */
+struct mk_instance *mk_instance_find_by_name(const char *name);
+
+/**
+ * mk_instance_get() - Increment reference count
+ * @instance: Instance to reference
+ *
+ * Returns the instance pointer for convenience.
+ */
+struct mk_instance *mk_instance_get(struct mk_instance *instance);
+
+/**
+ * mk_instance_put() - Decrement reference count
+ * @instance: Instance to dereference
+ *
+ * May free the instance if the reference count reaches zero.
+ */
+void mk_instance_put(struct mk_instance *instance);
+
+/**
+ * mk_instance_set_state() - Update instance state
+ * @instance: Instance to update
+ * @state: New state
+ *
+ * Updates the instance state and notifies sysfs.
+ */
+void mk_instance_set_state(struct mk_instance *instance,
+			   enum mk_instance_state state);
+
+/**
+ * mk_instance_reserve_resources() - Reserve CPU and memory resources for instance
+ * @instance: Instance to reserve resources for
+ * @config: Device tree configuration with memory size and CPU assignment
+ *
+ * Allocates the specified memory size from the multikernel pool, creates
+ * memory regions, and copies the CPU assignment.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int mk_instance_reserve_resources(struct mk_instance *instance,
+				  const struct mk_dt_config *config);
+
+/**
+ * mk_instance_free_memory() - Free all reserved memory regions
+ * @instance: Instance to free memory for
+ *
+ * Returns all reserved memory regions back to the multikernel pool
+ * and removes them from the resource hierarchy.
+ */ +void mk_instance_free_memory(struct mk_instance *instance); + +/** + * String conversion helpers + */ +const char *mk_state_to_string(enum mk_instance_state state); +enum mk_instance_state mk_string_to_state(const char *str); + +/** + * Kexec Integration Functions + * + * These functions bridge the gap between the sysfs instance management + * and the kexec multikernel system. + */ + +/** + * mk_instance_find() - Find instance by a multikernel instance ID + * @mk_id: Multikernel instance ID + * + * Returns pointer to mk_instance if found, NULL otherwise. + */ +struct mk_instance *mk_instance_find(int mk_id); + +/** + * mk_instance_set_kexec_active() - Mark instance as active for kexec + * @mk_id: Multikernel ID from kexec system + * + * Returns 0 on success, negative error code on failure. + */ +int mk_instance_set_kexec_active(int mk_id); + +/** + * mk_instance_set_kexec_loading() - Mark instance as loading for kexec + * @mk_id: Multikernel ID from kexec system + * + * Returns 0 on success, negative error code on failure. + */ +int mk_instance_set_kexec_loading(int mk_id); + +/** + * Version and Compatibility + */ +#define MK_DT_CONFIG_VERSION_1 1 +#define MK_DT_CONFIG_CURRENT MK_DT_CONFIG_VERSION_1 +#define MK_FDT_COMPATIBLE "multikernel-v1" + +/** + * Property Names + */ +#define MK_DT_RESOURCE_MEMORY "memory-bytes" +#define MK_DT_RESOURCE_CPUS "cpus" + +static const char * const mk_resource_properties[] = { + MK_DT_RESOURCE_MEMORY, + MK_DT_RESOURCE_CPUS, + NULL /* Sentinel */ +}; + +static inline bool mk_is_resource_property(const char *prop_name) +{ + const char * const *prop; + + if (!prop_name) + return false; + + for (prop = mk_resource_properties; *prop; prop++) { + if (strcmp(prop_name, *prop) == 0) + return true; + } + return false; +} + #endif /* _LINUX_MULTIKERNEL_H */ diff --git a/kernel/multikernel/Kconfig b/kernel/multikernel/Kconfig index 0e61fd2e505a..a9582a4d0c54 100644 --- a/kernel/multikernel/Kconfig +++ b/kernel/multikernel/Kconfig @@ -6,6 +6,7 @@ config MULTIKERNEL bool "Multikernel support" depends on KEXEC_CORE + select LIBFDT help Enable multikernel support, which allows running multiple kernel instances simultaneously with resource isolation and inter-kernel diff --git a/kernel/multikernel/Makefile b/kernel/multikernel/Makefile index 0dad7f2267f9..d004c577f13d 100644 --- a/kernel/multikernel/Makefile +++ b/kernel/multikernel/Makefile @@ -3,4 +3,7 @@ # Makefile for multikernel support # -obj-y += core.o mem.o +obj-y += core.o mem.o kernfs.o dts.o + +# Add libfdt include path for device tree parsing +CFLAGS_dts.o = -I $(srctree)/scripts/dtc/libfdt diff --git a/kernel/multikernel/core.c b/kernel/multikernel/core.c index 218424d59cc3..52bf8e38206a 100644 --- a/kernel/multikernel/core.c +++ b/kernel/multikernel/core.c @@ -5,10 +5,414 @@ #include #include #include +#include +#include #include +#include "internal.h" + +/** + * Instance reference counting + */ +static void mk_instance_release(struct kref *kref) +{ + struct mk_instance *instance = container_of(kref, struct mk_instance, refcount); + + pr_debug("Releasing multikernel instance %d (%s)\n", instance->id, instance->name); + + mk_instance_free_memory(instance); + + /* Free CPU mask */ + if (instance->cpus_valid) { + free_cpumask_var(instance->cpus); + instance->cpus_valid = false; + } + + kfree(instance->dtb_data); + kfree(instance->name); + kfree(instance); +} + +struct mk_instance *mk_instance_get(struct mk_instance *instance) +{ + if (instance) + kref_get(&instance->refcount); + return instance; +} + 
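+/*
+ * Usage sketch (illustrative): lookups such as mk_instance_find() below
+ * return a referenced instance; callers pair them with mk_instance_put():
+ *
+ *	struct mk_instance *mki = mk_instance_find(mk_id);
+ *
+ *	if (mki) {
+ *		mk_instance_set_state(mki, MK_STATE_LOADING);
+ *		mk_instance_put(mki);
+ *	}
+ */
+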
+void mk_instance_put(struct mk_instance *instance) +{ + if (instance) + kref_put(&instance->refcount, mk_instance_release); +} + +/** + * Instance state management + */ +void mk_instance_set_state(struct mk_instance *instance, + enum mk_instance_state state) +{ + enum mk_instance_state old_state = instance->state; + + if (old_state == state) + return; + + instance->state = state; + pr_debug("Instance %d (%s) state: %s -> %s\n", + instance->id, instance->name, + mk_state_to_string(old_state), + mk_state_to_string(state)); + + /* TODO: Notify status file of state change + * We should store a reference to the status file's kernfs node + * and call kernfs_notify() on that specific file, not the directory. + */ +} + +struct mk_instance *mk_instance_find_by_name(const char *name) +{ + struct mk_instance *instance; + + lockdep_assert_held(&mk_instance_mutex); + + if (!name) + return NULL; + + list_for_each_entry(instance, &mk_instance_list, list) { + if (instance->name && strcmp(instance->name, name) == 0) + return instance; + } + + return NULL; +} + +struct mk_instance *mk_instance_find(int mk_id) +{ + struct mk_instance *instance; + + mutex_lock(&mk_instance_mutex); + instance = idr_find(&mk_instance_idr, mk_id); + if (instance) + mk_instance_get(instance); + mutex_unlock(&mk_instance_mutex); + + return instance; +} + +int mk_instance_set_kexec_active(int mk_id) +{ + struct mk_instance *instance; + + instance = mk_instance_find(mk_id); + if (!instance) { + pr_err("No sysfs instance found for multikernel ID %d\n", mk_id); + return -ENOENT; + } + + mk_instance_set_state(instance, MK_STATE_ACTIVE); + mk_instance_put(instance); + pr_info("Multikernel instance %d is now active\n", mk_id); + + return 0; +} + +int mk_instance_set_kexec_loading(int mk_id) +{ + struct mk_instance *instance; + + instance = mk_instance_find(mk_id); + if (!instance) { + pr_err("No sysfs instance found for multikernel ID %d\n", mk_id); + return -ENOENT; + } + + mk_instance_set_state(instance, MK_STATE_LOADING); + mk_instance_put(instance); + pr_info("Multikernel instance %d is now loading\n", mk_id); + + return 0; +} + + +/** + * CPU management functions for instances + */ + +static int mk_instance_offline_cpus(struct mk_instance *instance) +{ + int cpu, ret = 0, failed_count = 0; + + pr_info("Bringing CPUs offline for multikernel instance %d (%s): %*pbl\n", + instance->id, instance->name, cpumask_pr_args(instance->cpus)); + + for_each_cpu(cpu, instance->cpus) { + if (!cpu_online(cpu)) { + pr_debug("CPU %d already offline for instance %d\n", cpu, instance->id); + continue; + } + + pr_info("Taking CPU %d offline for multikernel instance %d\n", cpu, instance->id); + + ret = remove_cpu(cpu); + if (ret) { + pr_err("Failed to take CPU %d offline for instance %d: %d\n", + cpu, instance->id, ret); + failed_count++; + } else { + pr_info("Successfully took CPU %d offline for instance %d\n", + cpu, instance->id); + } + } + + if (failed_count > 0) { + pr_warn("Failed to take %d CPUs offline for instance %d (%s)\n", + failed_count, instance->id, instance->name); + return -EBUSY; + } + + pr_info("Successfully took all assigned CPUs offline for instance %d (%s)\n", + instance->id, instance->name); + return 0; +} + +/** + * mk_instance_reserve_cpus() - Assign CPU resources to an instance + * @instance: Instance to assign CPU resources to + * @config: Device tree configuration with CPU assignment + * + * Copies CPU assignment from config to instance. This is the actual + * "reservation" function that assigns CPUs to the instance. 
+ * + * Returns 0 on success, negative error code on failure. + */ +static int mk_instance_reserve_cpus(struct mk_instance *instance, + const struct mk_dt_config *config) +{ + int ret; + + if (config->cpus_valid && instance->cpus_valid) { + cpumask_copy(instance->cpus, config->cpus); + + pr_info("CPU assignment for instance %d (%s): %*pbl (%d CPUs)\n", + instance->id, instance->name, + cpumask_pr_args(instance->cpus), cpumask_weight(instance->cpus)); + + ret = mk_instance_offline_cpus(instance); + if (ret) { + pr_warn("Failed to bring some CPUs offline for instance %d (%s): %d\n", + instance->id, instance->name, ret); + return ret; + } + } else { + pr_warn("Cannot reserve CPU resources to instance %d (%s) - instance CPU mask not available\n", + instance->id, instance->name); + return -EINVAL; + } + + return 0; +} + +/** + * Memory management functions for instances + */ + +/** + * mk_instance_reserve_memory() - Reserve memory regions for an instance + * @instance: Instance to reserve memory for + * @config: Device tree configuration with memory size + * + * Creates an instance pool from the specified memory size and creates + * memory regions from the pool chunks for resource hierarchy management. + * + * Returns 0 on success, negative error code on failure. + */ +static int mk_instance_reserve_memory(struct mk_instance *instance, + const struct mk_dt_config *config) +{ + struct gen_pool *pool; + struct gen_pool_chunk *chunk; + struct mk_memory_region *region; + int ret = 0; + int region_num = 0; + + /* Handle case where no memory size is specified */ + if (config->memory_size == 0) { + pr_info("No memory size specified for instance %d (%s)\n", + instance->id, instance->name); + return 0; + } + + /* Create instance memory pool */ + instance->instance_pool = multikernel_create_instance_pool(instance->id, + config->memory_size, + PAGE_SHIFT); + if (!instance->instance_pool) { + pr_err("Failed to create instance pool for instance %d (%s)\n", + instance->id, instance->name); + return -ENOMEM; + } + + instance->pool_size = config->memory_size; + pool = (struct gen_pool *)instance->instance_pool; + + /* Create memory regions from pool chunks for resource hierarchy */ + list_for_each_entry(chunk, &pool->chunks, next_chunk) { + resource_size_t size = chunk->end_addr - chunk->start_addr + 1; + + /* Allocate a new region structure for the instance */ + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + ret = -ENOMEM; + goto cleanup; + } + + /* Set up the resource structure from chunk */ + region->res.start = chunk->start_addr; + region->res.end = chunk->end_addr; + region->res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + region->res.name = kasprintf(GFP_KERNEL, "mk-instance-%d-%s-region-%d", + instance->id, instance->name, region_num); + if (!region->res.name) { + kfree(region); + ret = -ENOMEM; + goto cleanup; + } + + /* Link region to its chunk */ + region->chunk = chunk; + + /* Insert as child of multikernel_res */ + ret = insert_resource(&multikernel_res, ®ion->res); + if (ret) { + pr_err("Failed to insert memory region as child resource: %d\n", ret); + kfree(region->res.name); + kfree(region); + goto cleanup; + } + + /* Add to instance's memory region list */ + INIT_LIST_HEAD(®ion->list); + list_add_tail(®ion->list, &instance->memory_regions); + instance->region_count++; + region_num++; + + pr_debug("Created memory region for instance %d (%s): 0x%llx-0x%llx (%llu bytes)\n", + instance->id, instance->name, + (unsigned long long)region->res.start, + (unsigned long 
long)region->res.end, + (unsigned long long)size); + } + + pr_info("Successfully created %d memory regions from pool for instance %d (%s), total %zu bytes\n", + instance->region_count, instance->id, instance->name, config->memory_size); + return 0; + +cleanup: + /* Clean up any regions we managed to allocate */ + mk_instance_free_memory(instance); + + if (instance->instance_pool) { + multikernel_destroy_instance_pool(instance->instance_pool); + instance->instance_pool = NULL; + instance->pool_size = 0; + } + return ret; +} + +/** + * mk_instance_free_memory() - Free all reserved memory regions + * @instance: Instance to free memory for + * + * Returns all reserved memory regions back to the multikernel pool + * and removes them from the resource hierarchy. + */ +void mk_instance_free_memory(struct mk_instance *instance) +{ + struct mk_memory_region *region, *tmp; + + if (!instance) + return; + + list_for_each_entry_safe(region, tmp, &instance->memory_regions, list) { + pr_debug("Freeing memory region for instance %d (%s): 0x%llx-0x%llx\n", + instance->id, instance->name, + (unsigned long long)region->res.start, + (unsigned long long)region->res.end); + + /* Remove from instance list */ + list_del(®ion->list); + + /* Remove from resource hierarchy */ + remove_resource(®ion->res); + + /* Free the resource name and region structure */ + kfree(region->res.name); + kfree(region); + } + + instance->region_count = 0; + if (instance->instance_pool) { + multikernel_destroy_instance_pool(instance->instance_pool); + instance->instance_pool = NULL; + instance->pool_size = 0; + } + + pr_debug("Freed all memory regions and pool for instance %d (%s)\n", + instance->id, instance->name); +} + +/** + * mk_instance_reserve_resources() - Reserve memory and CPU resources for an instance + * @instance: Instance to reserve resources for + * @config: Device tree configuration with memory regions and CPU assignment + * + * Reserves all memory regions specified in the device tree configuration, + * makes them children of the main multikernel_res, and copies CPU assignment. + * + * Returns 0 on success, negative error code on failure. 
+ */ +int mk_instance_reserve_resources(struct mk_instance *instance, + const struct mk_dt_config *config) +{ + int ret; + + if (!config || !instance) { + pr_err("Invalid parameters to mk_instance_reserve_resources\n"); + return -EINVAL; + } + + /* Free any existing memory regions first */ + mk_instance_free_memory(instance); + + /* Reserve memory regions */ + ret = mk_instance_reserve_memory(instance, config); + if (ret) { + pr_err("Failed to reserve memory regions for instance %d (%s): %d\n", + instance->id, instance->name, ret); + return ret; + } + + /* Reserve CPU resources */ + ret = mk_instance_reserve_cpus(instance, config); + if (ret) { + pr_err("Failed to reserve CPU resources for instance %d (%s): %d\n", + instance->id, instance->name, ret); + /* Don't fail the whole operation for CPU reservation failure */ + pr_warn("Continuing without CPU assignment\n"); + } + + return 0; +} static int __init multikernel_init(void) { + int ret; + + ret = mk_kernfs_init(); + if (ret < 0) { + pr_err("Failed to initialize multikernel sysfs interface: %d\n", ret); + return ret; + } + pr_info("Multikernel support initialized\n"); return 0; } diff --git a/kernel/multikernel/dts.c b/kernel/multikernel/dts.c new file mode 100644 index 000000000000..9cd7010d3fd8 --- /dev/null +++ b/kernel/multikernel/dts.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Multikernel Technologies, Inc. All rights reserved + * + * Multikernel device tree support + * + * Provides device tree parsing and validation for multikernel instances. + * Designed to be extensible for future enhancements like CPU affinity, + * I/O resource allocation, NUMA topology, etc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Configuration initialization and cleanup + */ +void mk_dt_config_init(struct mk_dt_config *config) +{ + memset(config, 0, sizeof(*config)); + config->version = MK_DT_CONFIG_CURRENT; + config->memory_size = 0; + + /* Initialize CPU mask */ + if (!alloc_cpumask_var(&config->cpus, GFP_KERNEL)) { + pr_warn("Failed to allocate CPU mask, CPU assignment disabled\n"); + config->cpus_valid = false; + } else { + cpumask_clear(config->cpus); + config->cpus_valid = true; + } +} + +void mk_dt_config_free(struct mk_dt_config *config) +{ + if (!config) + return; + + /* Free CPU mask */ + if (config->cpus_valid) { + free_cpumask_var(config->cpus); + config->cpus_valid = false; + } + + /* Reset memory size */ + config->memory_size = 0; + + /* Note: We don't free dtb_data here as it's managed by the caller */ +} + +/** + * Function prototypes + */ +static int mk_dt_parse_memory(const void *fdt, int chosen_node, + struct mk_dt_config *config); +static int mk_dt_parse_cpus(const void *fdt, int chosen_node, + struct mk_dt_config *config); +static int mk_dt_validate_memory(const struct mk_dt_config *config); +static int mk_dt_validate_cpus(const struct mk_dt_config *config); + +/** + * Memory region parsing + */ +static int mk_dt_parse_memory(const void *fdt, int chosen_node, + struct mk_dt_config *config) +{ + const fdt32_t *prop; + int len; + size_t total_size = 0; + + /* Look for memory-bytes property */ + prop = fdt_getprop(fdt, chosen_node, MK_DT_RESOURCE_MEMORY, &len); + if (!prop) { + pr_debug("No %s property found\n", MK_DT_RESOURCE_MEMORY); + return 0; /* Not an error - property is optional */ + } + + if (len != 4) { + pr_err("Invalid %s property length: %d (must be 4 bytes for single u32 size)\n", + 
MK_DT_RESOURCE_MEMORY, len); + return -EINVAL; + } + + /* Parse single memory size value */ + total_size = fdt32_to_cpu(prop[0]); + if (total_size == 0) { + pr_err("Invalid memory size 0 in %s\n", MK_DT_RESOURCE_MEMORY); + return -EINVAL; + } + + /* Validate size alignment */ + if (total_size & (PAGE_SIZE - 1)) { + pr_err("Memory size 0x%zx not page-aligned\n", total_size); + return -EINVAL; + } + + config->memory_size = total_size; + pr_info("Successfully parsed memory size: %zu bytes (%zu MB)\n", + total_size, total_size >> 20); + return 0; +} + +/** + * CPU resource parsing + */ +static int mk_dt_parse_cpus(const void *fdt, int chosen_node, + struct mk_dt_config *config) +{ + const fdt32_t *prop; + int len, i, cpu_count; + + if (!config->cpus_valid) { + pr_debug("CPU mask allocation failed, skipping CPU parsing\n"); + return 0; + } + + /* Look for cpus property */ + prop = fdt_getprop(fdt, chosen_node, MK_DT_RESOURCE_CPUS, &len); + if (!prop) { + pr_debug("No %s property found\n", MK_DT_RESOURCE_CPUS); + return 0; /* Not an error - property is optional */ + } + + if (len % 4 != 0) { + pr_err("Invalid %s property length: %d (must be multiple of 4)\n", + MK_DT_RESOURCE_CPUS, len); + return -EINVAL; + } + + cpu_count = len / 4; /* Each CPU is a u32 value */ + if (cpu_count == 0) { + pr_err("Empty CPU list in %s\n", MK_DT_RESOURCE_CPUS); + return -EINVAL; + } + + pr_debug("Parsing %d CPUs\n", cpu_count); + + /* Clear the CPU mask first */ + cpumask_clear(config->cpus); + + /* Parse each CPU ID */ + for (i = 0; i < cpu_count; i++) { + u32 cpu_id = fdt32_to_cpu(prop[i]); + + /* Validate CPU ID */ + if (cpu_id >= nr_cpu_ids) { + pr_err("Invalid CPU ID %u (max: %u) in %s\n", + cpu_id, nr_cpu_ids - 1, MK_DT_RESOURCE_CPUS); + return -EINVAL; + } + + /* Check if CPU is online/possible */ + if (!cpu_possible(cpu_id)) { + pr_err("CPU %u is not possible on this system\n", cpu_id); + return -EINVAL; + } + + /* Add to CPU mask */ + cpumask_set_cpu(cpu_id, config->cpus); + pr_debug("Added CPU %u to multikernel instance\n", cpu_id); + } + + pr_info("Successfully parsed %d CPUs: %*pbl\n", + cpu_count, cpumask_pr_args(config->cpus)); + return 0; +} + +/** + * Main device tree parsing function + */ +int mk_dt_parse(const void *dtb_data, size_t dtb_size, + struct mk_dt_config *config) +{ + const void *fdt = dtb_data; + int ret; + + if (!dtb_data || !config) { + pr_err("Invalid parameters to mk_dt_parse\n"); + return -EINVAL; + } + + /* Validate FDT header */ + ret = fdt_check_header(fdt); + if (ret) { + pr_err("Invalid device tree blob: %d\n", ret); + return -EINVAL; + } + + /* Verify size matches */ + if (fdt_totalsize(fdt) > dtb_size) { + pr_err("DTB size mismatch: header=%u, provided=%zu\n", + fdt_totalsize(fdt), dtb_size); + return -EINVAL; + } + + /* Find /instances node */ + int instances_node = fdt_path_offset(fdt, "/instances"); + if (instances_node < 0) { + pr_err("No /instances node found in device tree\n"); + return -ENOENT; + } + + /* Find the first (and should be only) instance */ + int instance_node = fdt_first_subnode(fdt, instances_node); + if (instance_node < 0) { + pr_err("No instance found in /instances node\n"); + return -ENOENT; + } + + /* Find the resources subnode */ + int resources_node = fdt_subnode_offset(fdt, instance_node, "resources"); + if (resources_node < 0) { + pr_err("No resources node found in instance\n"); + return -ENOENT; + } + + /* Store raw DTB reference */ + config->dtb_data = (void *)dtb_data; + config->dtb_size = dtb_size; + + /* Parse memory regions */ + ret = 
mk_dt_parse_memory(fdt, resources_node, config); + if (ret) { + pr_err("Failed to parse memory regions: %d\n", ret); + mk_dt_config_free(config); + return ret; + } + + /* Parse CPU resources */ + ret = mk_dt_parse_cpus(fdt, resources_node, config); + if (ret) { + pr_err("Failed to parse CPU resources: %d\n", ret); + mk_dt_config_free(config); + return ret; + } + + pr_info("Successfully parsed multikernel device tree with %zu bytes memory and %d CPUs\n", + config->memory_size, config->cpus_valid ? cpumask_weight(config->cpus) : 0); + return 0; +} + +/** + * Configuration validation + */ +int mk_dt_validate(const struct mk_dt_config *config) +{ + int ret; + + if (!config) { + pr_err("NULL configuration\n"); + return -EINVAL; + } + + if (config->version != MK_DT_CONFIG_CURRENT) { + pr_err("Unsupported configuration version: %u\n", config->version); + return -ENOTSUPP; + } + + /* Validate memory regions */ + ret = mk_dt_validate_memory(config); + if (ret) + return ret; + + /* Validate CPU resources */ + ret = mk_dt_validate_cpus(config); + if (ret) + return ret; + + return 0; +} + +/** + * Memory region validation + */ +static int mk_dt_validate_memory(const struct mk_dt_config *config) +{ + struct resource *pool_res; + + /* Get multikernel pool resource for validation */ + pool_res = multikernel_get_pool_resource(); + if (!pool_res && config->memory_size > 0) { + pr_err("No multikernel pool available for memory allocation\n"); + return -ENODEV; + } + + /* Validate memory size */ + if (config->memory_size > 0) { + /* Basic sanity checks */ + if (config->memory_size < PAGE_SIZE) { + pr_err("Memory size too small: %zu bytes\n", config->memory_size); + return -EINVAL; + } + + if (config->memory_size > SZ_1G) { + pr_warn("Large memory size requested: %zu bytes\n", config->memory_size); + } + + /* Check if size fits within multikernel pool */ + if (pool_res) { + resource_size_t pool_size = resource_size(pool_res); + if (config->memory_size > pool_size) { + pr_err("Requested memory size %zu bytes exceeds pool size %llu bytes\n", + config->memory_size, pool_size); + return -ERANGE; + } + } + } + + return 0; +} + +/** + * CPU resource validation + */ +static int mk_dt_validate_cpus(const struct mk_dt_config *config) +{ + int cpu; + + /* Skip validation if CPU assignment is not valid or empty */ + if (!config->cpus_valid || cpumask_empty(config->cpus)) + return 0; + + /* Check that all CPUs are still possible/online */ + for_each_cpu(cpu, config->cpus) { + if (!cpu_possible(cpu)) { + pr_err("CPU %d is no longer possible on this system\n", cpu); + return -EINVAL; + } + + if (!cpu_online(cpu)) { + pr_warn("CPU %d is not online, multikernel may fail to start\n", cpu); + } + } + + /* Check for reasonable CPU count */ + if (cpumask_weight(config->cpus) > num_online_cpus()) { + pr_warn("Requested %d CPUs but only %d are online\n", + cpumask_weight(config->cpus), num_online_cpus()); + } + + /* Ensure we don't assign CPU 0 unless explicitly allowed */ + if (cpumask_test_cpu(0, config->cpus)) { + pr_warn("CPU 0 assigned to multikernel instance - this may affect system stability\n"); + } + + return 0; +} + +/** + * Resource availability checking + */ +bool mk_dt_resources_available(const struct mk_dt_config *config) +{ + struct resource *pool_res; + + if (!config) + return false; + + /* Check if multikernel pool is available */ + pool_res = multikernel_get_pool_resource(); + if (!pool_res) { + pr_debug("No multikernel pool available\n"); + return false; + } + + /* Check if requested memory size is available */ + 
if (config->memory_size > 0) { + resource_size_t pool_size = resource_size(pool_res); + if (pool_size < config->memory_size) { + pr_debug("Pool too small: need %zu, have %llu\n", + config->memory_size, pool_size); + return false; + } + } + + /* Check CPU availability */ + if (config->cpus_valid && !cpumask_empty(config->cpus)) { + int cpu; + + for_each_cpu(cpu, config->cpus) { + if (!cpu_possible(cpu)) { + pr_debug("CPU %d is not possible\n", cpu); + return false; + } + } + } + + /* TODO: More sophisticated checking: + * - Check for fragmentation + * - Honor specific start address requests + * - Check for conflicts with existing allocations + * - Check for CPU conflicts with other instances + */ + + return true; +} + +/** + * Property size helper + */ +int mk_dt_get_property_size(const void *dtb_data, size_t dtb_size, + const char *property) +{ + const void *fdt = dtb_data; + int chosen_node; + const void *prop; + int len; + + if (!dtb_data || !property) + return -EINVAL; + + if (fdt_check_header(fdt)) + return -EINVAL; + + chosen_node = fdt_path_offset(fdt, "/chosen"); + if (chosen_node < 0) + return -ENOENT; + + prop = fdt_getprop(fdt, chosen_node, property, &len); + if (!prop) + return -ENOENT; + + return len; +} + +/** + * Debug and information functions + */ +void mk_dt_print_config(const struct mk_dt_config *config) +{ + if (!config) { + pr_info("Multikernel DT config: (null)\n"); + return; + } + + pr_info("Multikernel DT config (version %u):\n", config->version); + + if (config->memory_size > 0) { + pr_info(" Memory size: %zu bytes (%zu MB)\n", + config->memory_size, config->memory_size >> 20); + } else { + pr_info(" Memory size: none specified\n"); + } + + if (config->cpus_valid) { + if (cpumask_empty(config->cpus)) { + pr_info(" CPU assignment: none specified\n"); + } else { + pr_info(" CPU assignment: %*pbl (%d CPUs)\n", + cpumask_pr_args(config->cpus), cpumask_weight(config->cpus)); + } + } else { + pr_info(" CPU assignment: unavailable (allocation failed)\n"); + } + + pr_info(" DTB: %zu bytes\n", config->dtb_size); +} diff --git a/kernel/multikernel/internal.h b/kernel/multikernel/internal.h new file mode 100644 index 000000000000..54253d29b2ce --- /dev/null +++ b/kernel/multikernel/internal.h @@ -0,0 +1,4 @@ +extern struct resource multikernel_res; +extern struct mutex mk_instance_mutex; +extern struct idr mk_instance_idr; +extern struct list_head mk_instance_list; diff --git a/kernel/multikernel/kernfs.c b/kernel/multikernel/kernfs.c new file mode 100644 index 000000000000..97f6e087edd8 --- /dev/null +++ b/kernel/multikernel/kernfs.c @@ -0,0 +1,772 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Multikernel Technologies, Inc. All rights reserved + * + * Multikernel kernel instance filesystem + * + * Provides a dedicated filesystem for multikernel instance management. 
+ * Mounted at /sys/fs/multikernel/ with full mkdir/rmdir support: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MULTIKERNEL_MAGIC 0x6d6b6673 /* "mkfs" */ + +/* Global multikernel filesystem state */ +static struct kernfs_root *mk_kernfs_root; /* Kernfs root for multikernel filesystem */ +static struct kernfs_node *mk_root_kn; /* Root kernfs node */ +static struct kernfs_node *mk_instances_kn; /* Instances subdirectory node */ +LIST_HEAD(mk_instance_list); /* List of all instances */ +DEFINE_MUTEX(mk_instance_mutex); /* Protects instance list */ +DEFINE_IDR(mk_instance_idr); /* ID allocator for instances */ + +/* Filesystem context structure */ +struct mk_fs_context { + struct kernfs_fs_context kfc; +}; + +/* Forward declarations */ +static int mk_kernfs_mkdir(struct kernfs_node *parent, const char *name, umode_t mode); +static int mk_kernfs_rmdir(struct kernfs_node *kn); +static int mk_get_tree(struct fs_context *fc); +static void mk_free_fs_context(struct fs_context *fc); +static int mk_init_fs_context(struct fs_context *fc); +static void mk_kill_sb(struct super_block *sb); +static int mk_create_instance_files(struct mk_instance *instance); +static int mk_create_instance_from_dtb(const char *name, int id, const void *fdt, + int instance_node, size_t full_dtb_size); + +/* Kernfs syscall operations */ +static struct kernfs_syscall_ops mk_kernfs_syscall_ops = { + .mkdir = mk_kernfs_mkdir, + .rmdir = mk_kernfs_rmdir, +}; + +/* Filesystem context operations */ +static const struct fs_context_operations mk_fs_context_ops = { + .free = mk_free_fs_context, + .get_tree = mk_get_tree, +}; + +/* Filesystem type */ +static struct file_system_type mk_fs_type = { + .name = "multikernel", + .init_fs_context = mk_init_fs_context, + .kill_sb = mk_kill_sb, + .fs_flags = 0, +}; + +/** + * State string conversion + */ +static const char * const mk_state_strings[] = { + [MK_STATE_EMPTY] = "empty", + [MK_STATE_READY] = "ready", + [MK_STATE_LOADING] = "loading", + [MK_STATE_ACTIVE] = "active", + [MK_STATE_FAILED] = "failed", +}; + +const char *mk_state_to_string(enum mk_instance_state state) +{ + if (state >= 0 && state < ARRAY_SIZE(mk_state_strings)) + return mk_state_strings[state]; + return "unknown"; +} + +enum mk_instance_state mk_string_to_state(const char *str) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mk_state_strings); i++) { + if (sysfs_streq(str, mk_state_strings[i])) + return i; + } + return MK_STATE_FAILED; /* Invalid input */ +} + +/** + * Kernfs file operations for instance attributes + */ + +/* id attribute - shows kernel-assigned ID */ +static int id_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + struct mk_instance *instance = of->kn->priv; + seq_printf(sf, "%d\n", instance->id); + return 0; +} + +/* status attribute - shows instance state (read-only, managed by kernel) */ +static int status_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + struct mk_instance *instance = of->kn->priv; + seq_printf(sf, "%s\n", mk_state_to_string(instance->state)); + return 0; +} + +/* Root-level device_tree attribute - handles full multikernel DTB upload */ +static int root_device_tree_seq_show(struct seq_file *sf, void *v) +{ + seq_printf(sf, "Write multikernel device tree here to propagate to instances\n"); + return 0; +} + +/* Instance device_tree_source attribute - shows 
instance-specific DTB (read-only) */ +static int device_tree_source_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + struct mk_instance *instance = of->kn->priv; + + if (!instance->dtb_data) { + seq_printf(sf, "(no device tree loaded)\n"); + return 0; + } + + /* Parse the instance DTB and show it with the instance as root */ + const void *fdt = instance->dtb_data; + int ret = fdt_check_header(fdt); + if (ret) { + seq_printf(sf, "/* Invalid device tree blob: %d */\n", ret); + return 0; + } + + int instances_node = fdt_path_offset(fdt, "/instances"); + if (instances_node < 0) { + seq_printf(sf, "/* No instances node found */\n"); + return 0; + } + + /* Find the first (and should be only) instance in this per-instance DTB */ + int instance_node = fdt_first_subnode(fdt, instances_node); + if (instance_node < 0) { + seq_printf(sf, "/* No instance found */\n"); + return 0; + } + + const char *instance_name = fdt_get_name(fdt, instance_node, NULL); + if (!instance_name) { + seq_printf(sf, "/* Invalid instance name */\n"); + return 0; + } + + /* Show the DTB with instance as root node */ + seq_printf(sf, "/dts-v1/;\n\n"); + seq_printf(sf, "/%s {\n", instance_name); + seq_printf(sf, "\tcompatible = \"multikernel-v1\";\n"); + + /* Display instance properties */ + int prop_offset = fdt_first_property_offset(fdt, instance_node); + while (prop_offset >= 0) { + const struct fdt_property *prop = fdt_get_property_by_offset(fdt, prop_offset, NULL); + if (prop) { + const char *prop_name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); + int prop_len = fdt32_to_cpu(prop->len); + + if (prop_len == 4) { + /* Assume it's a u32 */ + u32 val = fdt32_to_cpu(*(const fdt32_t *)prop->data); + seq_printf(sf, "\t%s = <%u>;\n", prop_name, val); + } else { + /* For other types, show as hex */ + seq_printf(sf, "\t%s = [", prop_name); + const u8 *data = (const u8 *)prop->data; + for (int i = 0; i < prop_len; i++) { + seq_printf(sf, "%02x", data[i]); + if (i < prop_len - 1) seq_printf(sf, " "); + } + seq_printf(sf, "];\n"); + } + } + prop_offset = fdt_next_property_offset(fdt, prop_offset); + } + + /* Display subnodes (like resources) */ + int subnode; + fdt_for_each_subnode(subnode, fdt, instance_node) { + const char *subnode_name = fdt_get_name(fdt, subnode, NULL); + if (subnode_name) { + seq_printf(sf, "\t%s {\n", subnode_name); + + /* Display subnode properties */ + prop_offset = fdt_first_property_offset(fdt, subnode); + while (prop_offset >= 0) { + const struct fdt_property *prop = fdt_get_property_by_offset(fdt, prop_offset, NULL); + if (prop) { + const char *prop_name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); + int prop_len = fdt32_to_cpu(prop->len); + + if (prop_len == 4) { + /* Assume it's a u32 */ + u32 val = fdt32_to_cpu(*(const fdt32_t *)prop->data); + if (strcmp(prop_name, "memory-bytes") == 0) { + seq_printf(sf, "\t\t%s = <0x%x>; // %u MB\n", + prop_name, val, val >> 20); + } else { + seq_printf(sf, "\t\t%s = <%u>;\n", prop_name, val); + } + } else { + /* For other types, show as hex */ + seq_printf(sf, "\t\t%s = [", prop_name); + const u8 *data = (const u8 *)prop->data; + for (int i = 0; i < prop_len; i++) { + seq_printf(sf, "%02x", data[i]); + if (i < prop_len - 1) seq_printf(sf, " "); + } + seq_printf(sf, "];\n"); + } + } + prop_offset = fdt_next_property_offset(fdt, prop_offset); + } + + seq_printf(sf, "\t};\n"); + } + } + + seq_printf(sf, "};\n"); + return 0; +} + +/* Root-level device_tree write - parses multikernel DTB and creates instances */ +static 
ssize_t root_device_tree_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t off) +{ + const void *fdt = buf; + int instances_node, instance_node; + const fdt32_t *id_prop; + const char *instance_name; + struct mk_instance *instance; + int ret; + + pr_info("Loading multikernel device tree (%zu bytes)\n", count); + + /* Validate DTB header */ + ret = fdt_check_header(fdt); + if (ret) { + pr_err("Invalid device tree header: %d\n", ret); + return -EINVAL; + } + + /* Find /instances node */ + instances_node = fdt_path_offset(fdt, "/instances"); + if (instances_node < 0) { + pr_err("No /instances node found in device tree\n"); + return -EINVAL; + } + + mutex_lock(&mk_instance_mutex); + + /* Iterate through each instance in the DTB */ + fdt_for_each_subnode(instance_node, fdt, instances_node) { + /* Get instance name */ + instance_name = fdt_get_name(fdt, instance_node, NULL); + if (!instance_name) { + pr_err("Failed to get instance name\n"); + continue; + } + + /* Get instance ID */ + id_prop = fdt_getprop(fdt, instance_node, "id", NULL); + if (!id_prop) { + pr_err("No 'id' property found in instance '%s'\n", instance_name); + continue; + } + + /* Check if instance already exists */ + instance = mk_instance_find_by_name(instance_name); + if (instance) { + pr_info("Instance '%s' already exists, updating DTB\n", instance_name); + } else { + /* Create new instance */ + ret = mk_create_instance_from_dtb(instance_name, fdt32_to_cpu(*id_prop), + fdt, instance_node, count); + if (ret) { + pr_err("Failed to create instance '%s': %d\n", instance_name, ret); + continue; + } + } + } + + mutex_unlock(&mk_instance_mutex); + + pr_info("Successfully processed multikernel device tree\n"); + return count; +} + +/* Helper function to extract instance DTB from a specific node */ +static int mk_extract_instance_dtb_from_node(const void *fdt, int instance_node, + const char *instance_name, + void **instance_dtb, size_t *instance_size) +{ + void *new_fdt; + int ret; + size_t new_size = PAGE_SIZE; + + /* Create new DTB with just this instance */ + new_fdt = kmalloc(new_size, GFP_KERNEL); + if (!new_fdt) + return -ENOMEM; + + ret = fdt_create(new_fdt, new_size); + ret |= fdt_finish_reservemap(new_fdt); + ret |= fdt_begin_node(new_fdt, ""); + ret |= fdt_property_string(new_fdt, "compatible", "multikernel-v1"); + ret |= fdt_begin_node(new_fdt, "instances"); + + /* Copy the instance node */ + ret |= fdt_begin_node(new_fdt, instance_name); + + /* Copy all properties from the instance node */ + int prop_offset = fdt_first_property_offset(fdt, instance_node); + while (prop_offset >= 0) { + const struct fdt_property *prop = fdt_get_property_by_offset(fdt, prop_offset, NULL); + if (prop) { + const char *prop_name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); + ret |= fdt_property(new_fdt, prop_name, prop->data, fdt32_to_cpu(prop->len)); + } + prop_offset = fdt_next_property_offset(fdt, prop_offset); + } + + /* Copy all subnodes from the instance node (including resources) */ + int subnode; + fdt_for_each_subnode(subnode, fdt, instance_node) { + const char *subnode_name = fdt_get_name(fdt, subnode, NULL); + if (!subnode_name) + continue; + + ret |= fdt_begin_node(new_fdt, subnode_name); + + /* Copy all properties from the subnode */ + prop_offset = fdt_first_property_offset(fdt, subnode); + while (prop_offset >= 0) { + const struct fdt_property *prop = fdt_get_property_by_offset(fdt, prop_offset, NULL); + if (prop) { + const char *prop_name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); + ret |= 
fdt_property(new_fdt, prop_name, prop->data, fdt32_to_cpu(prop->len)); + } + prop_offset = fdt_next_property_offset(fdt, prop_offset); + } + + ret |= fdt_end_node(new_fdt); /* end subnode */ + } + ret |= fdt_end_node(new_fdt); /* end instance */ + ret |= fdt_end_node(new_fdt); /* end instances */ + ret |= fdt_end_node(new_fdt); /* end root */ + ret |= fdt_finish(new_fdt); + + if (ret) { + pr_err("Failed to create instance DTB: %d\n", ret); + kfree(new_fdt); + return ret; + } + + *instance_dtb = new_fdt; + *instance_size = fdt_totalsize(new_fdt); + + return 0; +} + +static int mk_create_instance_from_dtb(const char *name, int id, const void *fdt, + int instance_node, size_t full_dtb_size) +{ + struct mk_instance *instance; + struct kernfs_node *kn; + struct mk_dt_config config; + void *instance_dtb; + size_t instance_dtb_size; + int ret; + + ret = mk_extract_instance_dtb_from_node(fdt, instance_node, name, + &instance_dtb, &instance_dtb_size); + if (ret) { + pr_err("Failed to extract DTB for instance '%s': %d\n", name, ret); + return ret; + } + + instance = kzalloc(sizeof(*instance), GFP_KERNEL); + if (!instance) { + kfree(instance_dtb); + return -ENOMEM; + } + + instance->id = id; + instance->name = kstrdup(name, GFP_KERNEL); + if (!instance->name) { + ret = -ENOMEM; + goto cleanup_instance; + } + + instance->state = MK_STATE_EMPTY; + INIT_LIST_HEAD(&instance->list); + INIT_LIST_HEAD(&instance->memory_regions); + instance->region_count = 0; + kref_init(&instance->refcount); + + /* Initialize CPU mask */ + if (!alloc_cpumask_var(&instance->cpus, GFP_KERNEL)) { + pr_warn("Failed to allocate CPU mask for instance '%s'\n", name); + instance->cpus_valid = false; + } else { + cpumask_clear(instance->cpus); + instance->cpus_valid = true; + } + + /* Create kernfs directory under instances/ */ + kn = kernfs_create_dir(mk_instances_kn, name, 0755, instance); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + pr_err("Failed to create kernfs directory for instance '%s': %d\n", name, ret); + goto cleanup_instance_name; + } + + instance->kn = kn; + + /* Parse and validate the instance DTB */ + mk_dt_config_init(&config); + ret = mk_dt_parse(instance_dtb, instance_dtb_size, &config); + if (ret) { + pr_err("Failed to parse DTB for instance '%s': %d\n", name, ret); + goto cleanup_kernfs; + } + + /* Reserve resources */ + ret = mk_instance_reserve_resources(instance, &config); + if (ret) { + pr_err("Failed to reserve resources for instance '%s': %d\n", name, ret); + goto cleanup_config; + } + + /* Store DTB data in instance */ + instance->dtb_data = instance_dtb; + instance->dtb_size = instance_dtb_size; + + /* Create instance attribute files */ + ret = mk_create_instance_files(instance); + if (ret) { + pr_err("Failed to create attribute files for instance '%s': %d\n", name, ret); + goto cleanup_config; + } + + /* Store in IDR for quick lookup */ + ret = idr_alloc(&mk_instance_idr, instance, id, id + 1, GFP_KERNEL); + if (ret < 0) { + pr_err("Failed to allocate IDR slot %d for instance '%s': %d\n", id, name, ret); + goto cleanup_config; + } + + /* Add to global list */ + list_add_tail(&instance->list, &mk_instance_list); + + /* Update instance state */ + mk_instance_set_state(instance, MK_STATE_READY); + + /* Activate the kernfs node */ + kernfs_activate(kn); + + /* Clean up parsed config */ + mk_dt_config_free(&config); + + pr_info("Created instance '%s' (ID: %d) from multikernel DTB\n", name, id); + return 0; + +cleanup_config: + mk_dt_config_free(&config); +cleanup_kernfs: + kernfs_remove(kn); 
+cleanup_instance_name:
+	kfree(instance->name);
+cleanup_instance:
+	kfree(instance);
+	kfree(instance_dtb);
+	return ret;
+}
+
+
+/* Kernfs file operations */
+static const struct kernfs_ops mk_id_ops = {
+	.seq_show = id_seq_show,
+};
+
+static const struct kernfs_ops mk_status_ops = {
+	.seq_show = status_seq_show,
+};
+
+/* Root-level device_tree operations */
+static const struct kernfs_ops mk_root_device_tree_ops = {
+	.seq_show = root_device_tree_seq_show,
+	.write = root_device_tree_write,
+};
+
+/* Instance device_tree_source operations (read-only) */
+static const struct kernfs_ops mk_device_tree_source_ops = {
+	.seq_show = device_tree_source_seq_show,
+};
+
+/**
+ * Create instance attribute files in kernfs
+ */
+static int mk_create_instance_files(struct mk_instance *instance)
+{
+	struct kernfs_node *kn;
+
+	/* Create id file */
+	kn = __kernfs_create_file(instance->kn, "id", 0444,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+				  &mk_id_ops, instance, NULL, NULL);
+	if (IS_ERR(kn)) {
+		pr_err("Failed to create id file for instance %s\n", instance->name);
+		return PTR_ERR(kn);
+	}
+
+	/* Create status file (read-only, managed by kernel) */
+	kn = __kernfs_create_file(instance->kn, "status", 0444,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+				  &mk_status_ops, instance, NULL, NULL);
+	if (IS_ERR(kn)) {
+		pr_err("Failed to create status file for instance %s\n", instance->name);
+		return PTR_ERR(kn);
+	}
+
+	/* Create device_tree_source file (read-only) */
+	kn = __kernfs_create_file(instance->kn, "device_tree_source", 0444,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+				  &mk_device_tree_source_ops, instance, NULL, NULL);
+	if (IS_ERR(kn)) {
+		pr_err("Failed to create device_tree_source file for instance %s\n", instance->name);
+		return PTR_ERR(kn);
+	}
+
+	return 0;
+}
+
+/**
+ * mk_kernfs_mkdir - Handle mkdir operations in multikernel kernfs
+ * @parent: Parent kernfs node
+ * @name: Directory name to create
+ * @mode: Directory mode
+ *
+ * Called when a user runs: mkdir /sys/fs/multikernel/instances/my-kernel
+ */
+static int mk_kernfs_mkdir(struct kernfs_node *parent, const char *name, umode_t mode)
+{
+	/* Only allow mkdir under the instances/ directory */
+	if (parent != mk_instances_kn) {
+		pr_err("Instances can only be created under the instances/ directory\n");
+		return -EPERM;
+	}
+
+	/* Manual instance creation is disabled - instances are created via device_tree upload */
+	pr_err("Manual instance creation disabled. Upload a multikernel device tree to /device_tree instead.\n");
+	return -EPERM;
+}
+
+/**
+ * mk_kernfs_rmdir - Handle rmdir operations in multikernel kernfs
+ * @kn: Kernfs node to remove
+ */
+static int mk_kernfs_rmdir(struct kernfs_node *kn)
+{
+	struct mk_instance *instance = kn->priv;
+
+	if (!instance) {
+		pr_err("No instance data found for kernfs node\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&mk_instance_mutex);
+
+	/* Prevent removal of active instances */
+	if (instance->state == MK_STATE_ACTIVE) {
+		mutex_unlock(&mk_instance_mutex);
+		pr_err("Cannot remove active instance '%s' (ID: %d). Instance must be stopped first.\n",
+		       instance->name, instance->id);
+		return -EBUSY;
+	}
+
+	/* An instance that is currently loading cannot be removed either */
+	if (instance->state == MK_STATE_LOADING) {
+		mutex_unlock(&mk_instance_mutex);
+		pr_err("Cannot remove loading instance '%s' (ID: %d).
Wait for loading to complete.\n", + instance->name, instance->id); + return -EBUSY; + } + + list_del(&instance->list); + idr_remove(&mk_instance_idr, instance->id); + + mutex_unlock(&mk_instance_mutex); + + mk_instance_put(instance); + kernfs_remove_self(kn); + return 0; +} + +/** + * Filesystem operations implementation + */ + +static int mk_init_fs_context(struct fs_context *fc) +{ + struct mk_fs_context *ctx; + struct kernfs_fs_context *kfc; + + ctx = kzalloc(sizeof(struct mk_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + kfc = &ctx->kfc; + kfc->root = mk_kernfs_root; + kfc->magic = MULTIKERNEL_MAGIC; + fc->fs_private = ctx; + fc->ops = &mk_fs_context_ops; + fc->global = true; + return 0; +} + +static int mk_get_tree(struct fs_context *fc) +{ + int ret; + + ret = kernfs_get_tree(fc); + if (ret) + return ret; + + return 0; +} + +static void mk_free_fs_context(struct fs_context *fc) +{ + struct mk_fs_context *ctx = fc->fs_private; + + if (ctx) { + kernfs_free_fs_context(fc); + kfree(ctx); + } + fc->fs_private = NULL; +} + +static void mk_kill_sb(struct super_block *sb) +{ + kernfs_kill_sb(sb); +} + +/** + * Module initialization and cleanup + */ +int mk_kernfs_init(void) +{ + int ret; + + /* Create kernfs root with mkdir/rmdir support */ + mk_kernfs_root = kernfs_create_root(&mk_kernfs_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + NULL); + if (IS_ERR(mk_kernfs_root)) { + ret = PTR_ERR(mk_kernfs_root); + pr_err("Failed to create multikernel kernfs root: %d\n", ret); + return ret; + } + + /* Get the root kernfs node */ + mk_root_kn = kernfs_root_to_node(mk_kernfs_root); + + /* Create instances subdirectory */ + mk_instances_kn = kernfs_create_dir(mk_root_kn, "instances", 0755, NULL); + if (IS_ERR(mk_instances_kn)) { + ret = PTR_ERR(mk_instances_kn); + pr_err("Failed to create instances directory: %d\n", ret); + kernfs_destroy_root(mk_kernfs_root); + return ret; + } + + /* Create root-level device_tree file */ + struct kernfs_node *device_tree_kn = __kernfs_create_file(mk_root_kn, "device_tree", 0644, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + &mk_root_device_tree_ops, NULL, NULL, NULL); + if (IS_ERR(device_tree_kn)) { + ret = PTR_ERR(device_tree_kn); + pr_err("Failed to create root device_tree file: %d\n", ret); + kernfs_destroy_root(mk_kernfs_root); + return ret; + } + + /* Register the filesystem */ + ret = register_filesystem(&mk_fs_type); + if (ret) { + pr_err("Failed to register multikernel filesystem: %d\n", ret); + kernfs_destroy_root(mk_kernfs_root); + return ret; + } + + /* Create a mount point in sysfs */ + ret = sysfs_create_mount_point(fs_kobj, "multikernel"); + if (ret) { + pr_err("Failed to create multikernel mount point: %d\n", ret); + unregister_filesystem(&mk_fs_type); + kernfs_destroy_root(mk_kernfs_root); + return ret; + } + + /* Activate the kernfs root */ + kernfs_activate(mk_root_kn); + + pr_info("Multikernel filesystem initialized. 
Mount with: mount -t multikernel none /sys/fs/multikernel\n"); + return 0; +} + +void mk_kernfs_cleanup(void) +{ + struct mk_instance *instance, *tmp; + + /* Remove all instances */ + mutex_lock(&mk_instance_mutex); + list_for_each_entry_safe(instance, tmp, &mk_instance_list, list) { + list_del(&instance->list); + idr_remove(&mk_instance_idr, instance->id); + mk_instance_put(instance); + } + mutex_unlock(&mk_instance_mutex); + + /* Clean up IDR */ + idr_destroy(&mk_instance_idr); + + /* Remove sysfs mount point */ + sysfs_remove_mount_point(fs_kobj, "multikernel"); + + /* Unregister filesystem */ + unregister_filesystem(&mk_fs_type); + + /* Remove kernfs directory */ + if (mk_root_kn) { + kernfs_remove(mk_root_kn); + mk_root_kn = NULL; + } + + /* Destroy kernfs root */ + if (mk_kernfs_root) { + kernfs_destroy_root(mk_kernfs_root); + mk_kernfs_root = NULL; + } + + pr_info("Multikernel filesystem cleaned up\n"); +} -- 2.34.1 From: Cong Wang Establish bidirectional integration between the kexec subsystem and multikernel instance management, enabling proper lifecycle tracking and resource coordination for multikernel operations. This commit introduces: * Enhanced kimage structure with multikernel-specific fields including mk_id for unique multikernel identification and mk_instance pointer for cross-referencing with the multikernel instance management system, enabling proper state synchronization. * UAPI extensions in include/uapi/linux/kexec.h that define multikernel ID encoding within kexec flags using KEXEC_MK_ID_MASK and KEXEC_MK_ID_SHIFT, providing up to 2047 unique multikernel instances with proper bit field management macros. * Multikernel image lookup infrastructure through kimage_find_by_id() that leverages the mk_instance system for efficient image retrieval by multikernel ID, replacing CPU-based lookup with proper instance management. * Refactored multikernel_kexec() interface from CPU-based to ID-based operation (multikernel_kexec_by_id()) that uses instance CPU assignments from device tree configuration rather than manual CPU specification, improving safety and consistency. * Proper resource lifecycle management in kimage_free() that clears cross-references, updates instance states, and handles reference counting when multikernel images are freed, preventing resource leaks and dangling pointers. * Updated reboot syscall interface that accepts multikernel instance IDs instead of CPU numbers in LINUX_REBOOT_CMD_MULTIKERNEL, providing a more intuitive and safer user interface. 
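For illustration, a user-space loader could tag an image with its
instance ID at load time and later boot it through reboot(2). The
sketch below is hypothetical: struct multikernel_boot_args mirrors the
kernel-side definition in kernel/reboot.c, which is not exported
through UAPI headers.

	#include <linux/kexec.h>	/* KEXEC_MULTIKERNEL, KEXEC_MK_ID() */
	#include <linux/reboot.h>	/* LINUX_REBOOT_* constants */
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Mirrors the kernel's private definition */
	struct multikernel_boot_args { int mk_id; };

	int main(void)
	{
		struct multikernel_boot_args args = { .mk_id = 3 };

		/* Load phase (not shown): pass the instance ID in the
		 * kexec flags, e.g. KEXEC_MULTIKERNEL | KEXEC_MK_ID(3). */

		/* Boot phase: kick the instance's assigned CPU. */
		return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
			       LINUX_REBOOT_MAGIC2,
			       LINUX_REBOOT_CMD_MULTIKERNEL, &args);
	}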
Signed-off-by: Cong Wang --- include/linux/kexec.h | 10 ++++- include/linux/multikernel.h | 3 ++ include/uapi/linux/kexec.h | 4 ++ kernel/kexec_core.c | 75 +++++++++++++++++++++++++++++++++---- kernel/reboot.c | 4 +- 5 files changed, 85 insertions(+), 11 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 69877db5360b..5e9e9ad1dfeb 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -132,6 +132,7 @@ struct purgatory_info { }; struct kimage; +struct mk_instance; typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, @@ -434,6 +435,12 @@ struct kimage { /* For multikernel support: linked list node */ struct list_head list; + + /* Multikernel unique ID (0 = current kernel, >0 = multikernel images) */ + int mk_id; + + /* Multikernel instance cross-reference */ + struct mk_instance *mk_instance; }; /* kexec interface functions */ @@ -441,7 +448,8 @@ extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); -extern int multikernel_kexec(int cpu); +extern int multikernel_kexec_by_id(int mk_id); +extern struct kimage *kimage_find_by_id(int mk_id); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h index 75cbb316d565..c65d39a66b84 100644 --- a/include/linux/multikernel.h +++ b/include/linux/multikernel.h @@ -102,6 +102,9 @@ struct mk_instance { void *dtb_data; /* Device tree blob data */ size_t dtb_size; /* Size of DTB */ + /* Kexec integration */ + struct kimage *kimage; /* Associated kimage object */ + /* Sysfs representation */ struct kernfs_node *kn; /* Kernfs node for this instance */ diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 346e0ff4e663..82a562ae6ac1 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -15,6 +15,10 @@ #define KEXEC_UPDATE_ELFCOREHDR 0x00000004 #define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008 #define KEXEC_MULTIKERNEL 0x00000010 +#define KEXEC_MK_ID_MASK 0x0000ffe0 +#define KEXEC_MK_ID_SHIFT 5 +#define KEXEC_MK_ID(id) (((id) << KEXEC_MK_ID_SHIFT) & KEXEC_MK_ID_MASK) +#define KEXEC_GET_MK_ID(flags) (((flags) & KEXEC_MK_ID_MASK) >> KEXEC_MK_ID_SHIFT) #define KEXEC_ARCH_MASK 0xffff0000 /* diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 449096060fe8..ed5c97b4531e 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -255,6 +256,12 @@ struct kimage *do_kimage_alloc_init(void) /* Initialize the list node for multikernel support */ INIT_LIST_HEAD(&image->list); + /* Initialize multikernel ID (0 = current kernel, will be assigned later for multikernel) */ + image->mk_id = 0; + + /* Initialize multikernel instance cross-reference */ + image->mk_instance = NULL; + #ifdef CONFIG_CRASH_HOTPLUG image->hp_action = KEXEC_CRASH_HP_NONE; image->elfcorehdr_index = -1; @@ -594,6 +601,16 @@ void kimage_free(struct kimage *image) else if (image == kexec_crash_image) kimage_update_compat_pointers(NULL, KEXEC_TYPE_CRASH); + /* Remove from IDR if it's a multikernel image */ + if (image->type == KEXEC_TYPE_MULTIKERNEL && image->mk_instance) { + /* Clear cross-reference and update state */ + image->mk_instance->kimage = NULL; + mk_instance_set_state(image->mk_instance, 
MK_STATE_READY); + mk_instance_put(image->mk_instance); + image->mk_instance = NULL; + pr_info("Freed multikernel ID %d\n", image->mk_id); + } + #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); @@ -1393,26 +1410,68 @@ int kernel_kexec(void) return error; } -int multikernel_kexec(int cpu) +/* + * Find a multikernel image by ID using mk_instance lookup + */ +struct kimage *kimage_find_by_id(int mk_id) { - int rc; + struct mk_instance *instance; + struct kimage *image = NULL; - pr_info("multikernel kexec: cpu %d\n", cpu); + if (mk_id <= 0) + return NULL; - if (cpu_online(cpu)) { - pr_err("The CPU is currently running with this kernel instance."); - return -EBUSY; + /* Use mk_instance system to find the associated kimage */ + instance = mk_instance_find(mk_id); + if (instance) { + image = instance->kimage; + mk_instance_put(instance); /* Release reference from find */ } + return image; +} + +int multikernel_kexec_by_id(int mk_id) +{ + struct kimage *mk_image; + struct mk_instance *instance; + int cpu = -1; + int rc; + if (!kexec_trylock()) return -EBUSY; - if (!kexec_image) { + + mk_image = kimage_find_by_id(mk_id); + if (!mk_image) { + pr_err("No multikernel image found with ID %d\n", mk_id); rc = -EINVAL; goto unlock; } + instance = mk_image->mk_instance; + if (instance->cpus_valid && !cpumask_empty(instance->cpus)) { + cpu = cpumask_first(instance->cpus); + pr_info("multikernel kexec: using assigned CPU %d from instance cpumask %*pbl\n", + cpu, cpumask_pr_args(instance->cpus)); + } else { + pr_err("No CPU assignment found for multikernel instance %d - CPU assignment is required\n", + mk_id); + rc = -EINVAL; + goto unlock; + } + + if (cpu_online(cpu)) { + pr_err("CPU %d is currently online and cannot be used for multikernel instance %d\n", + cpu, mk_id); + rc = -EBUSY; + goto unlock; + } + + pr_info("Using multikernel image with ID %d (entry point: 0x%lx) on CPU %d\n", + mk_image->mk_id, mk_image->start, cpu); + cpus_read_lock(); - rc = multikernel_kick_ap(cpu, kexec_image->start); + rc = multikernel_kick_ap(cpu, mk_image->start); cpus_read_unlock(); unlock: diff --git a/kernel/reboot.c b/kernel/reboot.c index f3ac703c4695..bff6d3603a17 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -718,7 +718,7 @@ EXPORT_SYMBOL_GPL(kernel_power_off); DEFINE_MUTEX(system_transition_mutex); struct multikernel_boot_args { - int cpu; + int mk_id; }; /* @@ -807,7 +807,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, case LINUX_REBOOT_CMD_MULTIKERNEL: if (copy_from_user(&boot_args, arg, sizeof(boot_args))) return -EFAULT; - ret = multikernel_kexec(boot_args.cpu); + ret = multikernel_kexec_by_id(boot_args.mk_id); break; #endif -- 2.34.1 From: Cong Wang Signed-off-by: Cong Wang --- Documentation/multikernel/usage.rst | 215 ++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 Documentation/multikernel/usage.rst diff --git a/Documentation/multikernel/usage.rst b/Documentation/multikernel/usage.rst new file mode 100644 index 000000000000..a2ec8d56ca1d --- /dev/null +++ b/Documentation/multikernel/usage.rst @@ -0,0 +1,215 @@ +=================================== +Multikernel Kernfs Interface Usage +=================================== + +Overview +======== + +The multikernel kernfs interface provides a clean, user-friendly way to manage multikernel instances through the filesystem. The interface is located at ``/sys/fs/multikernel/`` and supports automatic instance creation from multikernel device trees. 
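+
+Before the interface can be used, the filesystem must be mounted (this
+matches the hint printed by ``mk_kernfs_init()``):
+
+.. code-block:: bash
+
+   mount -t multikernel none /sys/fs/multikernel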
+ +Architecture +============ + +:: + + /sys/fs/multikernel/ + ├── device_tree # Root-level DTB upload (write-only) + └── instances/ # Instance directory + ├── web-server/ # Instance created from DTB + │ ├── id # Instance ID (read-only) + │ ├── status # Instance status (read-only) + │ └── device_tree_source # Instance DTB in DTS format (read-only) + ├── database/ # Another instance + │ ├── id + │ ├── status + │ └── device_tree_source + └── ... + +Workflow +======== + +Phase 1: Instance Creation (Automatic from DTB) +------------------------------------------------ + +1. **Create Multikernel Device Tree** + + Create a device tree with multiple instances: + + .. code-block:: dts + + /dts-v1/; + / { + compatible = "multikernel-v1"; + + instances { + web-server { + id = <1>; + resources { + cpus = <1>; + memory-bytes = <0x20000000>; // 512MB + }; + }; + + database { + id = <2>; + resources { + cpus = <2 3>; + memory-bytes = <0x40000000>; // 1GB + }; + }; + }; + }; + +2. **Upload Multikernel DTB** + + .. code-block:: bash + + # Compile device tree to binary format + dtc -O dtb -o multikernel.dtb multikernel.dts + + # Upload DTB to create instances automatically + cat multikernel.dtb > /sys/fs/multikernel/device_tree + + This automatically: + + - Validates DTB format and multikernel-v1 compatibility + - Parses each instance in the ``/instances`` node + - Creates instance directories under ``instances/`` + - Reserves memory and CPU resources for each instance + - Updates each instance status to "ready" + +3. **Check Created Instances** + + .. code-block:: bash + + # List created instances + ls /sys/fs/multikernel/instances/ + # Output: database web-server + + # Check instance details + cat /sys/fs/multikernel/instances/web-server/id + # Output: 1 + + cat /sys/fs/multikernel/instances/web-server/status + # Output: ready + + # View instance device tree + cat /sys/fs/multikernel/instances/web-server/device_tree_source + # Output: DTS format showing the instance configuration + +Phase 2: Kernel Loading (Kexec Integration) +-------------------------------------------- + +1. **Load Kernel Image** + + .. code-block:: bash + + # Load kernel for instance ID 1 (web-server) + kexec_file_load(..., KEXEC_MULTIKERNEL | KEXEC_MK_ID(1)) + + This: + + - Finds pre-reserved resources for instance ID 1 + - Creates kimage using pre-allocated memory and CPU resources + - Updates status to "loading" → "active" + - Preserves instance DTB for KHO (Kexec HandOver) restoration + +2. **Instance DTB Preservation** + + The multikernel system automatically preserves each instance's device tree during kexec for restoration in the spawn kernel. The spawn kernel will: + + - Detect multikernel KHO data during early boot + - Restore the instance's DTB and recreate the instance structure + - Re-reserve the same memory and CPU resources + +Device Tree Format +================== + +Multikernel DTB Structure +-------------------------- + +The multikernel device tree uses the ``/instances`` structure with ``multikernel-v1`` compatibility: + +.. 
code-block:: dts
+
+    /dts-v1/;
+    / {
+        compatible = "multikernel-v1";
+
+        instances {
+            web-server {
+                id = <1>;
+                resources {
+                    cpus = <1>;                   // CPU ID 1
+                    memory-bytes = <0x20000000>;  // 512MB
+                };
+            };
+
+            database {
+                id = <2>;
+                resources {
+                    cpus = <2 3>;                 // CPU IDs 2 and 3
+                    memory-bytes = <0x40000000>;  // 1GB
+                };
+            };
+
+            load-balancer {
+                id = <3>;
+                resources {
+                    cpus = <0>;                   // CPU ID 0
+                    memory-bytes = <0x10000000>;  // 256MB
+                };
+            };
+        };
+    };
+
+Per-Instance DTB Format
+-----------------------
+
+When viewing an instance's ``device_tree_source``, it appears in per-instance format:
+
+.. code-block:: dts
+
+    /dts-v1/;
+
+    /web-server {
+        compatible = "multikernel-v1";
+        id = <1>;
+        resources {
+            cpus = <1>;
+            memory-bytes = <0x20000000>;  // 512 MB
+        };
+    };
+
+Resource Properties
+-------------------
+
+- **cpus**: Array of CPU IDs to assign to this instance
+- **memory-bytes**: Memory size in bytes (must be page-aligned)
+- **id**: Unique instance identifier used for kexec operations
+
+The system validates that:
+
+- CPU IDs are valid and available
+- Memory requests don't exceed the available multikernel pool
+- Instance IDs are unique
+- All values are properly aligned
+
+Instance States
+===============
+
+- **empty**: Instance created but no resources allocated yet
+- **ready**: DTB processed, resources reserved, ready for kexec
+- **loading**: Kernel being loaded via kexec
+- **active**: Kernel running in this instance
+- **failed**: Error occurred during any phase
+
+Interface Restrictions
+======================
+
+The new kernfs interface has the following restrictions:
+
+- **No manual instance creation**: ``mkdir`` under ``instances/`` is disabled
+- **No direct DTB upload to instances**: Instances don't have writable ``device_tree`` files
+- **Centralized DTB management**: All instances must be created via the root ``device_tree`` file
+- **Read-only instance files**: All instance attributes are read-only for consistency
--
2.34.1

From: Cong Wang

Add a dedicated /proc/kimage file to provide read-only access to all
loaded kernel images in the system, for both regular kexec kernel
images and multikernel images.

The interface displays kernel images in a tabular format showing:

- MK_ID: multikernel instance ID (0 for regular kexec images)
- Type: kexec type (default, crash, multikernel)
- Start Address: entry point in hexadecimal format
- Segments: number of memory segments

This interface is particularly useful for inspecting kimages, debugging
kexec, and verifying that kernel images are loaded correctly.
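For example (illustrative output; IDs, addresses and segment counts
will vary):

  $ cat /proc/kimage
  MK_ID Type       Start Address  Segments
  ----- ---------- -------------- --------
      0 default    0x000001000000        4
      2 multikernel 0x000000100000       3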
Signed-off-by: Cong Wang --- kernel/kexec_core.c | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ed5c97b4531e..7db755e64dd6 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -1214,6 +1216,52 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; +/* + * Proc interface for /proc/kimage + */ +static int kimage_proc_show(struct seq_file *m, void *v) +{ + struct kimage *image; + const char *type_names[] = { + [KEXEC_TYPE_DEFAULT] = "default", + [KEXEC_TYPE_CRASH] = "crash", + [KEXEC_TYPE_MULTIKERNEL] = "multikernel" + }; + + seq_printf(m, "MK_ID Type Start Address Segments\n"); + seq_printf(m, "----- ---------- -------------- --------\n"); + + kimage_list_lock(); + if (list_empty(&kexec_image_list)) { + seq_printf(m, "No kimages loaded\n"); + } else { + list_for_each_entry(image, &kexec_image_list, list) { + const char *type_name = "unknown"; + + if (image->type < ARRAY_SIZE(type_names) && type_names[image->type]) + type_name = type_names[image->type]; + + seq_printf(m, "%5d %-10s 0x%012lx %8lu\n", + image->mk_id, type_name, image->start, image->nr_segments); + } + } + kimage_list_unlock(); + + return 0; +} + +static int kimage_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, kimage_proc_show, NULL); +} + +static const struct proc_ops kimage_proc_ops = { + .proc_open = kimage_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + #ifdef CONFIG_SYSCTL static int kexec_limit_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -1285,6 +1333,21 @@ static int __init kexec_core_sysctl_init(void) late_initcall(kexec_core_sysctl_init); #endif +static int __init kimage_proc_init(void) +{ + struct proc_dir_entry *entry; + + entry = proc_create("kimage", 0444, NULL, &kimage_proc_ops); + if (!entry) { + pr_err("Failed to create /proc/kimage\n"); + return -ENOMEM; + } + + pr_debug("Created /proc/kimage interface\n"); + return 0; +} +late_initcall(kimage_proc_init); + bool kexec_load_permitted(int kexec_image_type) { struct kexec_load_limit *limit; -- 2.34.1 From: Cong Wang Add virtual memory allocation interface for multikernel instances, providing convenient high-level functions for memory management within instance-specific memory pools with automatic virtual address mapping. This commit introduces: * Instance-based memory allocation functions (mk_instance_alloc() and mk_instance_free()) that provide virtual memory allocation from instance-specific memory pools with configurable alignment support and automatic physical-to-virtual address mapping using memremap(). * Kimage-based memory allocation wrappers (mk_kimage_alloc() and mk_kimage_free()) that provide convenient access to instance memory pools through kimage structures, commonly used in kexec code paths for multikernel operations. * Automatic memory mapping infrastructure that uses memremap() with MEMREMAP_WB caching policy to provide write-back cached virtual addresses for allocated physical memory from instance pools. * Proper error handling and cleanup with automatic rollback of physical allocations when virtual mapping fails, preventing memory leaks in error conditions. 
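As an illustration, a kexec path could use the kimage-based wrappers
like this (a sketch against the signatures above; the 64 KiB size is
arbitrary):

	/* Stage a scratch buffer inside the instance's reserved memory */
	void *buf = mk_kimage_alloc(image, SZ_64K, PAGE_SIZE);

	if (!buf)
		return -ENOMEM;
	/* ... fill in a segment, boot data, etc. ... */
	mk_kimage_free(image, buf, SZ_64K);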
The allocation functions complement the existing physical memory pool management by providing a higher-level interface suitable for kernel code that requires virtual addresses, such as buffer management, data structure allocation, and inter-kernel communication buffers. This interface enables multikernel subsystems to allocate kernel images, initramfs etc., maintaining proper resource isolation between multikernel instances. Signed-off-by: Cong Wang --- include/linux/multikernel.h | 6 +++ kernel/multikernel/core.c | 98 +++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h index c65d39a66b84..79611923649e 100644 --- a/include/linux/multikernel.h +++ b/include/linux/multikernel.h @@ -280,6 +280,12 @@ int mk_instance_reserve_resources(struct mk_instance *instance, */ void mk_instance_free_memory(struct mk_instance *instance); +void *mk_instance_alloc(struct mk_instance *instance, size_t size, size_t align); +void mk_instance_free(struct mk_instance *instance, void *virt_addr, size_t size); + +void *mk_kimage_alloc(struct kimage *image, size_t size, size_t align); +void mk_kimage_free(struct kimage *image, void *virt_addr, size_t size); + /** * String conversion helpers */ diff --git a/kernel/multikernel/core.c b/kernel/multikernel/core.c index 52bf8e38206a..ee7a21327ea5 100644 --- a/kernel/multikernel/core.c +++ b/kernel/multikernel/core.c @@ -7,7 +7,10 @@ #include #include #include +#include +#include #include +#include #include "internal.h" /** @@ -403,6 +406,101 @@ int mk_instance_reserve_resources(struct mk_instance *instance, return 0; } +/** + * Per-instance memory pool management + */ + +/** + * mk_instance_alloc() - Allocate memory from instance pool + * @instance: Instance to allocate from + * @size: Size to allocate + * @align: Alignment requirement (must be power of 2) + * + * Returns virtual address of allocated memory, or NULL on failure. + */ +void *mk_instance_alloc(struct mk_instance *instance, size_t size, size_t align) +{ + phys_addr_t phys_addr; + void *virt_addr; + + if (!instance || !instance->instance_pool) { + pr_debug("mk_instance_alloc: instance %p has no pool\n", instance); + return NULL; + } + + /* Allocate from instance pool with alignment */ + phys_addr = multikernel_instance_alloc(instance->instance_pool, size, align); + if (!phys_addr) { + pr_debug("Failed to allocate %zu bytes from instance pool (align=0x%zx)\n", size, align); + return NULL; + } + + /* Map to virtual address space */ + virt_addr = memremap(phys_addr, size, MEMREMAP_WB); + if (!virt_addr) { + pr_err("Failed to map instance memory at 0x%llx\n", (unsigned long long)phys_addr); + multikernel_instance_free(instance->instance_pool, phys_addr, size); + return NULL; + } + + return virt_addr; +} + +/** + * mk_instance_free() - Free memory back to instance pool + * @instance: Instance to free to + * @virt_addr: Virtual address to free + * @size: Size to free + */ +void mk_instance_free(struct mk_instance *instance, void *virt_addr, size_t size) +{ + phys_addr_t phys_addr; + + if (!instance || !instance->instance_pool || !virt_addr) + return; + + phys_addr = virt_to_phys(virt_addr); + memunmap(virt_addr); + multikernel_instance_free(instance->instance_pool, phys_addr, size); +} + +/** + * Kimage-based memory pool access functions + * + * These provide convenient wrappers for accessing instance memory pools + * through the kimage structure, commonly used in kexec code paths. 
+ */ + +/** + * mk_kimage_alloc() - Allocate memory from kimage's instance pool + * @image: kimage with associated mk_instance + * @size: Size to allocate + * @align: Alignment requirement (must be power of 2) + * + * Returns virtual address of allocated memory, or NULL on failure. + */ +void *mk_kimage_alloc(struct kimage *image, size_t size, size_t align) +{ + if (!image || !image->mk_instance) + return NULL; + + return mk_instance_alloc(image->mk_instance, size, align); +} + +/** + * mk_kimage_free() - Free memory back to kimage's instance pool + * @image: kimage with associated mk_instance + * @virt_addr: Virtual address to free + * @size: Size to free + */ +void mk_kimage_free(struct kimage *image, void *virt_addr, size_t size) +{ + if (!image || !image->mk_instance) + return; + + mk_instance_free(image->mk_instance, virt_addr, size); +} + static int __init multikernel_init(void) { int ret; -- 2.34.1 From: Cong Wang This patch implements a comprehensive IPI-based communication system for multikernel environments, enabling data exchange between different kernel instances running on separate CPUs. Key features include: - Generic IPI handler registration and callback mechanism allowing modules to register for multikernel communication events - Shared memory infrastructure on top of the general per-instance memory allocation infrastructure - Per-instance data buffers in shared memory for efficient IPI payload transfer up to 256 bytes per message - IRQ work integration for safe callback execution in interrupt context - PFN-based flexible shared memory APIs for page-level data sharing - Resource tracking integration for /proc/iomem visibility It provides the key API multikernel_send_ipi_data() for sending typed data to target kernel instance and multikernel_register_handler() for registering IPI handler. Shared memory is established on top of the per-instance memory allocation infra. This infrastructure enables multikernel instances to coordinate and share data while maintaining isolation on their respective CPU cores. (Note, as a proof-of-concept, we have only implemented the x86 part.) 
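A minimal usage sketch (the type value and handler are hypothetical):

	#define MY_IPI_TYPE 0x42	/* module-defined type identifier */

	static void my_handler(struct mk_ipi_data *data, void *ctx)
	{
		pr_info("got %zu bytes of type %u from CPU %d\n",
			data->data_size, data->type, data->sender_cpu);
	}

	/* Receiver: invoked via irq_work for IPIs of MY_IPI_TYPE */
	struct mk_ipi_handler *h;
	h = multikernel_register_handler(my_handler, NULL, MY_IPI_TYPE);

	/* Sender: copy up to 256 bytes into instance 2's shared buffer
	 * and raise MULTIKERNEL_VECTOR on its first assigned CPU */
	char msg[] = "hello";
	multikernel_send_ipi_data(2, msg, sizeof(msg), MY_IPI_TYPE);

	/* Teardown */
	multikernel_unregister_handler(h);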
Signed-off-by: Cong Wang --- arch/x86/kernel/smp.c | 3 + include/linux/multikernel.h | 66 +++++ kernel/multikernel/Makefile | 2 +- kernel/multikernel/ipi.c | 471 ++++++++++++++++++++++++++++++++++++ 4 files changed, 541 insertions(+), 1 deletion(-) create mode 100644 kernel/multikernel/ipi.c diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e2eba09da7fc..2be7c1a777ef 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -273,10 +273,13 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) } #ifdef CONFIG_MULTIKERNEL +void generic_multikernel_interrupt(void); + DEFINE_IDTENTRY_SYSVEC(sysvec_multikernel) { apic_eoi(); inc_irq_stat(irq_call_count); + generic_multikernel_interrupt(); } #endif /* CONFIG_MULTIKERNEL */ diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h index 79611923649e..ee96bd2332b6 100644 --- a/include/linux/multikernel.h +++ b/include/linux/multikernel.h @@ -14,6 +14,72 @@ #include #include +/** + * Multikernel IPI interface + */ + +/* Maximum data size that can be transferred via IPI */ +#define MK_MAX_DATA_SIZE 256 + +/* Data structure for passing parameters via IPI */ +struct mk_ipi_data { + int sender_cpu; /* Which CPU sent this IPI */ + unsigned int type; /* User-defined type identifier */ + size_t data_size; /* Size of the data */ + char buffer[MK_MAX_DATA_SIZE]; /* Actual data buffer */ +}; + +/* Function pointer type for IPI callbacks */ +typedef void (*mk_ipi_callback_t)(struct mk_ipi_data *data, void *ctx); + +struct mk_ipi_handler { + mk_ipi_callback_t callback; + void *context; + unsigned int ipi_type; /* IPI type this handler is registered for */ + struct mk_ipi_handler *next; + struct mk_ipi_data *saved_data; + struct irq_work work; +}; + +/** + * multikernel_register_handler - Register a callback for multikernel IPI + * @callback: Function to call when IPI is received + * @ctx: Context pointer passed to the callback + * @ipi_type: IPI type this handler should process + * + * Returns pointer to handler on success, NULL on failure + */ +struct mk_ipi_handler *multikernel_register_handler(mk_ipi_callback_t callback, void *ctx, unsigned int ipi_type); + +/** + * multikernel_unregister_handler - Unregister a multikernel IPI callback + * @handler: Handler pointer returned from multikernel_register_handler + */ +void multikernel_unregister_handler(struct mk_ipi_handler *handler); + +/** + * multikernel_send_ipi_data - Send data to another CPU via IPI + * @instance_id: Target multikernel instance ID + * @data: Pointer to data to send + * @data_size: Size of data + * @type: User-defined type identifier + * + * This function copies the data to per-CPU storage and sends an IPI + * to the target CPU. 
+ * + * Returns 0 on success, negative error code on failure + */ +int multikernel_send_ipi_data(int instance_id, void *data, size_t data_size, unsigned long type); + +void generic_multikernel_interrupt(void); + +/* Flexible shared memory APIs (PFN-based) */ +int mk_send_pfn(int instance_id, unsigned long pfn); +int mk_receive_pfn(struct mk_ipi_data *data, unsigned long *out_pfn); +void *mk_receive_map_page(struct mk_ipi_data *data); + +#define mk_receive_unmap_page(p) memunmap(p) + struct resource; extern phys_addr_t multikernel_alloc(size_t size); diff --git a/kernel/multikernel/Makefile b/kernel/multikernel/Makefile index d004c577f13d..b539acc656c6 100644 --- a/kernel/multikernel/Makefile +++ b/kernel/multikernel/Makefile @@ -3,7 +3,7 @@ # Makefile for multikernel support # -obj-y += core.o mem.o kernfs.o dts.o +obj-y += core.o mem.o kernfs.o dts.o ipi.o # Add libfdt include path for device tree parsing CFLAGS_dts.o = -I $(srctree)/scripts/dtc/libfdt diff --git a/kernel/multikernel/ipi.c b/kernel/multikernel/ipi.c new file mode 100644 index 000000000000..b5c4a06747a2 --- /dev/null +++ b/kernel/multikernel/ipi.c @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Multikernel Technologies, Inc. All rights reserved + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Per-instance IPI data - no more global variables */ +struct mk_instance_ipi_data { + void *instance_pool; /* Instance pool handle */ + struct mk_shared_data *shared_mem; /* IPI shared memory for this instance */ + size_t shared_mem_size; /* Size of shared memory */ +}; + +/* Shared memory structures - per-instance design */ +struct mk_shared_data { + struct mk_ipi_data cpu_data[NR_CPUS]; /* Data area for each CPU */ +}; + +#define MK_MAX_INSTANCES 256 +static struct mk_instance_ipi_data *mk_instance_ipi_map[MK_MAX_INSTANCES]; +static DEFINE_SPINLOCK(mk_ipi_map_lock); + +static struct mk_shared_data *mk_this_kernel_ipi_data; +static phys_addr_t mk_ipi_shared_phys_addr; + +/* Callback management */ +static struct mk_ipi_handler *mk_handlers; +static raw_spinlock_t mk_handlers_lock = __RAW_SPIN_LOCK_UNLOCKED(mk_handlers_lock); + +static void *multikernel_alloc_ipi_buffer(void *pool_handle, size_t buffer_size); +static void multikernel_free_ipi_buffer(void *pool_handle, void *virt_addr, size_t buffer_size); + +static void handler_work(struct irq_work *work) +{ + struct mk_ipi_handler *handler = container_of(work, struct mk_ipi_handler, work); + if (handler->callback) + handler->callback(handler->saved_data, handler->context); +} + +/** + * mk_instance_ipi_create() - Create IPI data for a multikernel instance + * @instance: The multikernel instance + * + * Allocates and initializes IPI communication buffers for the given instance. + * Returns 0 on success, negative error code on failure. 
+ */ +static int mk_instance_ipi_create(struct mk_instance *instance) +{ + struct mk_instance_ipi_data *ipi_data; + unsigned long flags; + int ret = 0; + + if (!instance || instance->id < 0 || instance->id >= MK_MAX_INSTANCES) + return -EINVAL; + + ipi_data = kzalloc(sizeof(*ipi_data), GFP_KERNEL); + if (!ipi_data) + return -ENOMEM; + + /* Use the instance's own memory pool */ + ipi_data->instance_pool = instance->instance_pool; + if (!ipi_data->instance_pool) { + pr_err("Instance %d has no memory pool for IPI allocation\n", instance->id); + kfree(ipi_data); + return -ENODEV; + } + + /* Allocate IPI buffer from the instance pool */ + ipi_data->shared_mem_size = sizeof(struct mk_shared_data); + ipi_data->shared_mem = multikernel_alloc_ipi_buffer(ipi_data->instance_pool, + ipi_data->shared_mem_size); + if (!ipi_data->shared_mem) { + pr_err("Failed to allocate IPI shared memory for instance %d\n", instance->id); + kfree(ipi_data); + return -ENOMEM; + } + + /* Initialize the shared memory structure */ + memset(ipi_data->shared_mem, 0, ipi_data->shared_mem_size); + + /* Register in the global map */ + spin_lock_irqsave(&mk_ipi_map_lock, flags); + if (mk_instance_ipi_map[instance->id]) { + pr_err("IPI data already exists for instance %d\n", instance->id); + ret = -EEXIST; + } else { + mk_instance_ipi_map[instance->id] = ipi_data; + } + spin_unlock_irqrestore(&mk_ipi_map_lock, flags); + + if (ret) { + multikernel_free_ipi_buffer(ipi_data->instance_pool, + ipi_data->shared_mem, + ipi_data->shared_mem_size); + kfree(ipi_data); + return ret; + } + + pr_info("Created IPI data for instance %d (%s): virt=%px, size=%zu bytes\n", + instance->id, instance->name, ipi_data->shared_mem, ipi_data->shared_mem_size); + + return 0; +} + +/** + * mk_instance_ipi_destroy() - Destroy IPI data for a multikernel instance + * @instance_id: The instance ID + * + * Cleans up and frees IPI communication buffers for the given instance. + */ +static void mk_instance_ipi_destroy(int instance_id) +{ + struct mk_instance_ipi_data *ipi_data; + unsigned long flags; + + if (instance_id < 0 || instance_id >= MK_MAX_INSTANCES) + return; + + spin_lock_irqsave(&mk_ipi_map_lock, flags); + ipi_data = mk_instance_ipi_map[instance_id]; + mk_instance_ipi_map[instance_id] = NULL; + spin_unlock_irqrestore(&mk_ipi_map_lock, flags); + + if (!ipi_data) + return; + + pr_debug("Destroying IPI data for instance %d\n", instance_id); + + /* Free the shared memory buffer */ + if (ipi_data->shared_mem) { + multikernel_free_ipi_buffer(ipi_data->instance_pool, + ipi_data->shared_mem, + ipi_data->shared_mem_size); + } + + kfree(ipi_data); +} + +/** + * mk_instance_ipi_get() - Get IPI data for a multikernel instance + * @instance_id: The instance ID + * + * Returns the IPI data for the given instance, or NULL if not found. 
+ */ +static struct mk_instance_ipi_data *mk_instance_ipi_get(int instance_id) +{ + struct mk_instance_ipi_data *ipi_data; + unsigned long flags; + + if (instance_id < 0 || instance_id >= MK_MAX_INSTANCES) + return NULL; + + spin_lock_irqsave(&mk_ipi_map_lock, flags); + ipi_data = mk_instance_ipi_map[instance_id]; + spin_unlock_irqrestore(&mk_ipi_map_lock, flags); + + return ipi_data; +} + +/** + * multikernel_register_handler - Register a callback for multikernel IPI + * @callback: Function to call when IPI is received + * @ctx: Context pointer passed to the callback + * @ipi_type: IPI type this handler should process + * + * Returns pointer to handler on success, NULL on failure + */ +struct mk_ipi_handler *multikernel_register_handler(mk_ipi_callback_t callback, void *ctx, unsigned int ipi_type) +{ + struct mk_ipi_handler *handler; + unsigned long flags; + + if (!callback) + return NULL; + + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler) + return NULL; + + handler->callback = callback; + handler->context = ctx; + handler->ipi_type = ipi_type; + + init_irq_work(&handler->work, handler_work); + + raw_spin_lock_irqsave(&mk_handlers_lock, flags); + handler->next = mk_handlers; + mk_handlers = handler; + raw_spin_unlock_irqrestore(&mk_handlers_lock, flags); + + return handler; +} +EXPORT_SYMBOL(multikernel_register_handler); + +/** + * multikernel_unregister_handler - Unregister a multikernel IPI callback + * @handler: Handler pointer returned from multikernel_register_handler + */ +void multikernel_unregister_handler(struct mk_ipi_handler *handler) +{ + struct mk_ipi_handler **pp, *p; + unsigned long flags; + + if (!handler) + return; + + raw_spin_lock_irqsave(&mk_handlers_lock, flags); + pp = &mk_handlers; + while ((p = *pp) != NULL) { + if (p == handler) { + *pp = p->next; + break; + } + pp = &p->next; + } + raw_spin_unlock_irqrestore(&mk_handlers_lock, flags); + + /* Wait for pending work to complete */ + irq_work_sync(&handler->work); + kfree(p); +} +EXPORT_SYMBOL(multikernel_unregister_handler); + +/** + * multikernel_send_ipi_data - Send data to another CPU via IPI + * @instance_id: Target multikernel instance ID + * @data: Pointer to data to send + * @data_size: Size of data + * @type: User-defined type identifier + * + * This function copies the data to per-CPU storage and sends an IPI + * to the target CPU. The cpu parameter must be a physical CPU ID. 
+ *
+ * Returns 0 on success, negative error code on failure
+ */
+int multikernel_send_ipi_data(int instance_id, void *data, size_t data_size, unsigned long type)
+{
+	struct mk_instance_ipi_data *ipi_data;
+	struct mk_ipi_data *target;
+	struct mk_instance *instance;
+	int cpu;
+
+	if (data_size > MK_MAX_DATA_SIZE)
+		return -EINVAL;
+
+	instance = mk_instance_find(instance_id);
+	if (!instance)
+		return -EINVAL;
+
+	cpu = cpumask_first(instance->cpus);
+	mk_instance_put(instance);	/* drop the reference taken by mk_instance_find() */
+
+	/* Get the IPI data for the target instance */
+	ipi_data = mk_instance_ipi_get(instance_id);
+	if (!ipi_data || !ipi_data->shared_mem) {
+		pr_debug("Multikernel IPI shared memory not available for instance %d\n", instance_id);
+		return -ENODEV;
+	}
+
+	/* Get target CPU's data area from shared memory */
+	target = &ipi_data->shared_mem->cpu_data[cpu];
+
+	/* Initialize/clear the IPI data structure to prevent stale data */
+	memset(target, 0, sizeof(*target));
+
+	/* Set header information */
+	target->data_size = data_size;
+	target->sender_cpu = arch_cpu_physical_id(smp_processor_id());
+	target->type = type;
+
+	/* Copy the actual data into the buffer */
+	if (data && data_size > 0)
+		memcpy(target->buffer, data, data_size);
+
+	/* Send IPI to target CPU using physical CPU ID */
+	__apic_send_IPI(cpu, MULTIKERNEL_VECTOR);
+
+	return 0;
+}
+
+/**
+ * multikernel_interrupt_handler - Handle the multikernel IPI
+ *
+ * This function is called when a multikernel IPI is received.
+ * It invokes all registered callbacks with the per-CPU data.
+ *
+ * In spawned kernels, we use the shared IPI data passed via boot parameter.
+ * In host kernels, we may need to check instance mappings.
+ */
+static void multikernel_interrupt_handler(void)
+{
+	struct mk_ipi_data *data;
+	struct mk_ipi_handler *handler;
+	int current_cpu = smp_processor_id();
+	int current_physical_id = arch_cpu_physical_id(current_cpu);
+
+	if (!mk_this_kernel_ipi_data)
+		return;
+
+	data = &mk_this_kernel_ipi_data->cpu_data[current_physical_id];
+
+	if (data->data_size == 0 || data->data_size > MK_MAX_DATA_SIZE) {
+		pr_debug("Multikernel IPI received on CPU %d but no valid data\n", current_cpu);
+		return;
+	}
+
+	pr_info("Multikernel IPI received on CPU %d (physical id %d) from CPU %d type=%u\n",
+		current_cpu, current_physical_id, data->sender_cpu, data->type);
+
+	raw_spin_lock(&mk_handlers_lock);
+	for (handler = mk_handlers; handler; handler = handler->next) {
+		if (handler->ipi_type == data->type) {
+			handler->saved_data = data;
+			irq_work_queue(&handler->work);
+		}
+	}
+	raw_spin_unlock(&mk_handlers_lock);
+}
+
+/**
+ * Generic multikernel interrupt handler - called by the IPI vector
+ *
+ * This is the function that gets called by the IPI vector handler.
+ */
+void generic_multikernel_interrupt(void)
+{
+	multikernel_interrupt_handler();
+}
+
+/**
+ * multikernel_alloc_ipi_buffer() - Allocate IPI communication buffer
+ * @pool_handle: Instance pool handle
+ * @buffer_size: Size of IPI buffer needed
+ *
+ * Allocates and maps a buffer suitable for IPI communication.
+ * Returns virtual address of mapped buffer, or NULL on failure.
+ */ +static void *multikernel_alloc_ipi_buffer(void *pool_handle, size_t buffer_size) +{ + phys_addr_t phys_addr; + void *virt_addr; + + phys_addr = multikernel_instance_alloc(pool_handle, buffer_size, PAGE_SIZE); + if (!phys_addr) { + pr_err("Failed to allocate %zu bytes for IPI buffer\n", buffer_size); + return NULL; + } + + /* Map to virtual address space */ + virt_addr = memremap(phys_addr, buffer_size, MEMREMAP_WB); + if (!virt_addr) { + pr_err("Failed to map IPI buffer at 0x%llx\n", (unsigned long long)phys_addr); + multikernel_instance_free(pool_handle, phys_addr, buffer_size); + return NULL; + } + + pr_debug("Allocated IPI buffer: phys=0x%llx, virt=%px, size=%zu\n", + (unsigned long long)phys_addr, virt_addr, buffer_size); + + return virt_addr; +} + +/** + * multikernel_free_ipi_buffer() - Free IPI communication buffer + * @pool_handle: Instance pool handle + * @virt_addr: Virtual address returned by multikernel_alloc_ipi_buffer() + * @buffer_size: Size of the buffer + * + * Unmaps and frees an IPI buffer back to the instance pool. + */ +static void multikernel_free_ipi_buffer(void *pool_handle, void *virt_addr, size_t buffer_size) +{ + phys_addr_t phys_addr; + + if (!virt_addr) + return; + + /* Convert virtual address back to physical */ + phys_addr = virt_to_phys(virt_addr); + + /* Unmap virtual address */ + memunmap(virt_addr); + + /* Free back to instance pool */ + multikernel_instance_free(pool_handle, phys_addr, buffer_size); + + pr_debug("Freed IPI buffer: phys=0x%llx, virt=%px, size=%zu\n", + (unsigned long long)phys_addr, virt_addr, buffer_size); +} + +static int __init mk_ipi_shared_setup(char *str) +{ + if (!str) + return -EINVAL; + + mk_ipi_shared_phys_addr = memparse(str, NULL); + if (!mk_ipi_shared_phys_addr) { + pr_err("Invalid multikernel IPI shared memory address: %s\n", str); + return -EINVAL; + } + + pr_info("Multikernel IPI shared memory address: 0x%llx\n", + (unsigned long long)mk_ipi_shared_phys_addr); + return 0; +} +early_param("mk_ipi_shared", mk_ipi_shared_setup); + +/** + * multikernel_ipi_init - Initialize multikernel IPI subsystem + * + * Sets up IPI handling infrastructure. + * - In spawned kernels: IPI buffer is mapped from boot parameter address + * Returns 0 on success, negative error code on failure + */ +static int __init multikernel_ipi_init(void) +{ + /* Check if we're in a spawned kernel with IPI shared memory address */ + if (mk_ipi_shared_phys_addr) { + /* Spawned kernel: Map the shared IPI memory */ + mk_this_kernel_ipi_data = memremap(mk_ipi_shared_phys_addr, + sizeof(struct mk_shared_data), + MEMREMAP_WB); + if (!mk_this_kernel_ipi_data) { + pr_err("Failed to map multikernel IPI shared memory at 0x%llx\n", + (unsigned long long)mk_ipi_shared_phys_addr); + return -ENOMEM; + } + + pr_info("Multikernel IPI subsystem initialized (spawned kernel): virt=%px, phys=0x%llx\n", + mk_this_kernel_ipi_data, (unsigned long long)mk_ipi_shared_phys_addr); + } + + return 0; +} +subsys_initcall(multikernel_ipi_init); + +/* ---- Flexible shared memory APIs (PFN-based) ---- */ +#define MK_PFN_IPI_TYPE 0x80000001U + +/* Send a PFN to another kernel via mk_ipi_data */ +int mk_send_pfn(int instance_id, unsigned long pfn) +{ + return multikernel_send_ipi_data(instance_id, &pfn, sizeof(pfn), MK_PFN_IPI_TYPE); +} + +/* Receive a PFN from mk_ipi_data. Caller must check type. 
*/ +int mk_receive_pfn(struct mk_ipi_data *data, unsigned long *out_pfn) +{ + if (!data || !out_pfn) + return -EINVAL; + if (data->type != MK_PFN_IPI_TYPE || data->data_size != sizeof(unsigned long)) + return -EINVAL; + *out_pfn = *(unsigned long *)data->buffer; + return 0; +} + +void *mk_receive_map_page(struct mk_ipi_data *data) +{ + unsigned long pfn; + int ret; + + ret = mk_receive_pfn(data, &pfn); + if (ret < 0) + return NULL; + return memremap(pfn << PAGE_SHIFT, PAGE_SIZE, MEMREMAP_WB); +} -- 2.34.1 From: Cong Wang Introduce a structured messaging system built on top of the existing multikernel IPI infrastructure to enable reliable communication between kernel instances running on different CPUs. The messaging layer provides: * Simple message format with type/subtype hierarchy for extensibility * Support for I/O interrupt forwarding between kernel instances * Resource management messages for CPU and memory hotplug operations * Type-safe payload structures with validation * Handler registration system for message processing * Convenient inline functions for common operations Message types include: - MK_MSG_IO: I/O interrupt forwarding and load balancing - MK_MSG_RESOURCE: CPU/memory add/remove operations - MK_MSG_SYSTEM: System-level coordination messages - MK_MSG_USER: User-defined message types The implementation leverages the reliable nature of intra-machine IPIs, maintaining simplicity and performance. Messages are limited to the existing 256-byte IPI buffer size, with larger data transfers handled via the existing PFN-based shared memory mechanism. This messaging foundation enables sophisticated multikernel coordination scenarios including dynamic resource allocation, interrupt load balancing, and system-wide state management. Signed-off-by: Cong Wang --- include/linux/multikernel.h | 200 ++++++++++++++++++++++++ kernel/multikernel/Makefile | 2 +- kernel/multikernel/core.c | 7 + kernel/multikernel/messaging.c | 278 +++++++++++++++++++++++++++++++++ 4 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 kernel/multikernel/messaging.c diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h index ee96bd2332b6..3bc07361145b 100644 --- a/include/linux/multikernel.h +++ b/include/linux/multikernel.h @@ -80,6 +80,206 @@ void *mk_receive_map_page(struct mk_ipi_data *data); #define mk_receive_unmap_page(p) memunmap(p) +/* + * Multikernel Messaging System + */ + +/** + * Message type definitions - organized by category + */ + +/* Top-level message categories */ +#define MK_MSG_IO 0x1000 +#define MK_MSG_RESOURCE 0x2000 +#define MK_MSG_SYSTEM 0x3000 +#define MK_MSG_USER 0x4000 + +/* I/O interrupt forwarding subtypes */ +#define MK_IO_IRQ_FORWARD (MK_MSG_IO + 1) +#define MK_IO_IRQ_BALANCE (MK_MSG_IO + 2) +#define MK_IO_IRQ_MASK (MK_MSG_IO + 3) +#define MK_IO_IRQ_UNMASK (MK_MSG_IO + 4) + +/* Resource management subtypes */ +#define MK_RES_CPU_ADD (MK_MSG_RESOURCE + 1) +#define MK_RES_CPU_REMOVE (MK_MSG_RESOURCE + 2) +#define MK_RES_MEM_ADD (MK_MSG_RESOURCE + 3) +#define MK_RES_MEM_REMOVE (MK_MSG_RESOURCE + 4) +#define MK_RES_QUERY (MK_MSG_RESOURCE + 5) + +/* System management subtypes */ +#define MK_SYS_HEARTBEAT (MK_MSG_SYSTEM + 1) +#define MK_SYS_SHUTDOWN (MK_MSG_SYSTEM + 2) + +/** + * Core message structure + */ +struct mk_message { + u32 msg_type; /* Message type identifier */ + u32 msg_subtype; /* Subtype for specific operations */ + u64 msg_id; /* Optional message ID for correlation */ + u32 payload_len; /* Length of payload data */ + u8 payload[]; /* 
Variable payload (up to remaining IPI buffer) */ +}; + +/** + * Payload structures for specific message types + */ + +/* I/O interrupt forwarding */ +struct mk_io_irq_payload { + u32 irq_number; /* Hardware IRQ number */ + u32 vector; /* Interrupt vector */ + u32 device_id; /* Device identifier (optional) */ + u32 flags; /* Control flags (priority, etc.) */ +}; + +/* IRQ control flags */ +#define MK_IRQ_HIGH_PRIORITY 0x01 +#define MK_IRQ_LOW_LATENCY 0x02 +#define MK_IRQ_EDGE_TRIGGERED 0x04 +#define MK_IRQ_LEVEL_TRIGGERED 0x08 + +/* CPU resource operations */ +struct mk_cpu_resource_payload { + u32 cpu_id; /* Physical CPU ID */ + u32 numa_node; /* NUMA node (optional) */ + u32 flags; /* CPU capabilities/attributes */ +}; + +/* CPU capability flags */ +#define MK_CPU_HAS_AVX512 0x01 +#define MK_CPU_HAS_TSX 0x02 +#define MK_CPU_HYPERTHREAD 0x04 + +/* Memory resource operations */ +struct mk_mem_resource_payload { + u64 start_pfn; /* Starting page frame number */ + u64 nr_pages; /* Number of pages */ + u32 numa_node; /* NUMA node */ + u32 mem_type; /* Memory type (normal/DMA/etc.) */ +}; + +/* Memory types */ +#define MK_MEM_NORMAL 0x01 +#define MK_MEM_DMA 0x02 +#define MK_MEM_DMA32 0x04 +#define MK_MEM_HIGHMEM 0x08 + +/** + * Message handler callback type + */ +typedef void (*mk_msg_handler_t)(u32 msg_type, u32 subtype, + void *payload, u32 payload_len, void *ctx); + +/** + * Message API functions + */ + +/** + * mk_send_message - Send a message to another CPU + * @instance_id: Target multikernel instance ID + * @msg_type: Message type identifier + * @subtype: Message subtype + * @payload: Pointer to payload data (can be NULL) + * @payload_len: Length of payload data + * + * Returns 0 on success, negative error code on failure + */ +int mk_send_message(int instance_id, u32 msg_type, u32 subtype, + void *payload, u32 payload_len); + +/** + * mk_register_msg_handler - Register handler for specific message type + * @msg_type: Message type to handle + * @handler: Handler function + * @ctx: Context pointer passed to handler + * + * Returns 0 on success, negative error code on failure + */ +int mk_register_msg_handler(u32 msg_type, mk_msg_handler_t handler, void *ctx); + +/** + * mk_unregister_msg_handler - Unregister message handler + * @msg_type: Message type to unregister + * @handler: Handler function to remove + * + * Returns 0 on success, negative error code on failure + */ +int mk_unregister_msg_handler(u32 msg_type, mk_msg_handler_t handler); + +/** + * Convenience functions for common message types + */ + +/* I/O interrupt forwarding */ +static inline int mk_send_irq_forward(int instance_id, u32 irq_number, + u32 vector, u32 device_id, u32 flags) +{ + struct mk_io_irq_payload payload = { + .irq_number = irq_number, + .vector = vector, + .device_id = device_id, + .flags = flags + }; + return mk_send_message(instance_id, MK_MSG_IO, MK_IO_IRQ_FORWARD, + &payload, sizeof(payload)); +} + +/* CPU resource management */ +static inline int mk_send_cpu_add(int instance_id, u32 cpu_id, + u32 numa_node, u32 flags) +{ + struct mk_cpu_resource_payload payload = { + .cpu_id = cpu_id, + .numa_node = numa_node, + .flags = flags + }; + return mk_send_message(instance_id, MK_MSG_RESOURCE, MK_RES_CPU_ADD, + &payload, sizeof(payload)); +} + +static inline int mk_send_cpu_remove(int instance_id, u32 cpu_id) +{ + struct mk_cpu_resource_payload payload = { + .cpu_id = cpu_id, + .numa_node = 0, + .flags = 0 + }; + return mk_send_message(instance_id, MK_MSG_RESOURCE, MK_RES_CPU_REMOVE, + &payload, 
sizeof(payload)); +} + +/* Memory resource management */ +static inline int mk_send_mem_add(int instance_id, u64 start_pfn, u64 nr_pages, + u32 numa_node, u32 mem_type) +{ + struct mk_mem_resource_payload payload = { + .start_pfn = start_pfn, + .nr_pages = nr_pages, + .numa_node = numa_node, + .mem_type = mem_type + }; + return mk_send_message(instance_id, MK_MSG_RESOURCE, MK_RES_MEM_ADD, + &payload, sizeof(payload)); +} + +static inline int mk_send_mem_remove(int instance_id, u64 start_pfn, u64 nr_pages) +{ + struct mk_mem_resource_payload payload = { + .start_pfn = start_pfn, + .nr_pages = nr_pages, + .numa_node = 0, + .mem_type = 0 + }; + return mk_send_message(instance_id, MK_MSG_RESOURCE, MK_RES_MEM_REMOVE, + &payload, sizeof(payload)); +} + +/* Messaging system functions */ +int __init mk_messaging_init(void); +void mk_messaging_cleanup(void); + struct resource; extern phys_addr_t multikernel_alloc(size_t size); diff --git a/kernel/multikernel/Makefile b/kernel/multikernel/Makefile index b539acc656c6..f133e1eaf534 100644 --- a/kernel/multikernel/Makefile +++ b/kernel/multikernel/Makefile @@ -3,7 +3,7 @@ # Makefile for multikernel support # -obj-y += core.o mem.o kernfs.o dts.o ipi.o +obj-y += core.o mem.o kernfs.o dts.o ipi.o messaging.o # Add libfdt include path for device tree parsing CFLAGS_dts.o = -I $(srctree)/scripts/dtc/libfdt diff --git a/kernel/multikernel/core.c b/kernel/multikernel/core.c index ee7a21327ea5..37dbf0cf4be6 100644 --- a/kernel/multikernel/core.c +++ b/kernel/multikernel/core.c @@ -505,9 +505,16 @@ static int __init multikernel_init(void) { int ret; + ret = mk_messaging_init(); + if (ret < 0) { + pr_err("Failed to initialize multikernel messaging: %d\n", ret); + return ret; + } + ret = mk_kernfs_init(); if (ret < 0) { pr_err("Failed to initialize multikernel sysfs interface: %d\n", ret); + mk_messaging_cleanup(); return ret; } diff --git a/kernel/multikernel/messaging.c b/kernel/multikernel/messaging.c new file mode 100644 index 000000000000..be1fba8778ec --- /dev/null +++ b/kernel/multikernel/messaging.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Multikernel Messaging System + * Copyright (C) 2025 Multikernel Technologies, Inc. 
All rights reserved + * + * Simple messaging layer on top of multikernel IPI infrastructure + */ + +#include +#include +#include +#include + +/* Per-type message handler registry */ +struct mk_msg_type_handler { + u32 msg_type; + struct mk_ipi_handler *ipi_handler; + mk_msg_handler_t msg_handler; + void *context; + struct mk_msg_type_handler *next; +}; + +static struct mk_msg_type_handler *mk_msg_type_handlers; +static raw_spinlock_t mk_msg_type_handlers_lock = __RAW_SPIN_LOCK_UNLOCKED(mk_msg_type_handlers_lock); + +/** + * mk_message_type_ipi_callback - IPI callback to handle incoming messages for a specific type + * @data: IPI data containing the message + * @ctx: Context containing the message handler info + */ +static void mk_message_type_ipi_callback(struct mk_ipi_data *data, void *ctx) +{ + struct mk_msg_type_handler *type_handler = (struct mk_msg_type_handler *)ctx; + struct mk_message *msg; + u32 msg_type, msg_subtype; + void *payload; + u32 payload_len; + + if (!type_handler || !type_handler->msg_handler) { + pr_warn("Multikernel message received but no handler registered\n"); + return; + } + + /* Verify this matches our expected message type */ + if (data->type != type_handler->msg_type) { + pr_warn("Multikernel message type mismatch: expected 0x%x, got 0x%x\n", + type_handler->msg_type, data->type); + return; + } + + /* Ensure we have at least a message header */ + if (data->data_size < sizeof(struct mk_message)) { + pr_warn("Multikernel message too small: %zu bytes\n", data->data_size); + return; + } + + msg = (struct mk_message *)data->buffer; + + /* Validate message structure */ + if (msg->payload_len > (data->data_size - sizeof(struct mk_message))) { + pr_warn("Multikernel message payload length invalid: %u > %zu\n", + msg->payload_len, data->data_size - sizeof(struct mk_message)); + return; + } + + msg_type = msg->msg_type; + msg_subtype = msg->msg_subtype; + payload = msg->payload_len > 0 ? 
msg->payload : NULL;
+	payload_len = msg->payload_len;
+
+	pr_debug("Multikernel message received: type=0x%x, subtype=0x%x, len=%u from CPU %d\n",
+		 msg_type, msg_subtype, payload_len, data->sender_cpu);
+
+	/* Call the registered handler for this message type */
+	type_handler->msg_handler(msg_type, msg_subtype, payload, payload_len, type_handler->context);
+}
+
+/**
+ * mk_send_message - Send a message to another kernel instance
+ * @instance_id: Target multikernel instance ID
+ * @msg_type: Message type identifier
+ * @subtype: Message subtype
+ * @payload: Pointer to payload data (can be NULL)
+ * @payload_len: Length of payload data
+ *
+ * Returns 0 on success, negative error code on failure
+ */
+int mk_send_message(int instance_id, u32 msg_type, u32 subtype,
+		    void *payload, u32 payload_len)
+{
+	struct mk_message *msg;
+	size_t total_size;
+	int ret;
+
+	/* Calculate total message size */
+	total_size = sizeof(struct mk_message) + payload_len;
+
+	/* Check if message fits in IPI buffer */
+	if (total_size > MK_MAX_DATA_SIZE) {
+		pr_err("Multikernel message too large: %zu > %d bytes\n",
+		       total_size, MK_MAX_DATA_SIZE);
+		return -EMSGSIZE;
+	}
+
+	/* Allocate temporary buffer for message */
+	msg = kzalloc(total_size, GFP_ATOMIC);
+	if (!msg)
+		return -ENOMEM;
+
+	/* Fill in message header */
+	msg->msg_type = msg_type;
+	msg->msg_subtype = subtype;
+	msg->msg_id = 0; /* Could be enhanced with unique IDs later */
+	msg->payload_len = payload_len;
+
+	/* Copy payload if provided */
+	if (payload && payload_len > 0)
+		memcpy(msg->payload, payload, payload_len);
+
+	/* Send via IPI using the message type as IPI type */
+	ret = multikernel_send_ipi_data(instance_id, msg, total_size, msg_type);
+
+	/* Clean up temporary buffer */
+	kfree(msg);
+
+	if (ret < 0) {
+		pr_err("Failed to send multikernel message: %d\n", ret);
+		return ret;
+	}
+
+	pr_debug("Multikernel message sent: type=0x%x, subtype=0x%x, len=%u to instance %d\n",
+		 msg_type, subtype, payload_len, instance_id);
+
+	return 0;
+}
+EXPORT_SYMBOL(mk_send_message);
+
+/**
+ * mk_register_msg_handler - Register handler for specific message type
+ * @msg_type: Message type to handle
+ * @handler: Handler function
+ * @ctx: Context pointer passed to handler
+ *
+ * Returns 0 on success, negative error code on failure
+ */
+int mk_register_msg_handler(u32 msg_type, mk_msg_handler_t handler, void *ctx)
+{
+	struct mk_msg_type_handler *type_handler, *existing;
+	unsigned long flags;
+
+	if (!handler)
+		return -EINVAL;
+
+	/* Fast-fail if a handler for this type already exists */
+	raw_spin_lock_irqsave(&mk_msg_type_handlers_lock, flags);
+	for (type_handler = mk_msg_type_handlers; type_handler; type_handler = type_handler->next) {
+		if (type_handler->msg_type == msg_type) {
+			raw_spin_unlock_irqrestore(&mk_msg_type_handlers_lock, flags);
+			pr_warn("Handler for message type 0x%x already registered\n", msg_type);
+			return -EEXIST;
+		}
+	}
+	raw_spin_unlock_irqrestore(&mk_msg_type_handlers_lock, flags);
+
+	/* Allocate new type handler entry */
+	type_handler = kzalloc(sizeof(*type_handler), GFP_KERNEL);
+	if (!type_handler)
+		return -ENOMEM;
+
+	type_handler->msg_type = msg_type;
+	type_handler->msg_handler = handler;
+	type_handler->context = ctx;
+
+	/* Register IPI handler for this message type */
+	type_handler->ipi_handler = multikernel_register_handler(mk_message_type_ipi_callback,
+								 type_handler, msg_type);
+	if (!type_handler->ipi_handler) {
+		pr_err("Failed to register IPI handler for message type 0x%x\n", msg_type);
+		kfree(type_handler);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Add to the type handler list, re-checking for a duplicate that may
+	 * have been registered while the lock was dropped above.
+	 */
+	raw_spin_lock_irqsave(&mk_msg_type_handlers_lock, flags);
+	for (existing = mk_msg_type_handlers; existing; existing = existing->next) {
+		if (existing->msg_type == msg_type) {
+			raw_spin_unlock_irqrestore(&mk_msg_type_handlers_lock, flags);
+			multikernel_unregister_handler(type_handler->ipi_handler);
+			kfree(type_handler);
+			pr_warn("Handler for message type 0x%x already registered\n", msg_type);
+			return -EEXIST;
+		}
+	}
+	type_handler->next = mk_msg_type_handlers;
+	mk_msg_type_handlers = type_handler;
+	raw_spin_unlock_irqrestore(&mk_msg_type_handlers_lock, flags);
+
+	pr_debug("Registered multikernel message handler for type 0x%x\n", msg_type);
+	return 0;
+}
+EXPORT_SYMBOL(mk_register_msg_handler);
+
+/**
+ * mk_unregister_msg_handler - Unregister message handler
+ * @msg_type: Message type to unregister
+ * @handler: Handler function to remove
+ *
+ * Returns 0 on success, negative error code on failure
+ */
+int mk_unregister_msg_handler(u32 msg_type, mk_msg_handler_t handler)
+{
+	struct mk_msg_type_handler **pp, *type_handler;
+	unsigned long flags;
+	int found = 0;
+
+	if (!handler)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&mk_msg_type_handlers_lock, flags);
+	pp = &mk_msg_type_handlers;
+	while ((type_handler = *pp) != NULL) {
+		if (type_handler->msg_type == msg_type && type_handler->msg_handler == handler) {
+			*pp = type_handler->next;
+			found = 1;
+			break;
+		}
+		pp = &type_handler->next;
+	}
+	raw_spin_unlock_irqrestore(&mk_msg_type_handlers_lock, flags);
+
+	if (found) {
+		/* Unregister the IPI handler */
+		if (type_handler->ipi_handler)
+			multikernel_unregister_handler(type_handler->ipi_handler);
+		kfree(type_handler);
+		pr_debug("Unregistered multikernel message handler for type 0x%x\n", msg_type);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(mk_unregister_msg_handler);
+
+/**
+ * mk_messaging_init - Initialize the messaging system
+ *
+ * Called during multikernel initialization to set up message handling.
+ * Returns 0 on success, negative error code on failure
+ */
+int __init mk_messaging_init(void)
+{
+	/* No global IPI handler needed anymore - handlers are registered per message type */
+	pr_info("Multikernel messaging system initialized\n");
+	return 0;
+}
+
+/**
+ * mk_messaging_cleanup - Cleanup the messaging system
+ *
+ * Called during multikernel cleanup
+ */
+void mk_messaging_cleanup(void)
+{
+	struct mk_msg_type_handler *type_handler, *next;
+	unsigned long flags;
+
+	/* Clean up all registered message type handlers */
+	raw_spin_lock_irqsave(&mk_msg_type_handlers_lock, flags);
+	type_handler = mk_msg_type_handlers;
+	mk_msg_type_handlers = NULL;
+	raw_spin_unlock_irqrestore(&mk_msg_type_handlers_lock, flags);
+
+	while (type_handler) {
+		next = type_handler->next;
+
+		/* Unregister IPI handler */
+		if (type_handler->ipi_handler)
+			multikernel_unregister_handler(type_handler->ipi_handler);
+
+		kfree(type_handler);
+		type_handler = next;
+	}
+
+	pr_info("Multikernel messaging system cleaned up\n");
+}
-- 
2.34.1

From: Cong Wang

This commit introduces:

* Multikernel flag support in kexec_file_load by adding KEXEC_MULTIKERNEL
  to KEXEC_FILE_FLAGS, enabling user-space to specify multikernel
  operations through the file-based kexec interface with proper flag
  validation and ID extraction.

* Instance-based memory allocation for multikernel images through
  kexec_alloc_multikernel(), which allocates kernel segments from
  instance-specific memory pools rather than system memory, ensuring
  compliance with device tree resource specifications.

* Multikernel control page allocation via
  kimage_alloc_multikernel_control_pages(), which provides page-aligned
  control structures from instance pools with proper alignment validation
  and conflict detection against existing segments.
* Enhanced kimage_file_alloc_init() with multikernel instance association
  that extracts multikernel IDs from kexec flags, validates instance
  availability, establishes bidirectional cross-references, and updates
  instance states to LOADING during the load process.

* Integrated memory hole location in kexec_locate_mem_hole() that
  prioritizes multikernel instance pool allocation over system memory
  allocation, ensuring multikernel segments respect reserved memory
  boundaries and resource isolation.

The integration maintains compatibility with existing kexec_file_load()
use cases, such as the crash kernel, while extending the interface for
the multikernel case. Standard kexec operations continue to use system
memory allocation, while multikernel operations automatically use
instance-specific pools when the KEXEC_MULTIKERNEL flag is specified.

This enables user-space tools to load multikernel images using the more
secure and flexible kexec_file_load interface rather than the legacy
kexec_load syscall, providing better integration with modern security
frameworks and signed kernel verification.

Signed-off-by: Cong Wang
---
 include/linux/kexec.h |   3 +-
 kernel/kexec_core.c   |  61 ++++++++++++++++++++++
 kernel/kexec_file.c   | 116 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 5e9e9ad1dfeb..b907b7a92fd2 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -477,7 +477,8 @@ bool kexec_load_permitted(int kexec_image_type);
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
 				 KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \
-				 KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB)
+				 KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB | \
+				 KEXEC_MULTIKERNEL)
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 7db755e64dd6..61ad01acd034 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -476,6 +476,64 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 }
 #endif
 
+static struct page *kimage_alloc_multikernel_control_pages(struct kimage *image,
+							   unsigned int order)
+{
+	/*
+	 * Control pages for multikernel must be allocated from the instance's
+	 * memory pool to ensure they stay within the reserved memory regions
+	 * specified in the device tree configuration.
+	 *
+	 * We use mk_kimage_alloc() to get memory from the instance pool,
+	 * then convert it to page structures.
+ */ + void *virt_addr; + phys_addr_t phys_addr; + struct page *pages; + unsigned long size; + unsigned int count; + + if (!image->mk_instance) { + pr_err("Multikernel image has no associated instance\n"); + return NULL; + } + + count = 1 << order; + size = count << PAGE_SHIFT; + + /* Allocate from the multikernel instance pool (page aligned) */ + virt_addr = mk_kimage_alloc(image, size, PAGE_SIZE); + if (!virt_addr) { + pr_debug("Failed to allocate %lu bytes for multikernel control pages\n", size); + return NULL; + } + + /* Convert virtual address to physical */ + phys_addr = virt_to_phys(virt_addr); + + /* Check alignment requirements - control pages need page alignment */ + if (!IS_ALIGNED(phys_addr, PAGE_SIZE)) { + pr_err("Multikernel control page allocation not page-aligned: phys=0x%llx\n", + (unsigned long long)phys_addr); + mk_kimage_free(image, virt_addr, size); + return NULL; + } + + /* Get the page structure */ + pages = virt_to_page(virt_addr); + + /* Check for conflicts with existing segments */ + if (kimage_is_destination_range(image, phys_addr, phys_addr + size - 1)) { + pr_debug("Multikernel control pages conflict with existing segments: 0x%llx+0x%lx\n", + (unsigned long long)phys_addr, size); + mk_kimage_free(image, virt_addr, size); + return NULL; + } + + pr_debug("Allocated multikernel control pages: order=%u, phys=0x%llx, virt=%px\n", + order, (unsigned long long)phys_addr, virt_addr); + + return pages; +} struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) @@ -491,6 +549,9 @@ struct page *kimage_alloc_control_pages(struct kimage *image, pages = kimage_alloc_crash_control_pages(image, order); break; #endif + case KEXEC_TYPE_MULTIKERNEL: + pages = kimage_alloc_multikernel_control_pages(image, order); + break; } return pages; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2d9d5626c8da..f9979c1d9f9e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -309,6 +310,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, int ret; struct kimage *image; bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + bool multikernel_load = flags & KEXEC_MULTIKERNEL; image = do_kimage_alloc_init(); if (!image) @@ -322,8 +324,50 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, /* Enable special crash kernel control page alloc policy. 
*/ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; - } + } else #endif + if (multikernel_load) { + struct mk_instance *instance; + int mk_id = KEXEC_GET_MK_ID(flags); + + /* Set multikernel image type for proper memory allocation */ + image->type = KEXEC_TYPE_MULTIKERNEL; + + pr_info("kexec_file_load: multikernel load - flags=0x%lx, extracted mk_id=%d\n", + flags, mk_id); + + if (mk_id <= 0) { + pr_err("Invalid multikernel ID %d in flags\n", mk_id); + ret = -EINVAL; + goto out_free_image; + } + + /* Find the existing mk_instance */ + instance = mk_instance_find(mk_id); + if (!instance) { + pr_err("No multikernel instance found with ID %d\n", mk_id); + ret = -ENOENT; + goto out_free_image; + } + + /* Check if instance is already associated with a kimage */ + if (instance->kimage) { + pr_err("Multikernel instance %d already has an associated kimage\n", mk_id); + mk_instance_put(instance); + ret = -EBUSY; + goto out_free_image; + } + + /* Establish cross-references */ + image->mk_instance = instance; /* Transfer reference from find */ + image->mk_id = mk_id; + instance->kimage = image; + + /* Update instance state */ + mk_instance_set_state(instance, MK_STATE_LOADING); + + pr_info("Associated kimage with multikernel instance %d\n", mk_id); + } ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -731,6 +775,61 @@ static int kexec_alloc_contig(struct kexec_buf *kbuf) return 0; } +static int kexec_alloc_multikernel(struct kexec_buf *kbuf) +{ + void *virt_addr; + phys_addr_t phys_addr; + + pr_info("kexec_alloc_multikernel: called for segment size=0x%lx, buf_min=0x%lx, buf_max=0x%lx, align=0x%lx\n", + kbuf->memsz, kbuf->buf_min, kbuf->buf_max, kbuf->buf_align); + + /* Check if this is a multikernel image with an associated instance */ + if (!kbuf->image->mk_instance || kbuf->image->type != KEXEC_TYPE_MULTIKERNEL) { + pr_info("kexec_alloc_multikernel: not a multikernel image (mk_instance=%p, type=%d)\n", + kbuf->image->mk_instance, kbuf->image->type); + return -EPERM; + } + + /* Allocate from the multikernel instance pool using the proper API */ + virt_addr = mk_kimage_alloc(kbuf->image, kbuf->memsz, kbuf->buf_align); + if (!virt_addr) { + pr_info("Failed to allocate %lu bytes from multikernel instance pool (align=0x%lx)\n", + kbuf->memsz, kbuf->buf_align); + return -ENOMEM; + } + + /* Convert virtual address to physical */ + phys_addr = virt_to_phys(virt_addr); + + if (!IS_ALIGNED(phys_addr, kbuf->buf_align)) { + pr_info("Multikernel allocation not aligned: phys=0x%llx, required=0x%lx\n", + (unsigned long long)phys_addr, kbuf->buf_align); + mk_kimage_free(kbuf->image, virt_addr, kbuf->memsz); + return -ENOMEM; + } + + if (phys_addr < kbuf->buf_min || (phys_addr + kbuf->memsz - 1) > kbuf->buf_max) { + pr_info("Multikernel allocation out of bounds: phys=0x%llx, min=0x%lx, max=0x%lx\n", + (unsigned long long)phys_addr, kbuf->buf_min, kbuf->buf_max); + mk_kimage_free(kbuf->image, virt_addr, kbuf->memsz); + return -ENOMEM; + } + + if (kimage_is_destination_range(kbuf->image, phys_addr, phys_addr + kbuf->memsz - 1)) { + pr_info("Multikernel allocation conflicts with existing segments: 0x%llx+0x%lx\n", + (unsigned long long)phys_addr, kbuf->memsz); + mk_kimage_free(kbuf->image, virt_addr, kbuf->memsz); + return -EBUSY; + } + + kbuf->mem = phys_addr; + + pr_info("Allocated %lu bytes from multikernel pool at 0x%llx (virt=%px)\n", + kbuf->memsz, (unsigned long long)phys_addr, virt_addr); + + return 0; +} + /** * kexec_locate_mem_hole - 
find free memory for the purgatory or the next kernel
 * @kbuf: Parameters for the memory search.
@@ -743,8 +842,21 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 {
 	int ret;
 
+	pr_info("kexec_locate_mem_hole: called for segment size=0x%lx, mem=0x%lx, image_type=%d\n",
+		kbuf->memsz, kbuf->mem, kbuf->image->type);
+
 	/* Arch knows where to place */
-	if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
+	if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) {
+		pr_info("kexec_locate_mem_hole: memory already specified (0x%lx), skipping allocation\n", kbuf->mem);
+		return 0;
+	}
+
+	/*
+	 * If this is a multikernel image, try to allocate from the instance's
+	 * memory pool first. This ensures multikernel segments use pre-reserved
+	 * memory from the device tree configuration and respect the instance's
+	 * pool management.
+	 */
+	if (!kexec_alloc_multikernel(kbuf))
 		return 0;
 
 	/*
-- 
2.34.1

From: Cong Wang

This commit introduces:

* Multikernel-specific KHO handling in arch/x86 that differentiates
  between regular KHO operations (requiring both FDT and scratch areas)
  and multikernel operations (requiring only the FDT), with conditional
  setup data generation and proper boot parameter configuration.

* Enhanced KHO framework with a multikernel-aware notifier chain
  (mk_kexec_register_notifier/mk_kexec_unregister_notifier) that provides
  dedicated callbacks for multikernel DTB preservation without the
  overhead of the full KHO serialization infrastructure.

* Multikernel KHO finalization through mk_kexec_finalize(), which creates
  minimal FDT structures with 'multikernel-v1' compatibility and calls
  specialized notifiers to preserve target instance DTBs in shared memory
  for seamless handover to spawned kernels.

* DTB restoration infrastructure (mk_kho_restore_dtbs()) that enables
  spawned kernels to automatically recover their device tree
  configurations from KHO shared memory during early boot, recreating
  instance structures with proper resource allocation.

* Integration with the existing kexec_file_load workflow through automatic
  FDT page allocation for multikernel images and proper cleanup handling
  when images are freed, ensuring resource lifecycle management across
  kexec boundaries.

The KHO integration maintains full compatibility with the existing KHO
code path while extending it for the multikernel use case. Regular kexec
operations continue to use the standard KHO scratch area and
serialization, while the multikernel case uses the streamlined DTB-only
protocol.

This enables seamless multikernel instance migration across kernel
boundaries, allowing spawned kernels to automatically inherit their
device tree configurations and resource allocations without manual
intervention or complex boot parameter manipulation.
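For illustration, a client subsystem could hook the new notifier chain
roughly as follows. This is a minimal sketch, not part of the patch: the
"example-subsys" node and all example_* names are hypothetical, and the
local mk_kexec_data definition simply duplicates the anonymous struct
that mk_kexec_call_notifiers() passes down the chain, since the patch
does not yet export that layout in a shared header:

#include <linux/notifier.h>
#include <linux/libfdt.h>
#include <linux/kexec_handover.h>

/* Must match the layout built in mk_kexec_call_notifiers(). */
struct mk_kexec_data {
	struct kimage *image;
	void *fdt;
	int mk_id;
};

static int example_mk_notifier(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	struct mk_kexec_data *mk_data = data;
	int err = 0;

	/* Append a subsystem-specific node to the handover FDT. */
	err |= fdt_begin_node(mk_data->fdt, "example-subsys");
	err |= fdt_property_u32(mk_data->fdt, "instance-id", mk_data->mk_id);
	err |= fdt_end_node(mk_data->fdt);

	return err ? notifier_from_errno(err) : NOTIFY_OK;
}

static struct notifier_block example_mk_nb = {
	.notifier_call = example_mk_notifier,
};

static int __init example_init(void)
{
	return mk_kexec_register_notifier(&example_mk_nb);
}

Because the FDT handed to the chain is built with the libfdt sequential
write API and capped at one page, notifier callbacks should keep their
contributions small.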
Signed-off-by: Cong Wang --- arch/x86/kernel/kexec-bzimage64.c | 16 +- arch/x86/kernel/setup.c | 8 +- drivers/of/kexec.c | 20 +- include/linux/kexec_handover.h | 40 ++++ include/linux/multikernel.h | 34 +++ kernel/kexec_core.c | 11 + kernel/kexec_file.c | 9 + kernel/kexec_handover.c | 197 ++++++++++++++++- kernel/multikernel/Kconfig | 5 + kernel/multikernel/Makefile | 2 +- kernel/multikernel/core.c | 6 + kernel/multikernel/kho.c | 356 ++++++++++++++++++++++++++++++ 12 files changed, 691 insertions(+), 13 deletions(-) create mode 100644 kernel/multikernel/kho.c diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index c3244ac680d1..8ed2b49bf086 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -276,15 +276,23 @@ static void setup_kho(const struct kimage *image, struct boot_params *params, sd->type = SETUP_KEXEC_KHO; sd->len = sizeof(struct kho_data); - /* Only add if we have all KHO images in place */ - if (!image->kho.fdt || !image->kho.scratch) + if (image->type == KEXEC_TYPE_MULTIKERNEL) { + if (!image->kho.fdt) + return; + } else if (!image->kho.fdt || !image->kho.scratch) return; /* Add setup data */ kho->fdt_addr = image->kho.fdt; kho->fdt_size = PAGE_SIZE; - kho->scratch_addr = image->kho.scratch->mem; - kho->scratch_size = image->kho.scratch->bufsz; + if (image->type == KEXEC_TYPE_MULTIKERNEL) { + kho->scratch_addr = 0; + kho->scratch_size = 0; + } else { + kho->scratch_addr = image->kho.scratch->mem; + kho->scratch_size = image->kho.scratch->bufsz; + } + sd->next = params->hdr.setup_data; params->hdr.setup_data = params_load_addr + setup_data_offset; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8342c4e46bad..40e3cb19e06b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -467,7 +467,13 @@ static void __init add_kho(u64 phys_addr, u32 data_len) return; } - kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size); + if (kho->scratch_addr == 0 && kho->scratch_size == 0) { + pr_info("setup: detected multikernel KHO data\n"); + mk_kho_populate(kho->fdt_addr, kho->fdt_size); + } else { + pr_info("setup: detected regular KHO data\n"); + kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size); + } early_memunmap(kho, size); } diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 1ee2d31816ae..a27971be555c 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -280,13 +280,21 @@ static int kho_add_chosen(const struct kimage *image, void *fdt, int chosen_node if (ret && ret != -FDT_ERR_NOTFOUND) return ret; - if (!image->kho.fdt || !image->kho.scratch) + if (image->type == KEXEC_TYPE_MULTIKERNEL) { + if (!image->kho.fdt) + return 0; + } else if (!image->kho.fdt || !image->kho.scratch) return 0; fdt_mem = image->kho.fdt; fdt_len = PAGE_SIZE; - scratch_mem = image->kho.scratch->mem; - scratch_len = image->kho.scratch->bufsz; + if (image->type == KEXEC_TYPE_MULTIKERNEL) { + scratch_mem = 0; + scratch_len = 0; + } else { + scratch_mem = image->kho.scratch->mem; + scratch_len = image->kho.scratch->bufsz; + } pr_debug("Adding kho metadata to DT"); @@ -294,8 +302,10 @@ static int kho_add_chosen(const struct kimage *image, void *fdt, int chosen_node fdt_mem, fdt_len); if (ret) return ret; - ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,kho-scratch", - scratch_mem, scratch_len); + + if (scratch_mem && scratch_len) + ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,kho-scratch", + scratch_mem, scratch_len); 
#endif /* CONFIG_KEXEC_HANDOVER */ return ret; diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 25042c1d8d54..480c4510a9ed 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -64,9 +64,20 @@ int register_kho_notifier(struct notifier_block *nb); int unregister_kho_notifier(struct notifier_block *nb); void kho_memory_init(void); +int kho_get_target_mk_id(struct kho_serialization *ser); +struct kimage; void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len); +void mk_kho_populate(phys_addr_t fdt_phys, u64 fdt_len); + +/* Multikernel kexec notifier functions */ +int mk_kexec_register_notifier(struct notifier_block *nb); +int mk_kexec_unregister_notifier(struct notifier_block *nb); +int mk_kexec_finalize(struct kimage *target_image); + +/* KHO FDT access */ +phys_addr_t kho_get_fdt_phys(void); #else static inline bool kho_is_enabled(void) { @@ -121,6 +132,11 @@ static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) return -EOPNOTSUPP; } +static inline int kho_get_target_mk_id(struct kho_serialization *ser) +{ + return 0; +} + static inline int register_kho_notifier(struct notifier_block *nb) { return -EOPNOTSUPP; @@ -139,6 +155,30 @@ static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { } + +static inline void mk_kho_populate(phys_addr_t fdt_phys, u64 fdt_len) +{ +} + +static inline int mk_kexec_register_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline int mk_kexec_unregister_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline int mk_kexec_finalize(struct kimage *target_image) +{ + return -EOPNOTSUPP; +} + +static inline phys_addr_t kho_get_fdt_phys(void) +{ + return 0; +} #endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_HANDOVER_H */ diff --git a/include/linux/multikernel.h b/include/linux/multikernel.h index 3bc07361145b..c463940eb27c 100644 --- a/include/linux/multikernel.h +++ b/include/linux/multikernel.h @@ -622,4 +622,38 @@ static inline bool mk_is_resource_property(const char *prop_name) return false; } +/** + * KHO (Kexec HandOver) Integration Functions + * + * These functions provide KHO support for preserving and restoring + * multikernel instance device trees across kexec boundaries. + */ + +/** + * mk_kho_restore_dtbs() - Restore DTBs from KHO shared memory + * + * Called during multikernel initialization to restore DTBs that were + * preserved by the previous kernel via KHO. + * + * Returns: 0 on success, negative error code on failure + */ +int __init mk_kho_restore_dtbs(void); + +/** + * mk_kho_init() - Initialize KHO support for multikernel + * + * Registers the KHO notifier and attempts to restore DTBs from + * a previous KHO boot. + * + * Returns: 0 on success, negative error code on failure + */ +int __init mk_kho_init(void); + +/** + * mk_kho_cleanup() - Cleanup KHO support for multikernel + * + * Unregisters the KHO notifier. 
+ */ +void mk_kho_cleanup(void); + #endif /* _LINUX_MULTIKERNEL_H */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 61ad01acd034..c5ed3fd90ede 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -673,6 +674,10 @@ void kimage_free(struct kimage *image) image->mk_instance = NULL; pr_info("Freed multikernel ID %d\n", image->mk_id); } + if (image->kho.fdt && image->type == KEXEC_TYPE_MULTIKERNEL) { + put_page(phys_to_page(image->kho.fdt)); + image->kho.fdt = 0; + } #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { @@ -1591,6 +1596,12 @@ int multikernel_kexec_by_id(int mk_id) goto unlock; } + rc = mk_kexec_finalize(mk_image); + if (rc) + pr_warn("KHO finalization failed: %d\n", rc); + else + pr_info("KHO finalized for multikernel instance\n"); + pr_info("Using multikernel image with ID %d (entry point: 0x%lx) on CPU %d\n", mk_image->mk_id, mk_image->start, cpu); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9979c1d9f9e..44b42ae34ef2 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -329,6 +329,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (multikernel_load) { struct mk_instance *instance; int mk_id = KEXEC_GET_MK_ID(flags); + struct page *fdt_page; /* Set multikernel image type for proper memory allocation */ image->type = KEXEC_TYPE_MULTIKERNEL; @@ -367,6 +368,14 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, mk_instance_set_state(instance, MK_STATE_LOADING); pr_info("Associated kimage with multikernel instance %d\n", mk_id); + + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) { + pr_err("Failed to allocate FDT page for multikernel kimage\n"); + ret = -ENOMEM; + goto out_free_image; + } + image->kho.fdt = page_to_phys(fdt_page); } ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..70bfd4b27220 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -106,6 +107,8 @@ struct kho_serialization { struct kho_mem_track track; /* First chunk of serialized preserved memory map */ struct khoser_mem_chunk *preserved_mem_map; + /* Target multikernel instance ID for selective preservation */ + int target_mk_id; }; struct kho_out { @@ -1100,12 +1103,20 @@ static int kho_abort(void) return err; } -static int kho_finalize(void) +static int kho_finalize(struct kimage *target_image) { int err = 0; u64 *preserved_mem_map; void *fdt = page_to_virt(kho_out.ser.fdt); + if (target_image && target_image->mk_id > 0) { + kho_out.ser.target_mk_id = target_image->mk_id; + pr_info("KHO finalize: targeting multikernel instance %d\n", target_image->mk_id); + } else { + kho_out.ser.target_mk_id = 0; + pr_info("KHO finalize: no specific target (preserving all instances)\n"); + } + err |= fdt_create(fdt, PAGE_SIZE); err |= fdt_finish_reservemap(fdt); err |= fdt_begin_node(fdt, ""); @@ -1149,6 +1160,134 @@ static int kho_finalize(void) return err; } +static BLOCKING_NOTIFIER_HEAD(mk_kexec_notifier_chain); + +/** + * mk_kexec_call_notifiers - Call multikernel-specific notifiers + * @target_image: The multikernel kimage being executed + * @fdt: The FDT to populate with multikernel data + * + * This calls a dedicated notifier chain for multikernel that bypasses + * the complex KHO serialization system. 
+ * + * Returns: 0 on success, negative error code on failure + */ +static int mk_kexec_call_notifiers(struct kimage *target_image, void *fdt) +{ + struct mk_kexec_data { + struct kimage *image; + void *fdt; + int mk_id; + } data = { + .image = target_image, + .fdt = fdt, + .mk_id = target_image->mk_id + }; + + int ret = blocking_notifier_call_chain(&mk_kexec_notifier_chain, + 0, &data); + return notifier_to_errno(ret); +} + +/** + * mk_kexec_register_notifier - Register a multikernel kexec notifier + * @nb: The notifier block to register + * + * Returns: 0 on success, negative error code on failure + */ +int mk_kexec_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&mk_kexec_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(mk_kexec_register_notifier); + +/** + * mk_kexec_unregister_notifier - Unregister a multikernel kexec notifier + * @nb: The notifier block to unregister + * + * Returns: 0 on success, negative error code on failure + */ +int mk_kexec_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&mk_kexec_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(mk_kexec_unregister_notifier); + +/** + * mk_kexec_finalize - Finalize multikernel DTB for kexec + * @target_image: The multikernel kimage being executed + * + * This function creates a minimal FDT for multikernel and calls the + * multikernel notifier to preserve the target instance's DTB. + * It reuses the existing device tree infrastructure but avoids the + * complex KHO serialization system. + * + * Returns: 0 on success, negative error code on failure + */ +int mk_kexec_finalize(struct kimage *target_image) +{ + void *fdt; + int ret; + + if (target_image->mk_id <= 0) { + pr_warn("mk_kexec_finalize called without valid multikernel target\n"); + return -EINVAL; + } + + if (!target_image->kho.fdt) { + pr_err("No FDT page allocated for multikernel kimage\n"); + return -EINVAL; + } + + /* Use the pre-allocated FDT page from multikernel memory pool */ + fdt = phys_to_virt(target_image->kho.fdt); + + ret = fdt_create(fdt, PAGE_SIZE); + ret |= fdt_finish_reservemap(fdt); + ret |= fdt_begin_node(fdt, ""); + ret |= fdt_property_string(fdt, "compatible", MK_FDT_COMPATIBLE); + if (ret) { + pr_err("Failed to create FDT structure: %d\n", ret); + return ret; + } + + ret = mk_kexec_call_notifiers(target_image, fdt); + if (ret) { + pr_err("Multikernel notifier failed: %d\n", ret); + fdt_end_node(fdt); + fdt_finish(fdt); + return ret; + } + + ret = fdt_end_node(fdt); + ret |= fdt_finish(fdt); + if (ret) { + pr_err("Failed to finalize FDT: %d\n", ret); + return ret; + } + + if (fdt_totalsize(fdt) > PAGE_SIZE) { + pr_err("FDT size (%d bytes) exceeds allocated page size (%lu bytes)\n", + fdt_totalsize(fdt), PAGE_SIZE); + return -ENOSPC; + } + + pr_info("Finalized multikernel FDT for instance %d (size: %d bytes)\n", + target_image->mk_id, fdt_totalsize(fdt)); + return 0; +} + +/** + * kho_get_target_mk_id - Get target multikernel ID from KHO serialization + * @ser: KHO serialization structure + * + * Returns the target multikernel instance ID, or 0 if not set. + */ +int kho_get_target_mk_id(struct kho_serialization *ser) +{ + return ser ? 
ser->target_mk_id : 0; +} + static int kho_out_finalize_get(void *data, u64 *val) { mutex_lock(&kho_out.lock); @@ -1174,7 +1313,7 @@ static int kho_out_finalize_set(void *data, u64 _val) } if (val) - ret = kho_finalize(); + ret = kho_finalize(NULL); else ret = kho_abort(); @@ -1262,6 +1401,11 @@ static const void *kho_get_fdt(void) return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; } +phys_addr_t kho_get_fdt_phys(void) +{ + return kho_in.fdt_phys; +} + /** * is_kho_boot - check if current kernel was booted via KHO-enabled * kexec @@ -1474,6 +1618,55 @@ void __init kho_memory_init(void) } } +/** + * mk_kho_populate() - Populate multikernel KHO data during early boot + * @fdt_phys: Physical address of the multikernel FDT + * @fdt_len: Length of the FDT + * + * This function handles multikernel FDT revival during early boot. Unlike + * regular KHO, multikernel doesn't use scratch areas and has a different + * FDT format with 'multikernel-v1' compatibility. + */ +void __init mk_kho_populate(phys_addr_t fdt_phys, u64 fdt_len) +{ + void *fdt = NULL; + int err = 0; + + pr_info("Multikernel KHO: processing FDT at 0x%llx (size: %llu)\n", fdt_phys, fdt_len); + + /* Validate the input FDT */ + fdt = early_memremap(fdt_phys, fdt_len); + if (!fdt) { + pr_warn("Multikernel KHO: failed to memremap FDT (0x%llx)\n", fdt_phys); + goto out; + } + + err = fdt_check_header(fdt); + if (err) { + pr_warn("Multikernel KHO: handover FDT (0x%llx) is invalid: %d\n", + fdt_phys, err); + goto out; + } + + err = fdt_node_check_compatible(fdt, 0, MK_FDT_COMPATIBLE); + if (err) { + pr_warn("Multikernel KHO: handover FDT (0x%llx) is incompatible with 'multikernel-v1': %d\n", + fdt_phys, err); + goto out; + } + + kho_in.fdt_phys = fdt_phys; + kho_in.scratch_phys = 0; + + pr_info("Multikernel KHO: successfully populated FDT data\n"); + +out: + if (fdt) + early_memunmap(fdt, fdt_len); + if (err) + pr_warn("Multikernel KHO: disabling multikernel revival\n"); +} + void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { diff --git a/kernel/multikernel/Kconfig b/kernel/multikernel/Kconfig index a9582a4d0c54..1b0191beffa7 100644 --- a/kernel/multikernel/Kconfig +++ b/kernel/multikernel/Kconfig @@ -6,6 +6,7 @@ config MULTIKERNEL bool "Multikernel support" depends on KEXEC_CORE + depends on KEXEC_HANDOVER select LIBFDT help Enable multikernel support, which allows running multiple kernel @@ -17,5 +18,9 @@ config MULTIKERNEL - Device tree based resource specification - Memory pool management for kernel instances - Integration with kexec for kernel loading + - KHO (Kexec HandOver) support for seamless DTB transfer + + Requires KEXEC_HANDOVER for preserving device trees and instance + state across kernel boundaries. If unsure, say N. 
diff --git a/kernel/multikernel/Makefile b/kernel/multikernel/Makefile index f133e1eaf534..3d81a3233bba 100644 --- a/kernel/multikernel/Makefile +++ b/kernel/multikernel/Makefile @@ -3,7 +3,7 @@ # Makefile for multikernel support # -obj-y += core.o mem.o kernfs.o dts.o ipi.o messaging.o +obj-y += core.o mem.o kernfs.o dts.o ipi.o messaging.o kho.o # Add libfdt include path for device tree parsing CFLAGS_dts.o = -I $(srctree)/scripts/dtc/libfdt diff --git a/kernel/multikernel/core.c b/kernel/multikernel/core.c index 37dbf0cf4be6..95d27b3b15cc 100644 --- a/kernel/multikernel/core.c +++ b/kernel/multikernel/core.c @@ -518,6 +518,12 @@ static int __init multikernel_init(void) return ret; } + ret = mk_kho_init(); + if (ret < 0) { + pr_warn("Failed to initialize KHO support: %d\n", ret); + /* Continue without KHO support - this is not fatal */ + } + pr_info("Multikernel support initialized\n"); return 0; } diff --git a/kernel/multikernel/kho.c b/kernel/multikernel/kho.c new file mode 100644 index 000000000000..8f25b4d73fa1 --- /dev/null +++ b/kernel/multikernel/kho.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Multikernel Technologies, Inc. All rights reserved + * + * Multikernel KHO (Kexec HandOver) + * + * Provides KHO support for preserving and restoring multikernel instance + * device trees across kexec boundaries using shared memory. + */ + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KEXEC_HANDOVER +#include +#include +#include +#include "internal.h" + +#define PROP_SUB_FDT "fdt" +#endif + +#ifdef CONFIG_KEXEC_HANDOVER + +/** + * mk_kexec_notifier() - Multikernel kexec notifier callback for DTB preservation + * @nb: Notifier block + * @action: Notifier action (unused) + * @data: Multikernel kexec data + * + * Called by multikernel kexec subsystem during kexec to preserve multikernel DTBs + * in shared memory for the target kernel. This uses a simplified interface + * compared to the full KHO system. 
+ */
+static int mk_kexec_notifier(struct notifier_block *nb, unsigned long action, void *data)
+{
+	/* Must match the layout built in mk_kexec_call_notifiers(). */
+	struct mk_kexec_data {
+		struct kimage *image;
+		void *fdt;
+		int mk_id;
+	} *mk_data = data;
+
+	struct mk_instance *instance;
+	int ret = 0;
+
+	pr_info("Preserving multikernel DTB for instance %d\n", mk_data->mk_id);
+
+	/* Find the target multikernel instance */
+	instance = mk_instance_find(mk_data->mk_id);
+	if (!instance) {
+		pr_err("Target multikernel instance %d not found\n", mk_data->mk_id);
+		return NOTIFY_STOP;
+	}
+
+	if (!instance->dtb_data || instance->dtb_size == 0) {
+		pr_err("Target multikernel instance %d has no DTB data (was its device_tree file written?)\n", mk_data->mk_id);
+		mk_instance_put(instance);
+		return NOTIFY_STOP;
+	}
+
+	ret |= fdt_begin_node(mk_data->fdt, "multikernel");
+	ret |= fdt_property(mk_data->fdt, "dtb-data", instance->dtb_data, instance->dtb_size);
+	ret |= fdt_end_node(mk_data->fdt);
+
+	if (ret) {
+		pr_err("Failed to add DTB for instance %d to FDT: %d\n", mk_data->mk_id, ret);
+		mk_instance_put(instance);
+		return notifier_from_errno(ret);
+	}
+
+	pr_info("Preserved DTB for instance %d (%zu bytes)\n", mk_data->mk_id, instance->dtb_size);
+	mk_instance_put(instance);
+	return NOTIFY_OK;
+}
+
+/* Multikernel kexec notifier block */
+static struct notifier_block mk_kexec_nb = {
+	.notifier_call = mk_kexec_notifier,
+};
+
+/**
+ * mk_dt_extract_instance_info() - Extract instance ID and name from DTB
+ * @dtb_data: Device tree blob data
+ * @dtb_size: Size of DTB data
+ * @instance_id: Output parameter for instance ID
+ * @instance_name: Output parameter for instance name (points into
+ *                 @dtb_data; must not be freed by the caller)
+ *
+ * Parses the DTB to find the first instance in the instances node and
+ * extracts its ID and name.
+ *
+ * Returns: 0 on success, negative error code on failure
+ */
+static int mk_dt_extract_instance_info(const void *dtb_data, size_t dtb_size,
+				       int *instance_id, const char **instance_name)
+{
+	const void *fdt = dtb_data;
+	int instances_node, instance_node;
+	const fdt32_t *id_prop;
+	const char *name;
+
+	if (!dtb_data || !instance_id || !instance_name)
+		return -EINVAL;
+
+	/* Find /instances node */
+	instances_node = fdt_path_offset(fdt, "/instances");
+	if (instances_node < 0) {
+		pr_err("No /instances node found in device tree\n");
+		return -ENOENT;
+	}
+
+	/* Find the first instance child node */
+	instance_node = fdt_first_subnode(fdt, instances_node);
+	if (instance_node < 0) {
+		pr_err("No instance found in /instances node\n");
+		return -ENOENT;
+	}
+
+	/* Get the instance name (node name) */
+	name = fdt_get_name(fdt, instance_node, NULL);
+	if (!name) {
+		pr_err("Failed to get instance name\n");
+		return -EINVAL;
+	}
+
+	/* Get the instance ID property */
+	id_prop = fdt_getprop(fdt, instance_node, "id", NULL);
+	if (!id_prop) {
+		pr_err("No 'id' property found in instance '%s'\n", name);
+		return -ENOENT;
+	}
+
+	*instance_id = fdt32_to_cpu(*id_prop);
+	*instance_name = name;
+
+	return 0;
+}
+
+/**
+ * mk_kho_restore_dtbs() - Restore DTB from KHO shared memory
+ *
+ * Called during multikernel initialization in the spawned kernel to restore
+ * the single DTB that was preserved by the host kernel via KHO. The spawned
+ * kernel receives exactly one DTB and parses the instance ID from it.
+ *
+ * Returns: 0 on success, negative error code on failure
+ */
+int __init mk_kho_restore_dtbs(void)
+{
+	void *dtb_virt;
+	int dtb_len;
+	int ret;
+	struct mk_instance *instance;
+	struct mk_dt_config config;
+	int instance_id;
+	const char *instance_name;
+	const void *kho_fdt = NULL;
+	const void *dtb_data;
+	phys_addr_t fdt_phys;
+	int mk_node;
+
+	fdt_phys = kho_get_fdt_phys();
+	if (!fdt_phys) {
+		pr_info("No KHO FDT available for multikernel DTB restoration\n");
+		return 0;
+	}
+
+	pr_info("Restoring multikernel DTB from KHO (phys: 0x%llx)\n",
+		(unsigned long long)fdt_phys);
+
+	/* Map the FDT for early boot access */
+	kho_fdt = early_memremap(fdt_phys, PAGE_SIZE);
+	if (!kho_fdt) {
+		pr_err("Failed to map KHO FDT at 0x%llx\n", (unsigned long long)fdt_phys);
+		return -EFAULT;
+	}
+
+	mk_node = fdt_subnode_offset(kho_fdt, 0, "multikernel");
+	if (mk_node < 0) {
+		pr_info("No multikernel node found in KHO FDT\n");
+		ret = 0;
+		goto cleanup_fdt;
+	}
+
+	dtb_data = fdt_getprop(kho_fdt, mk_node, "dtb-data", &dtb_len);
+	if (!dtb_data || dtb_len <= 0) {
+		pr_info("No dtb-data property found in multikernel node\n");
+		ret = 0;
+		goto cleanup_fdt;
+	}
+
+	pr_info("Found preserved multikernel DTB (%d bytes)\n", dtb_len);
+
+	/* Validate DTB header */
+	ret = fdt_check_header(dtb_data);
+	if (ret) {
+		pr_err("Invalid DTB header from KHO: %d\n", ret);
+		ret = -EINVAL;
+		goto cleanup_fdt;
+	}
+
+	/*
+	 * The handover FDT is a single page, and only one page was mapped
+	 * above, so the embedded DTB cannot legitimately be any larger.
+	 */
+	if (dtb_len > PAGE_SIZE) {
+		pr_err("DTB size too large: %d bytes\n", dtb_len);
+		ret = -EINVAL;
+		goto cleanup_fdt;
+	}
+
+	dtb_virt = kmalloc(dtb_len, GFP_KERNEL);
+	if (!dtb_virt) {
+		pr_err("Failed to allocate memory for DTB (%d bytes)\n", dtb_len);
+		ret = -ENOMEM;
+		goto cleanup_fdt;
+	}
+	memcpy(dtb_virt, dtb_data, dtb_len);
+
+	/* Parse DTB to get the actual instance ID and name */
+	ret = mk_dt_extract_instance_info(dtb_virt, dtb_len, &instance_id, &instance_name);
+	if (ret) {
+		pr_err("Failed to extract instance info from DTB: %d\n", ret);
+		goto cleanup_dtb;
+	}
+
+	pr_info("DTB contains instance ID %d, name '%s'\n", instance_id, instance_name);
+
+	/* Parse DTB configuration */
+	mk_dt_config_init(&config);
+	ret = mk_dt_parse(dtb_virt, dtb_len, &config);
+	if (ret) {
+		pr_err("Failed to parse DTB from KHO: %d\n", ret);
+		goto config_free;
+	}
+
+	/* Create a new instance for this DTB */
+	instance = kzalloc(sizeof(*instance), GFP_KERNEL);
+	if (!instance) {
+		pr_err("Failed to allocate memory for multikernel instance\n");
+		ret = -ENOMEM;
+		goto config_free;
+	}
+
+	/* Initialize instance with parsed data */
+	instance->id = instance_id;
+	instance->name = kstrdup(instance_name, GFP_KERNEL);
+	if (!instance->name) {
+		ret = -ENOMEM;
+		goto cleanup_instance;
+	}
+
+	instance->dtb_data = kmalloc(dtb_len, GFP_KERNEL);
+	if (!instance->dtb_data) {
+		pr_err("Failed to allocate memory for DTB restoration\n");
+		ret = -ENOMEM;
+		goto cleanup_instance_name;
+	}
+
+	memcpy(instance->dtb_data, dtb_virt, dtb_len);
+	instance->dtb_size = dtb_len;
+
+	INIT_LIST_HEAD(&instance->memory_regions);
+	INIT_LIST_HEAD(&instance->list);
+	kref_init(&instance->refcount);
+
+	ret = mk_instance_reserve_resources(instance, &config);
+	if (ret == 0) {
+		mk_instance_set_state(instance, MK_STATE_READY);
+
+		mutex_lock(&mk_instance_mutex);
+		list_add_tail(&instance->list, &mk_instance_list);
+		mutex_unlock(&mk_instance_mutex);
+
+		pr_info("Successfully restored multikernel instance %d ('%s') from KHO (%d bytes)\n",
+			instance_id, instance_name, dtb_len);
+		mk_dt_config_free(&config);
+		kfree(dtb_virt);
+		early_memunmap((void *)kho_fdt, PAGE_SIZE);
+		return 0;
+	} else {
pr_err("Failed to reserve memory for restored instance: %d\n", ret); + mk_instance_set_state(instance, MK_STATE_FAILED); + /* Fall through to cleanup */ + } + +cleanup_instance_name: + kfree(instance->name); +cleanup_instance: + kfree(instance->dtb_data); + kfree(instance); +config_free: + mk_dt_config_free(&config); +cleanup_dtb: + kfree(dtb_virt); +cleanup_fdt: + early_memunmap((void *)kho_fdt, PAGE_SIZE); + return ret; +} + +/** + * mk_kho_init() - Initialize KHO support for multikernel + * + * Registers the KHO notifier and attempts to restore DTBs from + * a previous KHO boot. + * + * Returns: 0 on success, negative error code on failure + */ +int __init mk_kho_init(void) +{ + int ret; + + /* Register multikernel kexec notifier for DTB preservation */ + ret = mk_kexec_register_notifier(&mk_kexec_nb); + if (ret) { + pr_warn("Failed to register multikernel kexec notifier: %d\n", ret); + return ret; + } + + pr_info("Registered multikernel kexec notifier for DTB preservation\n"); + + /* Restore DTBs from previous kernel if KHO boot */ + ret = mk_kho_restore_dtbs(); + if (ret) { + pr_warn("Failed to restore DTBs from KHO: %d\n", ret); + /* Continue - this is not fatal */ + } + + return 0; +} + +/** + * mk_kho_cleanup() - Cleanup multikernel kexec support + * + * Unregisters the multikernel kexec notifier. + */ +void mk_kho_cleanup(void) +{ + mk_kexec_unregister_notifier(&mk_kexec_nb); + pr_debug("Unregistered multikernel kexec notifier\n"); +} + +#else /* !CONFIG_KEXEC_HANDOVER */ + +/* Stub functions when KHO is not enabled */ +int __init mk_kho_restore_dtbs(void) +{ + return 0; +} + +int __init mk_kho_init(void) +{ + return 0; +} + +void mk_kho_cleanup(void) +{ +} + +#endif /* CONFIG_KEXEC_HANDOVER */ -- 2.34.1