The log message in kho_populate() currently states "Will skip init for some devices". This implies that Kexec Handover always involves skipping device initialization. However, KHO is a generic mechanism used to preserve kernel memory across reboot for various purposes, such as memfd, telemetry, or reserve_mem. Skipping device initialization is a specific property of live update drivers using KHO, not a property of the mechanism itself. Remove the misleading suffix to accurately reflect the generic nature of KHO discovery. Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 9f0913e101be..6ad45e12f53b 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1470,7 +1470,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; kho_scratch_cnt = scratch_cnt; - pr_info("found kexec handover data. Will skip init for some devices\n"); + pr_info("found kexec handover data.\n"); out: if (fdt) -- 2.52.0.rc1.455.g30608eb744-goog The internal helper __kho_abort() always returns 0 and has no failure paths. Its return value is ignored by __kho_finalize and checked needlessly by kho_abort. Change the return type to void to reflect that this function cannot fail, and simplify kho_abort by removing dead error handling code. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 6ad45e12f53b..bc7f046a1313 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1117,20 +1117,16 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -static int __kho_abort(void) +static void __kho_abort(void) { if (kho_out.preserved_mem_map) { kho_mem_ser_free(kho_out.preserved_mem_map); kho_out.preserved_mem_map = NULL; } - - return 0; } int kho_abort(void) { - int ret = 0; - if (!kho_enable) return -EOPNOTSUPP; @@ -1138,10 +1134,7 @@ int kho_abort(void) if (!kho_out.finalized) return -ENOENT; - ret = __kho_abort(); - if (ret) - return ret; - + __kho_abort(); kho_out.finalized = false; kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt); -- 2.52.0.rc1.455.g30608eb744-goog Currently, the FDT folio is preserved inside __kho_finalize(). If the user performs multiple finalize/abort cycles, kho_preserve_folio() is called repeatedly for the same FDT folio. Since the FDT folio is allocated once during kho_init(), it should be marked for preservation at the same time. Move the preservation call to kho_init() to align the preservation state with the object's lifecycle and simplify the finalize path. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index bc7f046a1313..a4b33ca79246 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1164,10 +1164,6 @@ static int __kho_finalize(void) if (err) goto abort; - err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); - if (err) - goto abort; - err = kho_mem_serialize(&kho_out); if (err) goto abort; @@ -1319,6 +1315,10 @@ static __init int kho_init(void) if (err) goto err_free_fdt; + err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); + if (err) + goto err_free_fdt; + if (fdt) { kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; -- 2.52.0.rc1.455.g30608eb744-goog During boot, kho_restore_folio() relies on the memory map having been successfully deserialized. If deserialization fails or no map is present, attempting to restore the FDT folio is unsafe. Update kho_mem_deserialize() to return a boolean indicating success. Use this return value in kho_memory_init() to disable KHO if deserialization fails. Also, the incoming FDT folio is never used, there is no reason to restore it. Additionally, use memcpy() to retrieve the memory map pointer from the FDT. FDT properties are not guaranteed to be naturally aligned, and accessing a 64-bit value via a pointer that is only 32-bit aligned can cause faults. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index a4b33ca79246..83aca3b4af15 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -450,20 +450,28 @@ static void __init deserialize_bitmap(unsigned int order, } } -static void __init kho_mem_deserialize(const void *fdt) +/* Return true if memory was deserialized */ +static bool __init kho_mem_deserialize(const void *fdt) { struct khoser_mem_chunk *chunk; - const phys_addr_t *mem; + const void *mem_ptr; + u64 mem; int len; - mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); - - if (!mem || len != sizeof(*mem)) { + mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); - return; + return false; } + /* FDT guarantees 32-bit alignment, have to use memcpy */ + memcpy(&mem, mem_ptr, len); + + chunk = mem ? phys_to_virt(mem) : NULL; + + /* No preserved physical pages were passed, no deserialization */ + if (!chunk) + return false; - chunk = *mem ? 
phys_to_virt(*mem) : NULL; while (chunk) { unsigned int i; @@ -472,6 +480,8 @@ static void __init kho_mem_deserialize(const void *fdt) &chunk->bitmaps[i]); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); } + + return true; } /* @@ -1377,16 +1387,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - struct folio *folio; - if (kho_in.scratch_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(kho_get_fdt()); - folio = kho_restore_folio(kho_in.fdt_phys); - if (!folio) - pr_warn("failed to restore folio for KHO fdt\n"); + if (!kho_mem_deserialize(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } -- 2.52.0.rc1.455.g30608eb744-goog Currently, the output FDT is added to debugfs only when KHO is finalized and removed when aborted. There is no need to hide the FDT based on the state. Always expose it starting from initialization. This aids the transition toward removing the explicit abort functionality and converting KHO to be fully stateless. Also, pre-zero the FDT tree so we do not expose random bits to the user and to the next kernel. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 83aca3b4af15..cd8641725343 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1147,8 +1147,6 @@ int kho_abort(void) __kho_abort(); kho_out.finalized = false; - kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt); - return 0; } @@ -1219,9 +1217,6 @@ int kho_finalize(void) kho_out.finalized = true; - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - kho_out.fdt, true)); - return 0; } @@ -1310,7 +1305,7 @@ static __init int kho_init(void) if (!kho_enable) return 0; - fdt_page = alloc_page(GFP_KERNEL); + fdt_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!fdt_page) { err = -ENOMEM; goto err_free_scratch; @@ -1344,6 +1339,9 @@ static __init int kho_init(void) init_cma_reserved_pageblock(pfn_to_page(pfn)); } + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + return 0; err_free_fdt: -- 2.52.0.rc1.455.g30608eb744-goog Currently, __kho_finalize() performs memory serialization in the middle of FDT construction. If FDT construction fails later, the function must manually clean up the serialized memory via __kho_abort(). Refactor __kho_finalize() to perform kho_mem_serialize() only after the FDT has been successfully constructed and finished. This reordering has two benefits: 1. It avoids expensive serialization work if FDT generation fails. 2. It removes the need for cleanup in the FDT error path. As a result, the internal helper __kho_abort() is no longer needed for internal error handling. Inline its remaining logic (cleanup of the preserved memory map) directly into kho_abort() and remove the helper. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 41 +++++++++++++----------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index cd8641725343..aea58e5a6b49 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1127,14 +1127,6 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -static void __kho_abort(void) -{ - if (kho_out.preserved_mem_map) { - kho_mem_ser_free(kho_out.preserved_mem_map); - kho_out.preserved_mem_map = NULL; - } -} - int kho_abort(void) { if (!kho_enable) @@ -1144,7 +1136,8 @@ int kho_abort(void) if (!kho_out.finalized) return -ENOENT; - __kho_abort(); + kho_mem_ser_free(kho_out.preserved_mem_map); + kho_out.preserved_mem_map = NULL; kho_out.finalized = false; return 0; @@ -1152,12 +1145,12 @@ int kho_abort(void) static int __kho_finalize(void) { - int err = 0; - u64 *preserved_mem_map; void *root = kho_out.fdt; struct kho_sub_fdt *fdt; + u64 *preserved_mem_map; + int err; - err |= fdt_create(root, PAGE_SIZE); + err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); @@ -1170,13 +1163,7 @@ static int __kho_finalize(void) sizeof(*preserved_mem_map), (void **)&preserved_mem_map); if (err) - goto abort; - - err = kho_mem_serialize(&kho_out); - if (err) - goto abort; - - *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); + goto err_exit; mutex_lock(&kho_out.fdts_lock); list_for_each_entry(fdt, &kho_out.sub_fdts, l) { @@ -1190,13 +1177,19 @@ static int __kho_finalize(void) err |= fdt_end_node(root); err |= fdt_finish(root); + if (err) + goto err_exit; -abort: - if (err) { - pr_err("Failed to convert KHO state tree: %d\n", err); - __kho_abort(); - } + err = kho_mem_serialize(&kho_out); + if (err) + goto err_exit; 
+ + *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); + + return 0; +err_exit: + pr_err("Failed to convert KHO state tree: %d\n", err); return err; } -- 2.52.0.rc1.455.g30608eb744-goog Currently, the serialized memory map is tracked via kho_out.preserved_mem_map and copied to the FDT during finalization. This double tracking is redundant. Remove preserved_mem_map from kho_out. Instead, maintain the physical address of the head chunk directly in the preserved-memory-map FDT property. Introduce kho_update_memory_map() to manage this property. This function handles: 1. Retrieving and freeing any existing serialized map (handling the abort/retry case). 2. Updating the FDT property with the new chunk address. This establishes the FDT as the single source of truth for the handover state. Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 43 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index aea58e5a6b49..f1c3dd1ef680 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -117,9 +117,6 @@ struct kho_out { struct mutex fdts_lock; struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; - struct kho_debugfs dbg; }; @@ -380,6 +377,27 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) } } +/* + * Update memory map property, if old one is found discard it via + * kho_mem_ser_free(). + */ +static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) +{ + void *ptr; + u64 phys; + + ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + + /* Check and discard previous memory map */ + memcpy(&phys, ptr, sizeof(u64)); + if (phys) + kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); + + /* Update with the new value */ + phys = first_chunk ? 
(u64)virt_to_phys(first_chunk) : 0; + memcpy(ptr, &phys, sizeof(u64)); +} + static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; @@ -420,7 +438,7 @@ static int kho_mem_serialize(struct kho_out *kho_out) } } - kho_out->preserved_mem_map = first_chunk; + kho_update_memory_map(first_chunk); return 0; @@ -1136,8 +1154,7 @@ int kho_abort(void) if (!kho_out.finalized) return -ENOENT; - kho_mem_ser_free(kho_out.preserved_mem_map); - kho_out.preserved_mem_map = NULL; + kho_update_memory_map(NULL); kho_out.finalized = false; return 0; @@ -1147,21 +1164,15 @@ static int __kho_finalize(void) { void *root = kho_out.fdt; struct kho_sub_fdt *fdt; - u64 *preserved_mem_map; + u64 empty_mem_map = 0; int err; err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - /** - * Reserve the preserved-memory-map property in the root FDT, so - * that all property definitions will precede subnodes created by - * KHO callers. - */ - err |= fdt_property_placeholder(root, PROP_PRESERVED_MEMORY_MAP, - sizeof(*preserved_mem_map), - (void **)&preserved_mem_map); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); if (err) goto err_exit; @@ -1184,8 +1195,6 @@ static int __kho_finalize(void) if (err) goto err_exit; - *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); - return 0; err_exit: -- 2.52.0.rc1.455.g30608eb744-goog Previously, KHO required a dedicated kho_abort() function to clean up state before kho_finalize() could be called again. This was necessary to handle complex unwind paths when using notifiers. With the shift to direct memory preservation, the explicit abort step is no longer strictly necessary. Remove kho_abort() and refactor kho_finalize() to handle re-entry. 
If kho_finalize() is called while KHO is already finalized, it will now automatically clean up the previous memory map and state before generating a new one. This allows the KHO state to be updated/refreshed simply by triggering finalize again. Update debugfs to return -EINVAL if userspace attempts to write 0 to the finalize attribute, as explicit abort is no longer supported. Suggested-by: Mike Rapoport (Microsoft) Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 21 ++++----------------- kernel/liveupdate/kexec_handover_debugfs.c | 2 +- kernel/liveupdate/kexec_handover_internal.h | 1 - 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index f1c3dd1ef680..8ab77cb85ca9 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1145,21 +1145,6 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -int kho_abort(void) -{ - if (!kho_enable) - return -EOPNOTSUPP; - - guard(mutex)(&kho_out.lock); - if (!kho_out.finalized) - return -ENOENT; - - kho_update_memory_map(NULL); - kho_out.finalized = false; - - return 0; -} - static int __kho_finalize(void) { void *root = kho_out.fdt; @@ -1210,8 +1195,10 @@ int kho_finalize(void) return -EOPNOTSUPP; guard(mutex)(&kho_out.lock); - if (kho_out.finalized) - return -EEXIST; + if (kho_out.finalized) { + kho_update_memory_map(NULL); + kho_out.finalized = false; + } ret = __kho_finalize(); if (ret) diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index ac739d25094d..2abbf62ba942 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -87,7 +87,7 @@ static int kho_out_finalize_set(void *data, u64 val) if (val) return kho_finalize(); else - return kho_abort(); + return -EINVAL; } DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, 
kho_out_finalize_get, diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 52ed73659fe6..0202c85ad14f 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -24,7 +24,6 @@ extern unsigned int kho_scratch_cnt; bool kho_finalized(void); int kho_finalize(void); -int kho_abort(void); #ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS int kho_debugfs_init(void); -- 2.52.0.rc1.455.g30608eb744-goog Currently, sub-FDTs were tracked in a list (kho_out.sub_fdts) and the final FDT is constructed entirely from scratch during kho_finalize(). We can maintain the FDT dynamically: 1. Initialize a valid, empty FDT in kho_init(). 2. Use fdt_add_subnode and fdt_setprop in kho_add_subtree to update the FDT immediately when a subsystem registers. 3. Use fdt_del_node in kho_remove_subtree to remove entries. This removes the need for the intermediate sub_fdts list and the reconstruction logic in kho_finalize(). kho_finalize() now only needs to trigger memory map serialization. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 144 ++++++++++++++--------------- 1 file changed, 68 insertions(+), 76 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 8ab77cb85ca9..822da961d4c9 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -102,20 +102,11 @@ struct kho_mem_track { struct khoser_mem_chunk; -struct kho_sub_fdt { - struct list_head l; - const char *name; - void *fdt; -}; - struct kho_out { void *fdt; bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct list_head sub_fdts; - struct mutex fdts_lock; - struct kho_mem_track track; struct kho_debugfs dbg; }; @@ -125,8 +116,6 @@ static struct kho_out kho_out = { .track = { .orders = XARRAY_INIT(kho_out.track.orders, 0), }, - .sub_fdts = LIST_HEAD_INIT(kho_out.sub_fdts), - .fdts_lock = __MUTEX_INITIALIZER(kho_out.fdts_lock), .finalized = false, }; @@ -724,37 +713,67 @@ static void __init kho_reserve_scratch(void) */ int kho_add_subtree(const char *name, void *fdt) { - struct kho_sub_fdt *sub_fdt; + phys_addr_t phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int err = -ENOMEM; + int off, fdt_err; - sub_fdt = kmalloc(sizeof(*sub_fdt), GFP_KERNEL); - if (!sub_fdt) - return -ENOMEM; + guard(mutex)(&kho_out.lock); + + fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (fdt_err < 0) + return err; - INIT_LIST_HEAD(&sub_fdt->l); - sub_fdt->name = name; - sub_fdt->fdt = fdt; + off = fdt_add_subnode(root_fdt, 0, name); + if (off < 0) { + if (off == -FDT_ERR_EXISTS) + err = -EEXIST; + goto out_pack; + } + + err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + if (err < 0) + goto out_pack; - guard(mutex)(&kho_out.fdts_lock); - list_add_tail(&sub_fdt->l, &kho_out.sub_fdts); WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); - return 0; +out_pack: + fdt_pack(root_fdt); + + return err; } 
EXPORT_SYMBOL_GPL(kho_add_subtree); void kho_remove_subtree(void *fdt) { - struct kho_sub_fdt *sub_fdt; + phys_addr_t target_phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int off; + int err; + + guard(mutex)(&kho_out.lock); - guard(mutex)(&kho_out.fdts_lock); - list_for_each_entry(sub_fdt, &kho_out.sub_fdts, l) { - if (sub_fdt->fdt == fdt) { - list_del(&sub_fdt->l); - kfree(sub_fdt); + err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (err < 0) + return; + + for (off = fdt_first_subnode(root_fdt, 0); off >= 0; + off = fdt_next_subnode(root_fdt, off)) { + const u64 *val; + int len; + + val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + if (!val || len != sizeof(phys_addr_t)) + continue; + + if ((phys_addr_t)*val == target_phys) { + fdt_del_node(root_fdt, off); kho_debugfs_fdt_remove(&kho_out.dbg, fdt); break; } } + + fdt_pack(root_fdt); } EXPORT_SYMBOL_GPL(kho_remove_subtree); @@ -1145,48 +1164,6 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -static int __kho_finalize(void) -{ - void *root = kho_out.fdt; - struct kho_sub_fdt *fdt; - u64 empty_mem_map = 0; - int err; - - err = fdt_create(root, PAGE_SIZE); - err |= fdt_finish_reservemap(root); - err |= fdt_begin_node(root, ""); - err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, - sizeof(empty_mem_map)); - if (err) - goto err_exit; - - mutex_lock(&kho_out.fdts_lock); - list_for_each_entry(fdt, &kho_out.sub_fdts, l) { - phys_addr_t phys = virt_to_phys(fdt->fdt); - - err |= fdt_begin_node(root, fdt->name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); - } - mutex_unlock(&kho_out.fdts_lock); - - err |= fdt_end_node(root); - err |= fdt_finish(root); - if (err) - goto err_exit; - - err = kho_mem_serialize(&kho_out); - if (err) - goto err_exit; - - return 0; - -err_exit: - pr_err("Failed to convert 
KHO state tree: %d\n", err); - return err; -} - int kho_finalize(void) { int ret; @@ -1195,12 +1172,7 @@ int kho_finalize(void) return -EOPNOTSUPP; guard(mutex)(&kho_out.lock); - if (kho_out.finalized) { - kho_update_memory_map(NULL); - kho_out.finalized = false; - } - - ret = __kho_finalize(); + ret = kho_mem_serialize(&kho_out); if (ret) return ret; @@ -1285,6 +1257,26 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); +static __init int kho_out_fdt_setup(void) +{ + void *root = kho_out.fdt; + u64 empty_mem_map = 0; + int err; + + err = fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); + err |= fdt_end_node(root); + err |= fdt_finish(root); + if (err) + return err; + + return kho_preserve_folio(virt_to_folio(kho_out.fdt)); +} + static __init int kho_init(void) { int err = 0; @@ -1309,7 +1301,7 @@ static __init int kho_init(void) if (err) goto err_free_fdt; - err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); + err = kho_out_fdt_setup(); if (err) goto err_free_fdt; -- 2.52.0.rc1.455.g30608eb744-goog Currently, kho_fill_kimage() checks kho_out.finalized and returns early if KHO is not yet finalized. This enforces a strict ordering where userspace must finalize KHO *before* loading the kexec image. This is restrictive, as standard workflows often involve loading the target kernel early in the lifecycle and finalizing the state (FDT) only immediately before the reboot. Since the KHO FDT resides at a physical address allocated during boot (kho_init), its location is stable. We can attach this stable address to the kimage regardless of whether the content has been finalized yet. Relax the check to only require kho_enable, allowing kexec_file_load to proceed at any time. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 822da961d4c9..27ef20565a5f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1467,7 +1467,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_out.finalized) + if (!kho_enable) return 0; image->kho.fdt = virt_to_phys(kho_out.fdt); -- 2.52.0.rc1.455.g30608eb744-goog Currently, kho_preserve_* and kho_unpreserve_* return -EBUSY if KHO is finalized. This enforces a rigid "freeze" on the KHO memory state. With the introduction of re-entrant finalization, this restriction is no longer necessary. Users should be allowed to modify the preservation set (e.g., adding new pages or freeing old ones) even after an initial finalization. The intended workflow for updates is now: 1. Modify state (preserve/unpreserve). 2. Call kho_finalize() again to refresh the serialized metadata. Remove the kho_out.finalized checks to enable this dynamic behavior. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/kexec_handover.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 27ef20565a5f..87e9b488237d 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -183,10 +183,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; might_sleep(); - - if (kho_out.finalized) - return -EBUSY; - physxa = xa_load(&track->orders, order); if (!physxa) { int err; @@ -815,9 +811,6 @@ int kho_unpreserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.track; - if (kho_out.finalized) - return -EBUSY; - __kho_unpreserve_order(track, pfn, order); return 0; } @@ -885,9 +878,6 @@ int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; - if (kho_out.finalized) - return -EBUSY; - __kho_unpreserve(track, start_pfn, end_pfn); return 0; @@ -1066,9 +1056,6 @@ EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); */ int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) { - if (kho_out.finalized) - return -EBUSY; - kho_vmalloc_free_chunks(preservation); return 0; -- 2.52.0.rc1.455.g30608eb744-goog Currently, Kexec Handover must be explicitly enabled via the kernel command line parameter `kho=on`. For workloads that rely on KHO as a foundational requirement (such as the upcoming Live Update Orchestrator), requiring an explicit boot parameter adds redundant configuration steps. Introduce CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT. When selected, KHO defaults to enabled. This is equivalent to passing kho=on at boot. The behavior can still be disabled at runtime by passing kho=off. 
Signed-off-by: Pasha Tatashin --- kernel/liveupdate/Kconfig | 14 ++++++++++++++ kernel/liveupdate/kexec_handover.c | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index eae428309332..a973a54447de 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -37,4 +37,18 @@ config KEXEC_HANDOVER_DEBUGFS Also, enables inspecting the KHO fdt trees with the debugfs binary blobs. +config KEXEC_HANDOVER_ENABLE_DEFAULT + bool "Enable kexec handover by default" + depends on KEXEC_HANDOVER + help + Enable Kexec Handover by default. This avoids the need to + explicitly pass 'kho=on' on the kernel command line. + + This is useful for systems where KHO is a prerequisite for other + features, such as Live Update, ensuring the mechanism is always + active. + + The default behavior can still be overridden at boot time by + passing 'kho=off'. + endmenu diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 87e9b488237d..a905bccf5f65 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -50,7 +50,7 @@ union kho_page_info { static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); -static bool kho_enable __ro_after_init; +static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT); bool kho_is_enabled(void) { -- 2.52.0.rc1.455.g30608eb744-goog Currently, clients of KHO must manually allocate memory (e.g., via alloc_pages), calculate the page order, and explicitly call kho_preserve_folio(). Similarly, cleanup requires separate calls to unpreserve and free the memory. Introduce a high-level API to streamline this common pattern: - kho_alloc_preserve(size): Allocates physically contiguous, zeroed memory and immediately marks it for preservation. - kho_free_unpreserve(ptr, size): Unpreserves and frees the memory in the current kernel. 
- kho_free_restore(ptr, size): Restores the struct page state of preserved memory in the new kernel and immediately frees it to the page allocator. Signed-off-by: Pasha Tatashin --- include/linux/kexec_handover.h | 22 +++++-- kernel/liveupdate/kexec_handover.c | 101 +++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 7 deletions(-) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 80ece4232617..76c496e01877 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -2,8 +2,9 @@ #ifndef LINUX_KEXEC_HANDOVER_H #define LINUX_KEXEC_HANDOVER_H -#include +#include #include +#include struct kho_scratch { phys_addr_t addr; @@ -48,6 +49,9 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages); int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); +void *kho_alloc_preserve(size_t size); +void kho_free_unpreserve(void *mem, size_t size); +void kho_free_restore(void *mem, size_t size); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); @@ -101,6 +105,14 @@ static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) return -EOPNOTSUPP; } +static inline void *kho_alloc_preserve(size_t size) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void kho_free_unpreserve(void *mem, size_t size) { } +static inline void kho_free_restore(void *mem, size_t size) { } + static inline struct folio *kho_restore_folio(phys_addr_t phys) { return NULL; @@ -122,18 +134,14 @@ static inline int kho_add_subtree(const char *name, void *fdt) return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) -{ -} +static inline void kho_remove_subtree(void *fdt) { } static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return 
-EOPNOTSUPP; } -static inline void kho_memory_init(void) -{ -} +static inline void kho_memory_init(void) { } static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index a905bccf5f65..9f05849fd68e 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -4,6 +4,7 @@ * Copyright (C) 2023 Alexander Graf * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport * Copyright (C) 2025 Google LLC, Changyuan Lyu + * Copyright (C) 2025 Pasha Tatashin */ #define pr_fmt(fmt) "KHO: " fmt @@ -1151,6 +1152,106 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); +/** + * kho_alloc_preserve - Allocate, zero, and preserve memory. + * @size: The number of bytes to allocate. + * + * Allocates a physically contiguous block of zeroed pages that is large + * enough to hold @size bytes. The allocated memory is then registered with + * KHO for preservation across a kexec. + * + * Note: The actual allocated size will be rounded up to the nearest + * power-of-two page boundary. + * + * @return A virtual pointer to the allocated and preserved memory on success, + * or an ERR_PTR() encoded error on failure. + */ +void *kho_alloc_preserve(size_t size) +{ + struct folio *folio; + int order, ret; + + if (!size) + return ERR_PTR(-EINVAL); + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-E2BIG); + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order); + if (!folio) + return ERR_PTR(-ENOMEM); + + ret = kho_preserve_folio(folio); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); + } + + return folio_address(folio); +} +EXPORT_SYMBOL_GPL(kho_alloc_preserve); + +/** + * kho_free_unpreserve - Unpreserve and free memory. + * @mem: Pointer to the memory allocated by kho_alloc_preserve(). 
+ * @size: The original size requested during allocation. This is used to + * recalculate the correct order for freeing the pages. + * + * Unregisters the memory from KHO preservation and frees the underlying + * pages back to the system. This function should be called to clean up + * memory allocated with kho_alloc_preserve(). + */ +void kho_free_unpreserve(void *mem, size_t size) +{ + struct folio *folio; + unsigned int order; + + if (!mem || !size) + return; + + order = get_order(size); + if (WARN_ON_ONCE(order > MAX_PAGE_ORDER)) + return; + + folio = virt_to_folio(mem); + WARN_ON_ONCE(kho_unpreserve_folio(folio)); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(kho_free_unpreserve); + +/** + * kho_free_restore - Restore and free memory after kexec. + * @mem: Pointer to the memory (in the new kernel's address space) + * that was allocated by the old kernel. + * @size: The original size requested during allocation. This is used to + * recalculate the correct order for freeing the pages. + * + * This function is intended to be called in the new kernel (post-kexec) + * to take ownership of and free a memory region that was preserved by the + * old kernel using kho_alloc_preserve(). + * + * It first restores the pages from KHO (using their physical address) + * and then frees the pages back to the new kernel's page allocator. + */ +void kho_free_restore(void *mem, size_t size) +{ + struct folio *folio; + unsigned int order; + + if (!mem || !size) + return; + + order = get_order(size); + if (WARN_ON_ONCE(order > MAX_PAGE_ORDER)) + return; + + folio = kho_restore_folio(__pa(mem)); + if (!WARN_ON(!folio)) + free_pages((unsigned long)mem, order); +} +EXPORT_SYMBOL_GPL(kho_free_restore); + int kho_finalize(void) { int ret; -- 2.52.0.rc1.455.g30608eb744-goog