From: YiFei Zhu

The caller is expected to mark each HWPT to be preserved with an ioctl
call, passing a token that will be used during restore. At preserve
time, iommu_domain_preserve() is called on each HWPT's domain to
preserve the IOMMU domain.

HWPTs containing DMA mappings backed by unpreserved memory must not be
preserved. During preservation, check that the mappings contained in
the HWPT being preserved are only file based and that all of the
backing files are preserved.

The memfd file preservation check alone is not enough when preserving
iommufd. The memfd might have shrunk between the mapping and the memfd
preservation; if it did, some pages that are currently pinned due to
IOMMU mappings are no longer preserved with the memfd. Only allow
iommufd preservation when all the iopt_pages are file backed and the
memory file was sealed (F_SEAL_SEAL, F_SEAL_GROW and F_SEAL_SHRINK) at
mapping time. This guarantees that all the pages that were backing the
memfd when it was mapped are preserved.

Once an HWPT is preserved, the iopt associated with it is made
immutable. The map and unmap ioctls operate directly on the iopt,
which contains an array of domains, while each HWPT contains only one
domain. The logic therefore becomes that mapping and unmapping are
prohibited if any of the domains in an iopt belongs to a preserved
HWPT. However, tracing to the HWPT through the domain is far more
tedious than tracing through the IOAS, so when an HWPT is preserved,
hwpt->ioas->iopt is made immutable.

When undoing this (making the iopts mutable again), there is never a
need to make some iopts mutable while keeping others immutable, since
the undo only happens on unpreserve and on the error path of preserve.
Simply iterate over all the IOAS objects and clear the immutability
flag on all their iopts.

Signed-off-by: YiFei Zhu
Signed-off-by: Samiullah Khawaja
---
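Illustration (not part of the patch): the sealing requirement described
above means userspace has to apply F_SEAL_SEAL, F_SEAL_GROW and
F_SEAL_SHRINK to the memfd before it is mapped into an IOAS, otherwise
check_iopt_pages_preserved() will refuse to preserve the HWPT. A minimal
userspace sketch, with a made-up helper name and only standard
memfd_create(2)/fcntl(2) calls:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Illustrative only: create a fully sealed memfd to back IOAS mappings. */
static int create_sealed_backing(size_t size)
{
	int fd = memfd_create("iommufd-backing", MFD_ALLOW_SEALING);

	if (fd < 0)
		return -1;
	if (ftruncate(fd, size) < 0)
		goto err;
	/* Seals must be in place before the file is mapped into the IOAS. */
	if (fcntl(fd, F_ADD_SEALS,
		  F_SEAL_SEAL | F_SEAL_GROW | F_SEAL_SHRINK) < 0)
		goto err;
	return fd;
err:
	close(fd);
	return -1;
}
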
 drivers/iommu/iommufd/io_pagetable.c    |  17 ++
 drivers/iommu/iommufd/io_pagetable.h    |   1 +
 drivers/iommu/iommufd/iommufd_private.h |  25 ++
 drivers/iommu/iommufd/liveupdate.c      | 300 ++++++++++++++++++++++++
 drivers/iommu/iommufd/main.c            |  14 +-
 drivers/iommu/iommufd/pages.c           |   8 +
 include/linux/kho/abi/iommufd.h         |  39 +++
 7 files changed, 403 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kho/abi/iommufd.h

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 436992331111..43e8a2443793 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -270,6 +270,11 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
 	}
 
 	down_write(&iopt->iova_rwsem);
+	if (iopt_lu_map_immutable(iopt)) {
+		rc = -EBUSY;
+		goto out_unlock;
+	}
+
 	if ((length & (iopt->iova_alignment - 1)) || !length) {
 		rc = -EINVAL;
 		goto out_unlock;
@@ -328,6 +333,7 @@ static void iopt_abort_area(struct iopt_area *area)
 	WARN_ON(area->pages);
 	if (area->iopt) {
 		down_write(&area->iopt->iova_rwsem);
+		WARN_ON(iopt_lu_map_immutable(area->iopt));
 		interval_tree_remove(&area->node, &area->iopt->area_itree);
 		up_write(&area->iopt->iova_rwsem);
 	}
@@ -755,6 +761,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
 again:
 	down_read(&iopt->domains_rwsem);
 	down_write(&iopt->iova_rwsem);
+
+	if (iopt_lu_map_immutable(iopt)) {
+		rc = -EBUSY;
+		goto out_unlock_iova;
+	}
+
 	while ((area = iopt_area_iter_first(iopt, start, last))) {
 		unsigned long area_last = iopt_area_last_iova(area);
 		unsigned long area_first = iopt_area_iova(area);
@@ -1398,6 +1410,11 @@ int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
 	int i;
 
 	down_write(&iopt->iova_rwsem);
+	if (iopt_lu_map_immutable(iopt)) {
+		up_write(&iopt->iova_rwsem);
+		return -EBUSY;
+	}
+
 	for (i = 0; i < num_iovas; i++) {
 		struct iopt_area *area;
 
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 14cd052fd320..b64cb4cf300c 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -234,6 +234,7 @@ struct iopt_pages {
 		struct { /* IOPT_ADDRESS_FILE */
 			struct file *file;
 			unsigned long start;
+			u32 seals;
 		};
 		/* IOPT_ADDRESS_DMABUF */
 		struct iopt_pages_dmabuf dmabuf;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 6424e7cea5b2..f8366a23999f 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -94,6 +94,9 @@ struct io_pagetable {
 	/* IOVA that cannot be allocated, struct iopt_reserved */
 	struct rb_root_cached reserved_itree;
 	u8 disable_large_pages;
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+	bool lu_map_immutable;
+#endif
 	unsigned long iova_alignment;
 };
 
@@ -712,12 +715,34 @@ iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
 }
 
 #ifdef CONFIG_IOMMU_LIVEUPDATE
+int iommufd_liveupdate_register_lufs(void);
+int iommufd_liveupdate_unregister_lufs(void);
+
 int iommufd_hwpt_lu_set_preserve(struct iommufd_ucmd *ucmd);
+static inline bool iopt_lu_map_immutable(const struct io_pagetable *iopt)
+{
+	return iopt->lu_map_immutable;
+}
 #else
+static inline int iommufd_liveupdate_register_lufs(void)
+{
+	return 0;
+}
+
+static inline int iommufd_liveupdate_unregister_lufs(void)
+{
+	return 0;
+}
+
 static inline int iommufd_hwpt_lu_set_preserve(struct iommufd_ucmd *ucmd)
 {
 	return -ENOTTY;
 }
+
+static inline bool iopt_lu_map_immutable(const struct io_pagetable *iopt)
+{
+	return false;
+}
 #endif
 
 #ifdef CONFIG_IOMMUFD_TEST
diff --git a/drivers/iommu/iommufd/liveupdate.c b/drivers/iommu/iommufd/liveupdate.c
index ae74f5b54735..ec11ae345fe7 100644
--- a/drivers/iommu/iommufd/liveupdate.c
+++ b/drivers/iommu/iommufd/liveupdate.c
@@ -4,9 +4,15 @@
 #include
 #include
+#include
+#include
 #include
+#include
+#include
+#include
 
 #include "iommufd_private.h"
+#include "io_pagetable.h"
 
 int iommufd_hwpt_lu_set_preserve(struct iommufd_ucmd *ucmd)
 {
@@ -47,3 +53,297 @@ int iommufd_hwpt_lu_set_preserve(struct iommufd_ucmd *ucmd)
 
 	return rc;
 }
+
+static void iommufd_set_ioas_mutable(struct iommufd_ctx *ictx)
+{
+	struct iommufd_object *obj;
+	struct iommufd_ioas *ioas;
+	unsigned long index;
+
+	xa_lock(&ictx->objects);
+	xa_for_each(&ictx->objects, index, obj) {
+		if (obj->type != IOMMUFD_OBJ_IOAS)
+			continue;
+
+		ioas = container_of(obj, struct iommufd_ioas, obj);
+
+		/*
+		 * Not taking any IOAS lock here. All writers take the LUO
+		 * session mutex, and this writer racing with readers is not
+		 * really a problem.
+		 */
+		WRITE_ONCE(ioas->iopt.lu_map_immutable, false);
+	}
+	xa_unlock(&ictx->objects);
+}
+
+static int check_iopt_pages_preserved(struct liveupdate_session *s,
+				      struct iommufd_hwpt_paging *hwpt)
+{
+	u32 req_seals = F_SEAL_SEAL | F_SEAL_GROW | F_SEAL_SHRINK;
+	struct iopt_area *area;
+	int ret;
+
+	for (area = iopt_area_iter_first(&hwpt->ioas->iopt, 0, ULONG_MAX); area;
+	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
+		struct iopt_pages *pages = area->pages;
+
+		/* Only allow file based mappings */
+		if (pages->type != IOPT_ADDRESS_FILE)
+			return -EINVAL;
+
+		/*
+		 * The memory file must have been sealed (F_SEAL_SEAL,
+		 * F_SEAL_GROW and F_SEAL_SHRINK) when it was mapped. This
+		 * means the file has neither grown nor shrunk since the
+		 * mapping was made, and the pages in use until now remain
+		 * pinned and preserved.
+		 */
+		if ((pages->seals & req_seals) != req_seals)
+			return -EINVAL;
+
+		/* Make sure that the file was preserved. */
+		ret = liveupdate_get_token_outgoing(s, pages->file, NULL);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int iommufd_save_hwpts(struct iommufd_ctx *ictx,
+			      struct iommufd_lu *iommufd_lu,
+			      struct liveupdate_session *session)
+{
+	struct iommufd_hwpt_paging *hwpt, **hwpts = NULL;
+	struct iommu_domain_ser *domain_ser;
+	struct iommufd_hwpt_lu *hwpt_lu;
+	struct iommufd_object *obj;
+	unsigned int nr_hwpts = 0;
+	unsigned long index;
+	unsigned int i;
+	int rc = 0;
+
+	if (iommufd_lu) {
+		hwpts = kcalloc(iommufd_lu->nr_hwpts, sizeof(*hwpts),
+				GFP_KERNEL);
+		if (!hwpts)
+			return -ENOMEM;
+	}
+
+	xa_lock(&ictx->objects);
+	xa_for_each(&ictx->objects, index, obj) {
+		if (obj->type != IOMMUFD_OBJ_HWPT_PAGING)
+			continue;
+
+		hwpt = container_of(obj, struct iommufd_hwpt_paging, common.obj);
+		if (!hwpt->lu_preserve)
+			continue;
+
+		if (hwpt->ioas) {
+			/*
+			 * Obtain exclusive access to the IOAS and IOPT while we
+			 * set immutability
+			 */
+			mutex_lock(&hwpt->ioas->mutex);
+			down_write(&hwpt->ioas->iopt.domains_rwsem);
+			down_write(&hwpt->ioas->iopt.iova_rwsem);
+
+			hwpt->ioas->iopt.lu_map_immutable = true;
+
+			up_write(&hwpt->ioas->iopt.iova_rwsem);
+			up_write(&hwpt->ioas->iopt.domains_rwsem);
+			mutex_unlock(&hwpt->ioas->mutex);
+		}
+
+		if (!hwpt->common.domain) {
+			rc = -EINVAL;
+			xa_unlock(&ictx->objects);
+			goto out;
+		}
+
+		if (!iommufd_lu) {
+			rc = check_iopt_pages_preserved(session, hwpt);
+			if (rc) {
+				xa_unlock(&ictx->objects);
+				goto out;
+			}
+		} else if (iommufd_lu) {
+			hwpts[nr_hwpts] = hwpt;
+			hwpt_lu = &iommufd_lu->hwpts[nr_hwpts];
+
+			hwpt_lu->token = hwpt->lu_token;
+			hwpt_lu->reclaimed = false;
+		}
+
+		nr_hwpts++;
+	}
+	xa_unlock(&ictx->objects);
+
+	if (WARN_ON(iommufd_lu && iommufd_lu->nr_hwpts != nr_hwpts)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	if (iommufd_lu) {
+		/*
+		 * iommu_domain_preserve may sleep and must be called
+		 * outside of xa_lock
+		 */
+		for (i = 0; i < nr_hwpts; i++) {
+			hwpt = hwpts[i];
+			hwpt_lu = &iommufd_lu->hwpts[i];
+
+			rc = iommu_domain_preserve(hwpt->common.domain, &domain_ser);
+			if (rc < 0)
+				goto out;
+
+			hwpt_lu->domain_data = __pa(domain_ser);
+		}
+	}
+
+	rc = nr_hwpts;
+
+out:
+	kfree(hwpts);
+	return rc;
+}
+
+static int iommufd_liveupdate_preserve(struct liveupdate_file_op_args *args)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file);
+	struct iommufd_lu *iommufd_lu;
+	size_t serial_size;
+	void *mem;
+	int rc;
+
+	if (IS_ERR(ictx))
+		return PTR_ERR(ictx);
+
+	rc = iommufd_save_hwpts(ictx, NULL, args->session);
+	if (rc < 0)
+		goto err_ioas_mutable;
+
+	serial_size = struct_size(iommufd_lu, hwpts, rc);
+
+	mem = kho_alloc_preserve(serial_size);
+	if (!mem) {
+		rc = -ENOMEM;
+		goto err_ioas_mutable;
+	}
+
+	iommufd_lu = mem;
+	iommufd_lu->nr_hwpts = rc;
+	rc = iommufd_save_hwpts(ictx, iommufd_lu, args->session);
+	if (rc < 0)
+		goto err_free;
+
+	args->serialized_data = virt_to_phys(iommufd_lu);
+	iommufd_ctx_put(ictx);
+	return 0;
+
+err_free:
+	kho_unpreserve_free(mem);
+err_ioas_mutable:
+	iommufd_set_ioas_mutable(ictx);
+	iommufd_ctx_put(ictx);
+	return rc;
+}
+
+static int iommufd_liveupdate_freeze(struct liveupdate_file_op_args *args)
+{
+	/* No-op; everything has already been made read-only */
+	return 0;
+}
+
+static void iommufd_liveupdate_unpreserve(struct liveupdate_file_op_args *args)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file);
+	struct iommufd_hwpt_paging *hwpt;
+	struct iommufd_object *obj;
+	unsigned long index;
+
+	if (WARN_ON(IS_ERR(ictx)))
+		return;
+
+	xa_lock(&ictx->objects);
+	xa_for_each(&ictx->objects, index, obj) {
+		if (obj->type != IOMMUFD_OBJ_HWPT_PAGING)
+			continue;
+
+		hwpt = container_of(obj, struct iommufd_hwpt_paging, common.obj);
+		if (!hwpt->lu_preserve)
+			continue;
+		if (!hwpt->common.domain)
+			continue;
+
+		iommu_domain_unpreserve(hwpt->common.domain);
+	}
+	xa_unlock(&ictx->objects);
+
+	kho_unpreserve_free(phys_to_virt(args->serialized_data));
+
+	iommufd_set_ioas_mutable(ictx);
+	iommufd_ctx_put(ictx);
+}
+
+static int iommufd_liveupdate_retrieve(struct liveupdate_file_op_args *args)
+{
+	return -EOPNOTSUPP;
+}
+
+static bool iommufd_liveupdate_can_finish(struct liveupdate_file_op_args *args)
+{
+	return false;
+}
+
+static void iommufd_liveupdate_finish(struct liveupdate_file_op_args *args)
+{
+}
+
+static bool iommufd_liveupdate_can_preserve(struct liveupdate_file_handler *handler,
+					    struct file *file)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(file);
+
+	if (IS_ERR(ictx))
+		return false;
+
+	iommufd_ctx_put(ictx);
+	return true;
+}
+
+static struct liveupdate_file_ops iommufd_lu_file_ops = {
+	.can_preserve = iommufd_liveupdate_can_preserve,
+	.preserve = iommufd_liveupdate_preserve,
+	.unpreserve = iommufd_liveupdate_unpreserve,
+	.freeze = iommufd_liveupdate_freeze,
+	.retrieve = iommufd_liveupdate_retrieve,
+	.can_finish = iommufd_liveupdate_can_finish,
+	.finish = iommufd_liveupdate_finish,
+};
+
+static struct liveupdate_file_handler iommufd_lu_handler = {
+	.compatible = IOMMUFD_LUO_COMPATIBLE,
+	.ops = &iommufd_lu_file_ops,
+};
+
+int iommufd_liveupdate_register_lufs(void)
+{
+	int ret;
+
+	ret = liveupdate_register_file_handler(&iommufd_lu_handler);
+	if (ret)
+		return ret;
+
+	ret = iommu_liveupdate_register_flb(&iommufd_lu_handler);
+	if (ret)
+		liveupdate_unregister_file_handler(&iommufd_lu_handler);
+
+	return ret;
+}
+
+int iommufd_liveupdate_unregister_lufs(void)
+{
+	WARN_ON(iommu_liveupdate_unregister_flb(&iommufd_lu_handler));
+
+	return liveupdate_unregister_file_handler(&iommufd_lu_handler);
+}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index e1a9b3051f65..d7683244c67a 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -775,11 +775,21 @@ static int __init iommufd_init(void)
 		if (ret)
 			goto err_misc;
 	}
+
+	if (IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE)) {
+		ret = iommufd_liveupdate_register_lufs();
+		if (ret)
+			goto err_vfio_misc;
+	}
+
 	ret = iommufd_test_init();
 	if (ret)
-		goto err_vfio_misc;
+		goto err_lufs;
 	return 0;
 
+err_lufs:
+	if (IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE))
+		iommufd_liveupdate_unregister_lufs();
 err_vfio_misc:
 	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
 		misc_deregister(&vfio_misc_dev);
@@ -791,6 +801,8 @@ static int __init iommufd_init(void)
 static void __exit iommufd_exit(void)
 {
 	iommufd_test_exit();
+	if (IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE))
+		iommufd_liveupdate_unregister_lufs();
 	if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER))
 		misc_deregister(&vfio_misc_dev);
 	misc_deregister(&iommu_misc_dev);
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index dbe51ecb9a20..cc0e3265ba4e 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "double_span.h"
@@ -1420,6 +1421,7 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file,
 {
 	struct iopt_pages *pages;
+	int seals;
 
 	pages = iopt_alloc_pages(start_byte, length, writable);
 	if (IS_ERR(pages))
@@ -1427,6 +1429,12 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file,
 	pages->file = get_file(file);
 	pages->start = start - start_byte;
 	pages->type = IOPT_ADDRESS_FILE;
+
+	pages->seals = 0;
+	seals = memfd_get_seals(file);
+	if (seals > 0)
+		pages->seals = seals;
+
 	return pages;
 }
 
diff --git a/include/linux/kho/abi/iommufd.h b/include/linux/kho/abi/iommufd.h
new file mode 100644
index 000000000000..f7393ac78aa9
--- /dev/null
+++ b/include/linux/kho/abi/iommufd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2025, Google LLC
+ * Author: Samiullah Khawaja
+ */
+
+#ifndef _LINUX_KHO_ABI_IOMMUFD_H
+#define _LINUX_KHO_ABI_IOMMUFD_H
+
+#include
+#include
+#include
+
+/**
+ * DOC: IOMMUFD Live Update ABI
+ *
+ * This header defines the ABI for preserving the state of an IOMMUFD file
+ * across a kexec reboot using LUO.
+ *
+ * This interface is a contract. Any modification to any of the serialization
+ * structs defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the IOMMUFD_LUO_COMPATIBLE string.
+ */
+
+#define IOMMUFD_LUO_COMPATIBLE "iommufd-v1"
+
+struct iommufd_hwpt_lu {
+	u32 token;
+	u64 domain_data;
+	bool reclaimed;
+} __packed;
+
+struct iommufd_lu {
+	unsigned int nr_hwpts;
+	struct iommufd_hwpt_lu hwpts[];
+};
+
+#endif /* _LINUX_KHO_ABI_IOMMUFD_H */
-- 
2.53.0.rc2.204.g2597b5adb4-goog