Add a new config, IOMMU_DEBUG_PAGEALLOC, which registers new data with
page_ext. This config will be used by the IOMMU API to track pages mapped
in the IOMMU, to catch drivers that free kernel memory while they still
have it mapped in their domains, which can cause all kinds of memory
corruption.

This behaviour is disabled by default and can be enabled with the kernel
command-line parameter iommu.debug_pagealloc.

Signed-off-by: Mostafa Saleh
---
 .../admin-guide/kernel-parameters.txt |  6 ++++
 drivers/iommu/Kconfig                 | 19 +++++++++++
 drivers/iommu/Makefile                |  1 +
 drivers/iommu/iommu-debug-pagealloc.c | 32 +++++++++++++++++++
 include/linux/iommu-debug-pagealloc.h | 17 ++++++++++
 mm/page_ext.c                         |  4 +++
 6 files changed, 79 insertions(+)
 create mode 100644 drivers/iommu/iommu-debug-pagealloc.c
 create mode 100644 include/linux/iommu-debug-pagealloc.h

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6c42061ca20e..dddf435a1c11 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2557,6 +2557,12 @@
 			1 - Bypass the IOMMU for DMA.
 			unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
 
+	iommu.debug_pagealloc=
+			[KNL,EARLY] When CONFIG_IOMMU_DEBUG_PAGEALLOC is set,
+			this parameter enables the feature at boot time. By
+			default, it is disabled and the system behaves the
+			same way as a kernel built without
+			CONFIG_IOMMU_DEBUG_PAGEALLOC.
+
 	io7=		[HW] IO7 for Marvel-based Alpha systems
 			See comment before marvel_specify_io7 in
 			arch/alpha/kernel/core_marvel.c.
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..c9c1a1c1820e 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -383,4 +383,23 @@ config SPRD_IOMMU
 	  Say Y here if you want to use the multimedia devices listed above.
 
+config IOMMU_DEBUG_PAGEALLOC
+	bool "Debug IOMMU mappings against page allocations"
+	depends on DEBUG_PAGEALLOC && IOMMU_API && PAGE_EXTENSION
+	help
+	  This enables a consistency check between the kernel page allocator
+	  and the IOMMU subsystem. It verifies that pages being allocated or
+	  freed are not currently mapped in any IOMMU domain.
+
+	  This helps detect DMA use-after-free bugs where a driver frees a
+	  page but forgets to unmap it from the IOMMU, potentially allowing
+	  a device to overwrite memory that the kernel has repurposed.
+
+	  These checks are best-effort and may not detect all problems.
+
+	  Due to performance overhead, this feature is disabled by default.
+	  You must enable "iommu.debug_pagealloc" from the kernel command
+	  line to activate the runtime checks.
+
+	  If unsure, say N.
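A note on the page_ext mechanism this patch builds on (not part of the
series): page_ext attaches out-of-line metadata to every struct page, and a
client reserves its slot by adding a struct page_ext_operations entry to
page_ext_ops[], as done above. The sketch below shows how such a client
reaches its per-page data at runtime; iommu_debug_read_ref() is a made-up
helper name, and it assumes the struct and ops from the patch are visible.
Later patches in the series add equivalent accessors.

	#include <linux/mm.h>
	#include <linux/page_ext.h>

	/* Hypothetical accessor: read this client's per-page refcount. */
	static int iommu_debug_read_ref(struct page *page)
	{
		/* page_ext_get() looks up the extension block under RCU. */
		struct page_ext *page_ext = page_ext_get(page);
		struct iommu_debug_metadata *d;
		int ref;

		if (!page_ext)
			return 0;

		/* page_ext_data() offsets into the block by the ops' slot. */
		d = page_ext_data(page_ext, &page_iommu_debug_ops);
		ref = atomic_read(&d->ref);
		page_ext_put(page_ext);

		return ref;
	}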
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 355294fa9033..8f5130b6a671 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -34,3 +34,4 @@ obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
 obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
 obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
 obj-$(CONFIG_APPLE_DART) += apple-dart.o
+obj-$(CONFIG_IOMMU_DEBUG_PAGEALLOC) += iommu-debug-pagealloc.o
diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
new file mode 100644
index 000000000000..4022e9af7f27
--- /dev/null
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 - Google Inc
+ * Author: Mostafa Saleh
+ * IOMMU API debug page alloc sanitizer
+ */
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kstrtox.h>
+#include <linux/page_ext.h>
+
+static bool needed;
+
+struct iommu_debug_metadata {
+	atomic_t ref;
+};
+
+static __init bool need_iommu_debug(void)
+{
+	return needed;
+}
+
+struct page_ext_operations page_iommu_debug_ops = {
+	.size = sizeof(struct iommu_debug_metadata),
+	.need = need_iommu_debug,
+};
+
+static int __init iommu_debug_pagealloc(char *str)
+{
+	return kstrtobool(str, &needed);
+}
+early_param("iommu.debug_pagealloc", iommu_debug_pagealloc);
diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h
new file mode 100644
index 000000000000..83e64d70bf6c
--- /dev/null
+++ b/include/linux/iommu-debug-pagealloc.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 - Google Inc
+ * Author: Mostafa Saleh
+ * IOMMU API debug page alloc sanitizer
+ */
+
+#ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H
+#define __LINUX_IOMMU_DEBUG_PAGEALLOC_H
+
+#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
+
+extern struct page_ext_operations page_iommu_debug_ops;
+
+#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
+
+#endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */
diff --git a/mm/page_ext.c b/mm/page_ext.c
index d7396a8970e5..297e4cd8ce90 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -11,6 +11,7 @@
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
 #include <linux/rcupdate.h>
+#include <linux/iommu-debug-pagealloc.h>
 
 /*
  * struct page extension
@@ -89,6 +90,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
 #ifdef CONFIG_PAGE_TABLE_CHECK
 	&page_table_check_ops,
 #endif
+#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
+	&page_iommu_debug_ops,
+#endif
 };
 
 unsigned long page_ext_size;
-- 
2.52.0.460.gd25c4c69ec-goog

Add calls for the new iommu debug config IOMMU_DEBUG_PAGEALLOC:
- iommu_debug_init: Enable the debug mode if configured by the user.
- iommu_debug_map: Track IOMMU pages mapped, using the physical address.
- iommu_debug_unmap_begin: Track the start of an IOMMU unmap operation,
  with IOVA and size.
- iommu_debug_unmap_end: Track the end of an unmap operation, passing the
  actual unmapped size versus the one tracked at unmap_begin.

We have to split unmap into begin/end because once pages are unmapped we
lose the physical address information. This is racy, but the API is racy
by construction: it uses refcounts and doesn't attempt to lock/synchronize
with the IOMMU API, as that would be costly. This means false negatives
are possible.

Signed-off-by: Mostafa Saleh
---
 drivers/iommu/iommu-debug-pagealloc.c | 26 +++++++++++++
 drivers/iommu/iommu.c                 | 12 +++++-
 include/linux/iommu-debug-pagealloc.h | 56 +++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 4022e9af7f27..5665fd7360e0 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -5,11 +5,13 @@
  * IOMMU API debug page alloc sanitizer
  */
 #include <linux/atomic.h>
+#include <linux/iommu.h>
 #include <linux/init.h>
 #include <linux/kstrtox.h>
 #include <linux/page_ext.h>
 
 static bool needed;
+DEFINE_STATIC_KEY_FALSE(iommu_debug_initialized);
 
 struct iommu_debug_metadata {
 	atomic_t ref;
@@ -25,6 +27,30 @@ struct page_ext_operations page_iommu_debug_ops = {
 	.need = need_iommu_debug,
 };
 
+void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
+{
+}
+
+void __iommu_debug_unmap_begin(struct iommu_domain *domain,
+			       unsigned long iova, size_t size)
+{
+}
+
+void __iommu_debug_unmap_end(struct iommu_domain *domain,
+			     unsigned long iova, size_t size,
+			     size_t unmapped)
+{
+}
+
+void iommu_debug_init(void)
+{
+	if (!needed)
+		return;
+
+	pr_info("iommu: Debugging page allocations, expect overhead or disable iommu.debug_pagealloc\n");
+	static_branch_enable(&iommu_debug_initialized);
+}
+
 static int __init iommu_debug_pagealloc(char *str)
 {
 	return kstrtobool(str, &needed);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 59244c744eab..4d1dc321e400 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/host1x_context_bus.h>
 #include <linux/iommu.h>
+#include <linux/iommu-debug-pagealloc.h>
 #include <linux/idr.h>
 #include <linux/err.h>
 #include <linux/pci.h>
@@ -231,6 +232,8 @@ static int __init iommu_subsys_init(void)
 	if (!nb)
 		return -ENOMEM;
 
+	iommu_debug_init();
+
 	for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
 		nb[i].notifier_call = iommu_bus_notifier;
 		bus_register_notifier(iommu_buses[i], &nb[i]);
@@ -2544,10 +2547,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
 	}
 
 	/* unroll mapping in case something went wrong */
-	if (ret)
+	if (ret) {
 		iommu_unmap(domain, orig_iova, orig_size - size);
-	else
+	} else {
 		trace_map(orig_iova, orig_paddr, orig_size);
+		iommu_debug_map(domain, orig_paddr, orig_size);
+	}
 
 	return ret;
 }
@@ -2609,6 +2614,8 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 
 	pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size);
 
+	iommu_debug_unmap_begin(domain, iova, size);
+
 	/*
 	 * Keep iterating until we either unmap 'size' bytes (or more)
 	 * or we hit an area that isn't mapped.
	 */
@@ -2629,6 +2636,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 	}
 
 	trace_unmap(orig_iova, size, unmapped);
+	iommu_debug_unmap_end(domain, orig_iova, size, unmapped);
 	return unmapped;
 }
diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h
index 83e64d70bf6c..454303ec09c2 100644
--- a/include/linux/iommu-debug-pagealloc.h
+++ b/include/linux/iommu-debug-pagealloc.h
@@ -8,10 +8,66 @@
 #ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H
 #define __LINUX_IOMMU_DEBUG_PAGEALLOC_H
 
+struct iommu_domain;
+
 #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
 
+DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized);
+
 extern struct page_ext_operations page_iommu_debug_ops;
 
+void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys,
+		       size_t size);
+void __iommu_debug_unmap_begin(struct iommu_domain *domain,
+			       unsigned long iova, size_t size);
+void __iommu_debug_unmap_end(struct iommu_domain *domain,
+			     unsigned long iova, size_t size, size_t unmapped);
+
+static inline void iommu_debug_map(struct iommu_domain *domain,
+				   phys_addr_t phys, size_t size)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_map(domain, phys, size);
+}
+
+static inline void iommu_debug_unmap_begin(struct iommu_domain *domain,
+					   unsigned long iova, size_t size)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_unmap_begin(domain, iova, size);
+}
+
+static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
+					 unsigned long iova, size_t size,
+					 size_t unmapped)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_unmap_end(domain, iova, size, unmapped);
+}
+
+void iommu_debug_init(void);
+
+#else
+static inline void iommu_debug_map(struct iommu_domain *domain,
+				   phys_addr_t phys, size_t size)
+{
+}
+
+static inline void iommu_debug_unmap_begin(struct iommu_domain *domain,
+					   unsigned long iova, size_t size)
+{
+}
+
+static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
+					 unsigned long iova, size_t size,
+					 size_t unmapped)
+{
+}
+
+static inline void iommu_debug_init(void)
+{
+}
+
 #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
 
 #endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */
-- 
2.52.0.460.gd25c4c69ec-goog
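To make the call flow concrete, here is a sketch (not part of the series)
of how a consumer's map/unmap cycle passes through the new hooks; the
domain, IOVA, and sizes are arbitrary, and demo_map_unmap() is a made-up
name:

	#include <linux/iommu.h>
	#include <linux/mm.h>
	#include <linux/sizes.h>

	/* Hypothetical example: map one page, then unmap it. */
	static int demo_map_unmap(struct iommu_domain *domain, struct page *p)
	{
		unsigned long iova = 0x10000000;	/* arbitrary, unused IOVA */
		int ret;

		/*
		 * On success, iommu_map() ends in iommu_debug_map(), which
		 * is a no-op unless iommu_debug_init() flipped the
		 * iommu_debug_initialized static key at boot.
		 */
		ret = iommu_map(domain, iova, page_to_phys(p), SZ_4K,
				IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
		if (ret)
			return ret;

		/*
		 * __iommu_unmap() brackets the driver call with
		 * iommu_debug_unmap_begin() (while iova_to_phys() can still
		 * resolve the physical pages) and iommu_debug_unmap_end()
		 * (to reconcile if the driver unmapped more or less than
		 * requested).
		 */
		iommu_unmap(domain, iova, SZ_4K);
		return 0;
	}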
Using the new calls, use an atomic refcount to track how many times a page
is mapped in any of the IOMMUs. For unmap we need to use iova_to_phys() to
get the physical address of the pages.

We use the smallest supported page size as the granularity of tracking per
domain. This is important as it is possible to map pages and then unmap
them with larger sizes (as in the map_sg() case).

Signed-off-by: Mostafa Saleh
---
 drivers/iommu/iommu-debug-pagealloc.c | 91 +++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 5665fd7360e0..c420b006ef08 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -27,19 +27,110 @@ struct page_ext_operations page_iommu_debug_ops = {
 	.need = need_iommu_debug,
 };
 
+static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
+{
+	struct page *page = phys_to_page(phys);
+	struct page_ext *page_ext = page_ext_get(page);
+
+	return page_ext;
+}
+
+static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
+{
+	return page_ext_data(page_ext, &page_iommu_debug_ops);
+}
+
+static void iommu_debug_inc_page(phys_addr_t phys)
+{
+	struct page_ext *page_ext = get_iommu_page_ext(phys);
+	struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+	WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
+	page_ext_put(page_ext);
+}
+
+static void iommu_debug_dec_page(phys_addr_t phys)
+{
+	struct page_ext *page_ext = get_iommu_page_ext(phys);
+	struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+	WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
+	page_ext_put(page_ext);
+}
+
+/*
+ * The IOMMU page size doesn't have to match the CPU page size. So, we use
+ * the smallest IOMMU page size to refcount the pages in the vmemmap.
+ * That is important as both map and unmap have to use the same page size
+ * to update the refcount, to avoid double counting the same page.
+ * And as we can't know from iommu_unmap() which page size was originally
+ * used for map, we just use the minimum supported one for both.
+ */
+static size_t iommu_debug_page_size(struct iommu_domain *domain)
+{
+	return 1UL << __ffs(domain->pgsize_bitmap);
+}
+
 void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
 {
+	size_t off, end;
+	size_t page_size = iommu_debug_page_size(domain);
+
+	if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
+		return;
+
+	for (off = 0; off < size; off += page_size) {
+		if (!pfn_valid(__phys_to_pfn(phys + off)))
+			continue;
+		iommu_debug_inc_page(phys + off);
+	}
+}
+
+static void __iommu_debug_update_iova(struct iommu_domain *domain,
+				      unsigned long iova, size_t size, bool inc)
+{
+	size_t off, end;
+	size_t page_size = iommu_debug_page_size(domain);
+
+	if (WARN_ON(check_add_overflow(iova, size, &end)))
+		return;
+
+	for (off = 0; off < size; off += page_size) {
+		phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
+
+		if (!phys || !pfn_valid(__phys_to_pfn(phys)))
+			continue;
+
+		if (inc)
+			iommu_debug_inc_page(phys);
+		else
+			iommu_debug_dec_page(phys);
+	}
 }
 
 void __iommu_debug_unmap_begin(struct iommu_domain *domain,
 			       unsigned long iova, size_t size)
 {
+	__iommu_debug_update_iova(domain, iova, size, false);
 }
 
 void __iommu_debug_unmap_end(struct iommu_domain *domain,
 			     unsigned long iova, size_t size,
 			     size_t unmapped)
 {
+	if (unmapped == size)
+		return;
+
+	/*
+	 * If unmap failed, re-increment the refcount, but if it unmapped
+	 * a larger size, decrement the extra part.
+	 */
+	if (unmapped < size)
+		__iommu_debug_update_iova(domain, iova + unmapped,
+					  size - unmapped, true);
+	else
+		__iommu_debug_update_iova(domain, iova + size,
+					  unmapped - size, false);
 }
 
 void iommu_debug_init(void)
-- 
2.52.0.460.gd25c4c69ec-goog
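A worked example of the granularity rule above, with hypothetical numbers:
for a domain whose pgsize_bitmap is SZ_4K | SZ_2M, the tracking granule is
4KiB, so mapping a 2MiB region increments 512 per-page refcounts and
unmapping the same region decrements the same 512, even if the IOMMU
driver used a single 2MiB block PTE. The helper names below are made up:

	#include <linux/bitops.h>
	#include <linux/iommu.h>
	#include <linux/sizes.h>

	/* Same derivation as iommu_debug_page_size() in the patch above. */
	static size_t demo_tracking_granule(struct iommu_domain *domain)
	{
		/* e.g. __ffs(SZ_4K | SZ_2M) == 12, so the granule is 4KiB */
		return 1UL << __ffs(domain->pgsize_bitmap);
	}

	/* Refcount updates for one mapping: SZ_2M / SZ_4K == 512 */
	static unsigned long demo_update_count(struct iommu_domain *domain,
					       size_t size)
	{
		return size / demo_tracking_granule(domain);
	}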
Now, as the page_ext holds the count of IOMMU mappings, we can use it to
assert that any page being allocated or freed is indeed not mapped in the
IOMMU.

The sanitizer doesn't protect against mapping/unmapping during this
period. However, that's less harmful as the page is not used by the
kernel.

Signed-off-by: Mostafa Saleh
---
 drivers/iommu/iommu-debug-pagealloc.c | 23 +++++++++++++++++++++++
 include/linux/iommu-debug-pagealloc.h | 12 ++++++++++++
 include/linux/mm.h                    |  5 +++++
 3 files changed, 40 insertions(+)
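For illustration, this is the bug class the assertion is meant to catch
(hypothetical driver code, not from any real driver): the page is freed
while still mapped, leaving the device able to DMA into memory the
allocator may hand out again. With the hooks below in place and
iommu.debug_pagealloc enabled, the free trips the WARN in
__iommu_debug_check_unmapped():

	#include <linux/gfp.h>
	#include <linux/iommu.h>
	#include <linux/mm.h>
	#include <linux/sizes.h>

	/* Hypothetical buggy teardown: the ordering is the bug. */
	static void buggy_teardown(struct iommu_domain *domain,
				   unsigned long iova, struct page *p)
	{
		/* BUG: "p" is still mapped at "iova" in "domain" */
		__free_page(p);

		/* This unmap should have happened before the free. */
		iommu_unmap(domain, iova, SZ_4K);
	}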
diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index c420b006ef08..8af11630d0e3 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/kstrtox.h>
 #include <linux/page_ext.h>
+#include <linux/page_owner.h>
 
 static bool needed;
 DEFINE_STATIC_KEY_FALSE(iommu_debug_initialized);
@@ -71,6 +72,28 @@ static size_t iommu_debug_page_size(struct iommu_domain *domain)
 	return 1UL << __ffs(domain->pgsize_bitmap);
 }
 
+static bool iommu_debug_page_count(const struct page *page)
+{
+	unsigned int ref;
+	struct page_ext *page_ext = page_ext_get(page);
+	struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+	ref = atomic_read(&d->ref);
+	page_ext_put(page_ext);
+	return ref != 0;
+}
+
+void __iommu_debug_check_unmapped(const struct page *page, int numpages)
+{
+	while (numpages--) {
+		if (WARN_ON(iommu_debug_page_count(page))) {
+			pr_warn("iommu: Detected page leak!\n");
+			dump_page_owner(page);
+		}
+		page++;
+	}
+}
+
 void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
 {
 	size_t off, end;
diff --git a/include/linux/iommu-debug-pagealloc.h b/include/linux/iommu-debug-pagealloc.h
index 454303ec09c2..87f593305465 100644
--- a/include/linux/iommu-debug-pagealloc.h
+++ b/include/linux/iommu-debug-pagealloc.h
@@ -22,6 +22,7 @@ void __iommu_debug_unmap_begin(struct iommu_domain *domain,
 			       unsigned long iova, size_t size);
 void __iommu_debug_unmap_end(struct iommu_domain *domain,
 			     unsigned long iova, size_t size, size_t unmapped);
+void __iommu_debug_check_unmapped(const struct page *page, int numpages);
 
 static inline void iommu_debug_map(struct iommu_domain *domain,
 				   phys_addr_t phys, size_t size)
@@ -45,6 +46,12 @@ static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
 		__iommu_debug_unmap_end(domain, iova, size, unmapped);
 }
 
+static inline void iommu_debug_check_unmapped(const struct page *page,
+					      int numpages)
+{
+	if (static_branch_unlikely(&iommu_debug_initialized))
+		__iommu_debug_check_unmapped(page, numpages);
+}
+
 void iommu_debug_init(void);
 
 #else
@@ -68,6 +75,11 @@ static inline void iommu_debug_init(void)
 {
 }
 
+static inline void iommu_debug_check_unmapped(const struct page *page,
+					      int numpages)
+{
+}
+
 #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
 
 #endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..739cd3e168d7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,6 +36,7 @@
 #include <linux/kasan.h>
 #include <linux/memremap.h>
 #include <linux/slab.h>
+#include <linux/iommu-debug-pagealloc.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -3818,12 +3819,16 @@ extern void __kernel_map_pages(struct page *page, int numpages, int enable);
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
 {
+	iommu_debug_check_unmapped(page, numpages);
+
 	if (debug_pagealloc_enabled_static())
 		__kernel_map_pages(page, numpages, 1);
 }
 
 static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
 {
+	iommu_debug_check_unmapped(page, numpages);
+
 	if (debug_pagealloc_enabled_static())
 		__kernel_map_pages(page, numpages, 0);
 }
-- 
2.52.0.460.gd25c4c69ec-goog
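Putting the series together, a sketch of an end-to-end check (hypothetical
test code, not part of the series): with a kernel built with
CONFIG_DEBUG_PAGEALLOC and CONFIG_IOMMU_DEBUG_PAGEALLOC and booted with
iommu.debug_pagealloc=1, freeing a page that is still mapped is flagged on
the free path; dump_page_owner() output additionally requires
CONFIG_PAGE_OWNER and page_owner=on.

	#include <linux/gfp.h>
	#include <linux/iommu.h>
	#include <linux/mm.h>
	#include <linux/sizes.h>

	/* Hypothetical test: provoke the detector on purpose. */
	static void demo_detect_leak(struct iommu_domain *domain)
	{
		unsigned long iova = 0x10000000;	/* arbitrary free IOVA */
		struct page *p = alloc_page(GFP_KERNEL);

		if (!p)
			return;

		if (iommu_map(domain, iova, page_to_phys(p), SZ_4K,
			      IOMMU_READ | IOMMU_WRITE, GFP_KERNEL)) {
			__free_page(p);
			return;
		}

		/*
		 * Deliberate bug: the page is freed while still mapped. The
		 * free path reaches debug_pagealloc_unmap_pages() ->
		 * iommu_debug_check_unmapped(), which sees a non-zero
		 * refcount, WARNs, and prints "iommu: Detected page leak!".
		 */
		__free_page(p);

		/* Clean up so the domain is consistent again. */
		iommu_unmap(domain, iova, SZ_4K);
	}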