madvise(MADV_HWPOISON) can trigger a recursive spinlock self-deadlock (AA deadlock) on hugetlb_lock due to a race with concurrent folio unmapping. The race scenario: Thread 1 (madvise MADV_HWPOISON) Thread 2 (unmap) ------------------------------- ----------------- madvise_inject_error() get_user_pages_fast() <- refcount++ memory_failure(MF_COUNT_INCREASED) get_huge_page_for_hwpoison() spin_lock_irq(&hugetlb_lock) // refcount == 2 (gup + map) // MF_COUNT_INCREASED path: count_increased = true zap_pte_range() page_remove_rmap() put_page() <- drops map ref // refcount: 2 -> 1 hugetlb_update_hwpoison() -> MF_HUGETLB_FOLIO_PRE_POISONED -> goto out out: folio_put(folio) <- drops gup ref // refcount: 1 -> 0 free_huge_folio() spin_lock_irq(&hugetlb_lock) <- AA DEADLOCK When Thread 2's put_page() drops the mapping reference while Thread 1 holds hugetlb_lock, the folio refcount drops to 1. The subsequent folio_put() at the out: label frees the folio, and free_huge_folio() attempts to re-acquire the non-recursive hugetlb_lock on the same CPU, resulting in an AA self-deadlock. The same deadlock can also occur on the folio_try_get() path: when a migratable folio is found and folio_try_get() succeeds (refcount rises to refcount+1), a concurrent unmap and a hugetlb_update_hwpoison() returning pre-poisoned status will land at out: where folio_put() again may free the folio under hugetlb_lock. Fix this by removing the hugetlb_lock wrapper from hugetlb.c and moving the lock acquisition directly into get_huge_page_for_hwpoison() (formerly __get_huge_page_for_hwpoison) in memory-failure.c. Place spin_unlock_irq() before folio_put() at the out: label so that the folio is always released outside the lock, preventing any recursive lock acquisition. Remove the now-incorrect "Called from hugetlb code with hugetlb_lock held" comment, and update the stale __get_huge_page_for_hwpoison declarations in include/linux/mm.h. Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()") Signed-off-by: Wupeng Ma --- include/linux/hugetlb.h | 8 -------- include/linux/mm.h | 8 -------- mm/hugetlb.c | 11 ----------- mm/memory-failure.c | 8 ++++---- 4 files changed, 4 insertions(+), 31 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65910437be1ca..aa3eb42e0a01a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); -int get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared); void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -422,12 +420,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, return 0; } -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared) -{ - return 0; -} - static inline void folio_putback_hugetlb(struct folio *folio) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f064..46e5936dabaa8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4602,8 +4602,6 @@ extern int soft_offline_page(unsigned long pfn, int flags); */ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else @@ -4611,12 +4609,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags) { } -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared) -{ - return 0; -} - static inline void num_poisoned_pages_inc(unsigned long pfn) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 327eaa4074d39..4c99bb868ad08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7170,17 +7170,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison return ret; } -int get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared) -{ - int ret; - - spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); - spin_unlock_irq(&hugetlb_lock); - return ret; -} - /** * folio_putback_hugetlb - unisolate a hugetlb folio * @folio: the isolated hugetlb folio diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ee42d43613097..28522180cf7f8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1966,10 +1966,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) folio_free_raw_hwp(folio, true); } -/* - * Called from hugetlb code with hugetlb_lock held. - */ -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); @@ -1977,6 +1974,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool count_increased = false; int ret, rc; + spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio)) { ret = MF_HUGETLB_NON_HUGEPAGE; goto out; @@ -2013,8 +2011,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, *migratable_cleared = true; } + spin_unlock_irq(&hugetlb_lock); return ret; out: + spin_unlock_irq(&hugetlb_lock); if (count_increased) folio_put(folio); return ret; -- 2.43.0