From b21478b5333e2bf48391914d109bfd97a50d5203 Mon Sep 17 00:00:00 2001
From: Subrata Nath
Date: Sat, 9 Aug 2025 11:08:30 +0000
Subject: [PATCH] mm: prevent RCU stalls in kswapd by adding cond_resched()

Based on: v6.1.128

The kswapd0 thread can spend extended time in page_vma_mapped_walk() ->
queued_spin_lock_slowpath() without yielding the CPU. Even with
CONFIG_PREEMPTION=y, the rcu_preempt kthread cannot preempt kswapd0
because preemption and interrupts are disabled while the spinlock is
held.

Example stall report:

  rcu: INFO: rcu_preempt self-detected stall on CPU
  rcu: rcu_preempt kthread starved for 65939907 jiffies!
  Call trace:
   queued_spin_lock_slowpath
   page_vma_mapped_walk
   folio_referenced_one
   kswapd

Similar stalls occur in shrink_zones(), where long-running reclaim loops
prevent CPUs from reporting a quiescent state during the RCU grace
period. Without such reports, RCU stall warnings can escalate to soft
lockups or OOM kills.

A quiescent state is reported when a CPU exits an RCU read-side critical
section, enters idle or user mode, performs a context switch, or
voluntarily reschedules.

Fix this by adding cond_resched() after the spinlock release points in
page_vma_mapped_walk() and in the reclaim loops of shrink_zones() and
do_try_to_free_pages(). These calls sit outside the spinlock-held
sections, so they allow voluntary scheduling and ensure timely
quiescent-state reporting, avoiding prolonged RCU stalls.

Signed-off-by: Subrata Nath
---
 mm/page_vma_mapped.c | 3 +++
 mm/vmscan.c          | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17..7775c151f 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -234,6 +234,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			}
 			/* THP pmd was split under us: handle on pte level */
 			spin_unlock(pvmw->ptl);
+			cond_resched();
 			pvmw->ptl = NULL;
 		} else if (!pmd_present(pmde)) {
 			/*
@@ -247,6 +248,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 				spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
 
 				spin_unlock(ptl);
+				cond_resched();
 			}
 			step_forward(pvmw, PMD_SIZE);
 			continue;
@@ -265,6 +267,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 		if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
 			if (pvmw->ptl) {
 				spin_unlock(pvmw->ptl);
+				cond_resched();
 				pvmw->ptl = NULL;
 			}
 			pte_unmap(pvmw->pte);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be863204d..02064b4fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6415,6 +6415,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			continue;
 		last_pgdat = zone->zone_pgdat;
 		shrink_node(zone->zone_pgdat, sc);
+		cond_resched();
 	}
 
 	if (first_pgdat)
@@ -6490,6 +6491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		if (sc->priority < DEF_PRIORITY - 2)
 			sc->may_writepage = 1;
+		cond_resched();
 	} while (--sc->priority >= 0);
 
 	last_pgdat = NULL;
@@ -6508,6 +6510,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					zone->zone_pgdat);
 			clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
 		}
+		cond_resched();
 	}
 
 	delayacct_freepages_end();
-- 
2.34.1
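
Note for reviewers (illustration only, not part of the patch): the change
applies the common "unlock, then cond_resched()" pattern. The sketch below
shows that pattern in isolation, using a hypothetical scan_items() walker
and a hypothetical item_lock; only spin_lock()/spin_unlock() and
cond_resched() are real kernel interfaces, and the snippet only compiles
in a kernel build context.

	#include <linux/spinlock.h>
	#include <linux/sched.h>

	/* Hypothetical lock protecting a long item scan. */
	static DEFINE_SPINLOCK(item_lock);

	static void scan_items(unsigned long nr_items)
	{
		unsigned long i;

		for (i = 0; i < nr_items; i++) {
			/* spin_lock() disables preemption until the unlock. */
			spin_lock(&item_lock);
			/* ... examine one item under the lock ... */
			spin_unlock(&item_lock);

			/*
			 * Outside the critical section a voluntary reschedule
			 * is safe; the resulting context switch counts as a
			 * quiescent state, so the RCU grace period can make
			 * progress even if the scan runs for a long time.
			 */
			cond_resched();
		}
	}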