While we handle pte_lockptr() == pmd_lockptr() correctly in
zap_pte_table_if_empty(), we don't handle it in zap_empty_pte_table():
the caller already holds that shared lock, so the spin_trylock() always
fails and forces us onto the slow path.

So let's handle the scenario where pte_lockptr() == pmd_lockptr()
better, by passing the held PTE lock down and only taking the PMD lock
when it is actually a different lock. This scenario can only happen if
CONFIG_SPLIT_PTE_PTLOCKS is not set.

This is only relevant once we enable CONFIG_PT_RECLAIM on architectures
other than x86-64.

Signed-off-by: David Hildenbrand (Red Hat)
---
 mm/memory.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index c3055b2577c27..3852075ea62d4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1833,16 +1833,18 @@ static bool pte_table_reclaim_possible(unsigned long start, unsigned long end,
 	return details && details->reclaim_pt && (end - start >= PMD_SIZE);
 }
 
-static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd,
+		spinlock_t *ptl, pmd_t *pmdval)
 {
 	spinlock_t *pml = pmd_lockptr(mm, pmd);
 
-	if (!spin_trylock(pml))
+	if (ptl != pml && !spin_trylock(pml))
 		return false;
 
 	*pmdval = pmdp_get(pmd);
 	pmd_clear(pmd);
-	spin_unlock(pml);
+	if (ptl != pml)
+		spin_unlock(pml);
 	return true;
 }
 
@@ -1934,7 +1936,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	 * from being repopulated by another thread.
 	 */
 	if (can_reclaim_pt && direct_reclaim && addr == end)
-		direct_reclaim = zap_empty_pte_table(mm, pmd, &pmdval);
+		direct_reclaim = zap_empty_pte_table(mm, pmd, ptl, &pmdval);
 
 	add_mm_rss_vec(mm, rss);
 	lazy_mmu_mode_disable();
-- 
2.52.0
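
For context, a minimal sketch of why the trylock could never succeed
here (paraphrased along the lines of include/linux/mm.h; the exact
definitions and config guards vary by kernel version):

    /* Without split PTE ptlocks, all page tables of an MM share one lock. */
    static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
    {
            return &mm->page_table_lock;
    }

    /* Split PMD ptlocks depend on split PTE ptlocks, so this is shared too. */
    static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
    {
            return &mm->page_table_lock;
    }

In that configuration, zap_pte_range() already holds this lock as ptl
when calling zap_empty_pte_table(), so spin_trylock(pml) on the very
same (already held) lock always fails. Comparing ptl against pml first
lets us skip both the trylock and the unlock.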