Since the TLB Flush IPI is avoided when the modified mm is not running
on the target CPU, the next time the target CPU switches to the
modified mm it has to check the percpu TLB Flush queue and perform any
pending TLB Flush for the modified mm.

Signed-off-by: Xu Lu
---
 arch/riscv/mm/tlbflush.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index b5a2d9874d62b..0083fac87c2bc 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -365,8 +365,10 @@ void local_load_tlb_mm(struct mm_struct *mm)
 {
 	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
 	struct tlb_context *contexts = info->contexts;
+	struct tlb_flush_queue *queue = NULL;
 	struct mm_struct *victim = NULL;
-	unsigned int i, pos = 0, min = UINT_MAX;
+	unsigned int i, len, pos = 0, min = UINT_MAX;
+	unsigned long asid, start, size, stride;
 
 	for (i = 0; i < MAX_LOADED_MM; i++) {
 		if (contexts[i].mm == mm) {
@@ -387,11 +389,36 @@ void local_load_tlb_mm(struct mm_struct *mm)
 		mmgrab_lazy_mm(mm);
 		victim = contexts[pos].mm;
 		contexts[pos].mm = mm;
+		contexts[pos].need_flush = false;
+
+		queue = &info->flush_queues[pos];
+		atomic_set(&queue->len, 0);
+		queue->flag = 0;
 	}
 	contexts[pos].gen = new_tlb_gen(info);
 	write_unlock(&info->rwlock);
 
+	if (contexts[pos].need_flush) {
+		queue = &info->flush_queues[pos];
+		asid = get_mm_asid(mm);
+		if (queue->flag & FLUSH_TLB_ALL_ASID) {
+			local_flush_tlb_all_asid(asid);
+		} else {
+			len = atomic_read(&queue->len);
+			for (i = 0; i < len; i++) {
+				start = queue->tasks[i].start;
+				size = queue->tasks[i].size;
+				stride = queue->tasks[i].stride;
+				local_flush_tlb_range_asid(start, size,
+							   stride, asid);
+			}
+		}
+		contexts[pos].need_flush = false;
+		atomic_set(&queue->len, 0);
+		queue->flag = 0;
+	}
+
 	if (victim) {
 		cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(victim));
 		local_flush_tlb_all_asid(get_mm_asid(victim));
-- 
2.20.1
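
For context, here is a minimal userspace model of the queue-and-drain
scheme the commit message describes: the sender records the flush
arguments in the target CPU's queue instead of raising an IPI, and the
target drains the queue when it next loads the mm. This is a sketch
only; the queue layout, the FLUSH_TLB_ALL_ASID overflow policy, and all
names below (tlb_flush_queue, queue_flush, drain_flush_queue,
MAX_FLUSH_TASKS) are illustrative assumptions, not the kernel's actual
definitions.

/*
 * Userspace model of the deferred TLB flush queue. The kernel-side
 * structures and policies are assumed, not copied from this series.
 */
#include <stdatomic.h>
#include <stdio.h>

#define MAX_FLUSH_TASKS		8
#define FLUSH_TLB_ALL_ASID	0x1UL

struct flush_task {
	unsigned long start, size, stride;
};

struct tlb_flush_queue {
	atomic_int len;			/* number of queued flush tasks */
	unsigned long flag;		/* FLUSH_TLB_ALL_ASID once full */
	struct flush_task tasks[MAX_FLUSH_TASKS];
};

/* Producer: called instead of sending a flush IPI to the target CPU. */
static void queue_flush(struct tlb_flush_queue *q, unsigned long start,
			unsigned long size, unsigned long stride)
{
	int i = atomic_fetch_add(&q->len, 1);

	if (i >= MAX_FLUSH_TASKS) {
		/* Queue full: degrade to a full per-ASID flush. */
		q->flag |= FLUSH_TLB_ALL_ASID;
		return;
	}
	q->tasks[i] = (struct flush_task){ start, size, stride };
}

/* Consumer: called when the target CPU switches back to the mm. */
static void drain_flush_queue(struct tlb_flush_queue *q, unsigned long asid)
{
	if (q->flag & FLUSH_TLB_ALL_ASID) {
		printf("flush all, asid=%lu\n", asid);
	} else {
		int len = atomic_load(&q->len);

		for (int i = 0; i < len && i < MAX_FLUSH_TASKS; i++)
			printf("flush [%#lx, +%#lx) stride=%lu asid=%lu\n",
			       q->tasks[i].start, q->tasks[i].size,
			       q->tasks[i].stride, asid);
	}
	/* Reset, mirroring what the hunk above does after draining. */
	atomic_store(&q->len, 0);
	q->flag = 0;
}

int main(void)
{
	struct tlb_flush_queue q = { 0 };

	queue_flush(&q, 0x1000, 0x2000, 0x1000);	/* a two-page range */
	drain_flush_queue(&q, 42);
	return 0;
}

The overflow path matches the shape of the patch: once the queue cannot
hold another range, the whole ASID is flushed on the next switch, so
correctness never depends on the queue being large enough.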