This feature avoids unnecessary TLB Flush IPIs. After memory mapping modifications on certain mm_struct, instead of sending IPIs, this feature records the TLB Flush information on a percpu buffer, and defers the TLB Flush to the moment when target CPUs really load this mm_struct. Signed-off-by: Xu Lu --- arch/riscv/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 22cda9c452d2a..d219c7f4b129e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -925,6 +925,18 @@ config RISCV_VECTOR_MISALIGNED help Enable detecting support for vector misaligned loads and stores. +config RISCV_LAZY_TLB_FLUSH + bool "Defer TLB Flush to context switch to avoid IPIs" + depends on MMU && SMP + default n + help + This feature avoids unnecessary TLB Flush IPIs. After memory mapping + modifications on certain mm_struct, instead of sending IPIs, this feature + records the TLB Flush information on a percpu buffer, and defers the TLB Flush + to the moment when target CPUs really load this mm_struct. + + If unsure what to do here, say N. + choice prompt "Unaligned Accesses Support" default RISCV_PROBE_UNALIGNED_ACCESS -- 2.20.1 Since each CPU has limited TLB entries, there exist limited active ASIDs in each CPU's TLB at the same time. Thus we apply a threshold here. When a mm_struct is loaded, we mark its ASID as active. If the number of active ASIDs exceeds the threshold, we evict the mm_struct that has not been used for the longest time, flush its TLB entries, mark its ASID inactive, and clear current CPU in its mm_cpumask. 
Signed-off-by: Xu Lu --- arch/riscv/include/asm/tlbflush.h | 27 +++++++++++++ arch/riscv/mm/context.c | 1 + arch/riscv/mm/tlbflush.c | 66 +++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index eed0abc405143..3f83fd5ef36db 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -66,6 +66,33 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); extern unsigned long tlb_flush_all_threshold; + +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH + +#define MAX_LOADED_MM 6 + +struct tlb_context { + struct mm_struct *mm; + unsigned int gen; +}; + +struct tlb_info { + rwlock_t rwlock; + struct mm_struct *active_mm; + unsigned int next_gen; + struct tlb_context contexts[MAX_LOADED_MM]; +}; + +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo); + +void local_load_tlb_mm(struct mm_struct *mm); + +#else /* CONFIG_RISCV_LAZY_TLB_FLUSH */ + +static inline void local_load_tlb_mm(struct mm_struct *mm) {} + +#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ + #else /* CONFIG_MMU */ #define local_flush_tlb_all() do { } while (0) #endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 55c20ad1f7444..a7cf36ad34678 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -217,6 +217,7 @@ static inline void set_mm(struct mm_struct *prev, */ cpumask_set_cpu(cpu, mm_cpumask(next)); if (static_branch_unlikely(&use_asid_allocator)) { + local_load_tlb_mm(next); set_mm_asid(next, cpu); } else { cpumask_clear_cpu(cpu, mm_cpumask(prev)); diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 8404530ec00f9..0b1c21c7aafb8 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -103,6 +103,15 @@ struct flush_tlb_range_data { unsigned long stride; }; +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH 
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo) = { + .rwlock = __RW_LOCK_UNLOCKED(tlbinfo.rwlock), + .active_mm = NULL, + .next_gen = 1, + .contexts = { { NULL, 0, }, }, +}; +#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ + static void __ipi_flush_tlb_range_asid(void *info) { struct flush_tlb_range_data *d = info; @@ -240,3 +249,60 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE); cpumask_clear(&batch->cpumask); } + +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH + +static inline unsigned int new_tlb_gen(struct tlb_info *info) +{ + unsigned int gen = info->next_gen++; + unsigned int i; + + if (unlikely(!info->next_gen)) { + for (i = 0; i < MAX_LOADED_MM; i++) { + if (info->contexts[i].gen) + info->contexts[i].gen = 1; + } + info->next_gen = 1; + gen = info->next_gen++; + } + + return gen; +} + +void local_load_tlb_mm(struct mm_struct *mm) +{ + struct tlb_info *info = this_cpu_ptr(&tlbinfo); + struct tlb_context *contexts = info->contexts; + struct mm_struct *victim = NULL; + unsigned int i, pos = 0, min = UINT_MAX; + + for (i = 0; i < MAX_LOADED_MM; i++) { + if (contexts[i].mm == mm) { + pos = i; + break; + } + if (min > contexts[i].gen) { + min = contexts[i].gen; + pos = i; + } + } + + write_lock(&info->rwlock); + + info->active_mm = mm; + + if (contexts[pos].mm != mm) { + victim = contexts[pos].mm; + contexts[pos].mm = mm; + } + contexts[pos].gen = new_tlb_gen(info); + + write_unlock(&info->rwlock); + + if (victim) { + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(victim)); + local_flush_tlb_all_asid(get_mm_asid(victim)); + } +} + +#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ -- 2.20.1 We maintain an array of mm_structs whose ASIDs are active on the current CPU. To avoid these mm_structs getting released, we grab their mm_count before loading them into the array, and drop their mm_count via a tasklet when they are evicted out of the array. 
Signed-off-by: Xu Lu --- arch/riscv/include/asm/mmu.h | 4 +++ arch/riscv/mm/tlbflush.c | 47 ++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index cf8e6eac77d52..913fa535b3d19 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -30,6 +30,10 @@ typedef struct { #ifdef CONFIG_RISCV_ISA_SUPM u8 pmlen; #endif +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH + atomic_t lazy_tlb_cnt; + void *next; +#endif } mm_context_t; /* Lock the pointer masking mode because this mm is multithreaded */ diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 0b1c21c7aafb8..4b2ce06cbe6bd 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -104,12 +104,57 @@ struct flush_tlb_range_data { }; #ifdef CONFIG_RISCV_LAZY_TLB_FLUSH + DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo) = { .rwlock = __RW_LOCK_UNLOCKED(tlbinfo.rwlock), .active_mm = NULL, .next_gen = 1, .contexts = { { NULL, 0, }, }, }; + +static DEFINE_PER_CPU(mm_context_t *, mmdrop_victims); + +static void mmdrop_lazy_mms(struct tasklet_struct *tasklet) +{ + mm_context_t *victim = xchg_relaxed(this_cpu_ptr(&mmdrop_victims), NULL); + struct mm_struct *mm = NULL; + + while (victim) { + mm = container_of(victim, struct mm_struct, context); + while (atomic_dec_return_relaxed(&victim->lazy_tlb_cnt) != 0) + mmdrop_lazy_tlb(mm); + victim = victim->next; + } +} + +static DEFINE_PER_CPU(struct tasklet_struct, mmdrop_tasklets) = { + .count = ATOMIC_INIT(0), + .callback = mmdrop_lazy_mms, + .use_callback = true, +}; + +static inline void mmgrab_lazy_mm(struct mm_struct *mm) +{ + mmgrab_lazy_tlb(mm); + atomic_inc(&mm->context.lazy_tlb_cnt); +} + +static inline void mmdrop_lazy_mm(struct mm_struct *mm) +{ + mm_context_t **head, *list, *context = &mm->context; + + if (atomic_inc_return_relaxed(&context->lazy_tlb_cnt) == 1) { + head = this_cpu_ptr(&mmdrop_victims); + + do { + list = *head; + 
+ context->next = list; + } while (cmpxchg_relaxed(head, list, context) != list); + + tasklet_schedule(this_cpu_ptr(&mmdrop_tasklets)); + } +} + #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ static void __ipi_flush_tlb_range_asid(void *info) @@ -292,6 +337,7 @@ void local_load_tlb_mm(struct mm_struct *mm) info->active_mm = mm; if (contexts[pos].mm != mm) { + mmgrab_lazy_mm(mm); victim = contexts[pos].mm; contexts[pos].mm = mm; } @@ -302,6 +348,7 @@ void local_load_tlb_mm(struct mm_struct *mm) if (victim) { cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(victim)); local_flush_tlb_all_asid(get_mm_asid(victim)); + mmdrop_lazy_mm(victim); } } -- 2.20.1 When both CONFIG_RISCV_LAZY_TLB_FLUSH and CONFIG_MMU_LAZY_TLB_SHOOTDOWN are enabled, riscv needs an arch-specific method to free the mm that needs to be shot down. Thus we add an arch override for do_shoot_lazy_tlb(). Signed-off-by: Xu Lu --- kernel/fork.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a95..b6d11acd6ac10 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -612,7 +612,8 @@ static void do_check_lazy_tlb(void *arg) WARN_ON_ONCE(current->active_mm == mm); } -static void do_shoot_lazy_tlb(void *arg) +#ifndef arch_do_shoot_lazy_tlb +static void arch_do_shoot_lazy_tlb(void *arg) { struct mm_struct *mm = arg; @@ -622,6 +623,7 @@ static void do_shoot_lazy_tlb(void *arg) switch_mm(mm, &init_mm, current); } } +#endif static void cleanup_lazy_tlbs(struct mm_struct *mm) { @@ -661,7 +663,7 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm) * - A delayed freeing and RCU-like quiescing sequence based on mm * switching to avoid IPIs completely. 
*/ - on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); + on_each_cpu_mask(mm_cpumask(mm), arch_do_shoot_lazy_tlb, (void *)mm, 1); if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); } -- 2.20.1 When an active_mm is shot down, we switch it to the init_mm, evict it out of percpu active mm array. Signed-off-by: Xu Lu --- arch/riscv/include/asm/mmu_context.h | 5 ++++ arch/riscv/include/asm/tlbflush.h | 11 +++++++++ arch/riscv/mm/context.c | 19 ++++++++++++++++ arch/riscv/mm/tlbflush.c | 34 ++++++++++++++++++++++++---- 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h index 8c4bc49a3a0f5..bc73cc3262ae6 100644 --- a/arch/riscv/include/asm/mmu_context.h +++ b/arch/riscv/include/asm/mmu_context.h @@ -16,6 +16,11 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *task); +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH +#define arch_do_shoot_lazy_tlb arch_do_shoot_lazy_tlb +void arch_do_shoot_lazy_tlb(void *arg); +#endif + #define activate_mm activate_mm static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 3f83fd5ef36db..e7365a53265a6 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -15,6 +15,11 @@ #define FLUSH_TLB_NO_ASID ((unsigned long)-1) #ifdef CONFIG_MMU +static inline unsigned long get_mm_asid(struct mm_struct *mm) +{ + return mm ? 
cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID; +} + static inline void local_flush_tlb_all(void) { __asm__ __volatile__ ("sfence.vma" : : : "memory"); @@ -86,11 +91,17 @@ struct tlb_info { DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo); void local_load_tlb_mm(struct mm_struct *mm); +void local_flush_tlb_mm(struct mm_struct *mm); #else /* CONFIG_RISCV_LAZY_TLB_FLUSH */ static inline void local_load_tlb_mm(struct mm_struct *mm) {} +static inline void local_flush_tlb_mm(struct mm_struct *mm) +{ + local_flush_tlb_all_asid(get_mm_asid(mm)); +} + #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ #else /* CONFIG_MMU */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index a7cf36ad34678..3335080e5f720 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -274,6 +274,25 @@ static int __init asids_init(void) return 0; } early_initcall(asids_init); + +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH +void arch_do_shoot_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + WARN_ON_ONCE(current->mm); + current->active_mm = &init_mm; + switch_mm(mm, &init_mm, current); + } + + if (!static_branch_unlikely(&use_asid_allocator) || !mm) + return; + + local_flush_tlb_mm(mm); +} +#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ + #else static inline void set_mm(struct mm_struct *prev, struct mm_struct *next, unsigned int cpu) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 4b2ce06cbe6bd..a47bacf5801ab 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -164,11 +164,6 @@ static void __ipi_flush_tlb_range_asid(void *info) local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid); } -static inline unsigned long get_mm_asid(struct mm_struct *mm) -{ - return mm ? 
cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID; -} - static void __flush_tlb_range(struct mm_struct *mm, const struct cpumask *cmask, unsigned long start, unsigned long size, @@ -352,4 +347,33 @@ void local_load_tlb_mm(struct mm_struct *mm) } } +void local_flush_tlb_mm(struct mm_struct *mm) +{ + struct tlb_info *info = this_cpu_ptr(&tlbinfo); + struct tlb_context *contexts = info->contexts; + unsigned long asid = get_mm_asid(mm); + unsigned int i; + + if (!mm || mm == info->active_mm) { + local_flush_tlb_all_asid(asid); + return; + } + + for (i = 0; i < MAX_LOADED_MM; i++) { + if (contexts[i].mm != mm) + continue; + + write_lock(&info->rwlock); + contexts[i].mm = NULL; + contexts[i].gen = 0; + write_unlock(&info->rwlock); + + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm)); + mmdrop_lazy_mm(mm); + break; + } + + local_flush_tlb_all_asid(asid); +} + #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ -- 2.20.1 When memory mapping of a mm is modified, instead of sending IPI to all CPUs recorded in its mm_cpumask, we check whether each target CPU is using this mm right now. If not, we just store the TLB Flush information in target CPU's percpu TLB Flush queue, avoiding the IPI. 
Signed-off-by: Xu Lu --- arch/riscv/include/asm/tlbflush.h | 19 +++++++++ arch/riscv/mm/context.c | 2 + arch/riscv/mm/tlbflush.c | 71 ++++++++++++++++++++++++++++++- 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index e7365a53265a6..c9630267c58cd 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -75,23 +75,40 @@ extern unsigned long tlb_flush_all_threshold; #ifdef CONFIG_RISCV_LAZY_TLB_FLUSH #define MAX_LOADED_MM 6 +#define MAX_TLB_FLUSH_TASK 32 +#define FLUSH_TLB_ALL_ASID 0x1 struct tlb_context { struct mm_struct *mm; unsigned int gen; + bool need_flush; }; +struct tlb_flush_task { + unsigned long start; + unsigned long size; + unsigned long stride; +}; + +struct tlb_flush_queue { + atomic_t len; + unsigned int flag; + struct tlb_flush_task tasks[MAX_TLB_FLUSH_TASK]; +} ____cacheline_aligned_in_smp; + struct tlb_info { rwlock_t rwlock; struct mm_struct *active_mm; unsigned int next_gen; struct tlb_context contexts[MAX_LOADED_MM]; + struct tlb_flush_queue *flush_queues; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo); void local_load_tlb_mm(struct mm_struct *mm); void local_flush_tlb_mm(struct mm_struct *mm); +void __init lazy_tlb_flush_init(void); #else /* CONFIG_RISCV_LAZY_TLB_FLUSH */ @@ -102,6 +119,8 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) local_flush_tlb_all_asid(get_mm_asid(mm)); } +static inline void lazy_tlb_flush_init(void) {} + #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ #else /* CONFIG_MMU */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 3335080e5f720..c381c4ed46bfb 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -263,6 +263,8 @@ static int __init asids_init(void) __set_bit(0, context_asid_map); + lazy_tlb_flush_init(); + static_branch_enable(&use_asid_allocator); pr_info("ASID allocator using %lu bits (%lu entries)\n", diff --git 
a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index a47bacf5801ab..b5a2d9874d62b 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -97,6 +97,7 @@ void flush_tlb_all(void) } struct flush_tlb_range_data { + struct mm_struct *mm; unsigned long asid; unsigned long start; unsigned long size; @@ -109,7 +110,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo) = { .rwlock = __RW_LOCK_UNLOCKED(tlbinfo.rwlock), .active_mm = NULL, .next_gen = 1, - .contexts = { { NULL, 0, }, }, + .contexts = { { NULL, 0, false, }, }, }; static DEFINE_PER_CPU(mm_context_t *, mmdrop_victims); @@ -155,6 +157,47 @@ static inline void mmdrop_lazy_mm(struct mm_struct *mm) } } +static bool should_ipi_flush(int cpu, void *data) +{ + struct tlb_info *info = per_cpu_ptr(&tlbinfo, cpu); + struct tlb_context *contexts = info->contexts; + struct tlb_flush_queue *queue = NULL; + struct flush_tlb_range_data *ftd = data; + unsigned int i, index; + unsigned long flags; + + if (info->active_mm == ftd->mm) + return true; + + read_lock_irqsave(&info->rwlock, flags); + + if (info->active_mm == ftd->mm) { + read_unlock_irqrestore(&info->rwlock, flags); + return true; + } + + for (i = 0; i < MAX_LOADED_MM; i++) { + if (contexts[i].mm != ftd->mm) + continue; + + queue = &info->flush_queues[i]; + index = atomic_fetch_add_unless(&queue->len, 1, MAX_TLB_FLUSH_TASK); + if (index < MAX_TLB_FLUSH_TASK) { + queue->tasks[index].start = ftd->start; + queue->tasks[index].stride = ftd->stride; + queue->tasks[index].size = ftd->size; + } else { + queue->flag |= FLUSH_TLB_ALL_ASID; + } + contexts[i].need_flush = true; + break; + } + + read_unlock_irqrestore(&info->rwlock, flags); + + return false; +} + #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ static void __ipi_flush_tlb_range_asid(void *info) @@ -185,11 +228,20 @@ static void __flush_tlb_range(struct mm_struct *mm, } else { struct flush_tlb_range_data ftd; + ftd.mm = mm; ftd.asid = asid; ftd.start = start; ftd.size = size; 
ftd.stride = stride; - on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid, &ftd, 1); +#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH + if (static_branch_unlikely(&use_asid_allocator) && mm) + on_each_cpu_cond_mask(should_ipi_flush, + __ipi_flush_tlb_range_asid, + &ftd, 1, cmask); + else +#endif + on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid, + &ftd, 1); } put_cpu(); @@ -376,4 +428,19 @@ void local_flush_tlb_mm(struct mm_struct *mm) local_flush_tlb_all_asid(asid); } +void __init lazy_tlb_flush_init(void) +{ + struct tlb_flush_queue *queue; + unsigned int cpu, size; + + size = MAX_LOADED_MM * sizeof(struct tlb_flush_queue); + for_each_possible_cpu(cpu) { + queue = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); + if (!queue) + panic("Failed to alloc per cpu tlb flush queue\n"); + + per_cpu(tlbinfo, cpu).flush_queues = queue; + } +} + #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ -- 2.20.1 Since the TLB Flush IPI is avoided when the modified mm is not running on the target CPU, the next time the target CPU switches to the modified mm, it has to check the percpu TLB Flush queue and perform TLB Flush for the modified mm. 
Signed-off-by: Xu Lu --- arch/riscv/mm/tlbflush.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index b5a2d9874d62b..0083fac87c2bc 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -365,8 +365,10 @@ void local_load_tlb_mm(struct mm_struct *mm) { struct tlb_info *info = this_cpu_ptr(&tlbinfo); struct tlb_context *contexts = info->contexts; + struct tlb_flush_queue *queue = NULL; struct mm_struct *victim = NULL; - unsigned int i, pos = 0, min = UINT_MAX; + unsigned int i, len, pos = 0, min = UINT_MAX; + unsigned long asid, start, size, stride; for (i = 0; i < MAX_LOADED_MM; i++) { if (contexts[i].mm == mm) { @@ -387,11 +389,36 @@ void local_load_tlb_mm(struct mm_struct *mm) mmgrab_lazy_mm(mm); victim = contexts[pos].mm; contexts[pos].mm = mm; + contexts[pos].need_flush = false; + + queue = &info->flush_queues[pos]; + atomic_set(&queue->len, 0); + queue->flag = 0; } contexts[pos].gen = new_tlb_gen(info); write_unlock(&info->rwlock); + if (contexts[pos].need_flush) { + queue = &info->flush_queues[pos]; + asid = get_mm_asid(mm); + if (queue->flag & FLUSH_TLB_ALL_ASID) { + local_flush_tlb_all_asid(asid); + } else { + len = atomic_read(&queue->len); + for (i = 0; i < len; i++) { + start = queue->tasks[i].start; + size = queue->tasks[i].size; + stride = queue->tasks[i].stride; + local_flush_tlb_range_asid(start, size, + stride, asid); + } + } + contexts[pos].need_flush = false; + atomic_set(&queue->len, 0); + queue->flag = 0; + } + if (victim) { cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(victim)); local_flush_tlb_all_asid(get_mm_asid(victim)); -- 2.20.1 When local_flush_tlb_all_asid() is called, the target mm's TLB entries are all flushed out, then we can clear current CPU in its mm_cpumask so that next time the mm's memory mapping is modified, no IPI will be sent to current CPU. 
Signed-off-by: Xu Lu --- arch/riscv/mm/tlbflush.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 0083fac87c2bc..88a1e45bcf508 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -35,7 +35,8 @@ static inline void local_sinval_vma(unsigned long vma, unsigned long asid) */ unsigned long tlb_flush_all_threshold __read_mostly = 64; -static void local_flush_tlb_range_threshold_asid(unsigned long start, +static void local_flush_tlb_range_threshold_asid(struct mm_struct *mm, + unsigned long start, unsigned long size, unsigned long stride, unsigned long asid) @@ -44,7 +45,7 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start, int i; if (nr_ptes_in_range > tlb_flush_all_threshold) { - local_flush_tlb_all_asid(asid); + local_flush_tlb_mm(mm); return; } @@ -64,21 +65,26 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start, } } -static inline void local_flush_tlb_range_asid(unsigned long start, - unsigned long size, unsigned long stride, unsigned long asid) +static inline void local_flush_tlb_range_asid(struct mm_struct *mm, + unsigned long start, + unsigned long size, + unsigned long stride, + unsigned long asid) { - if (size <= stride) + if (size <= stride) { local_flush_tlb_page_asid(start, asid); - else if (size == FLUSH_TLB_MAX_SIZE) - local_flush_tlb_all_asid(asid); - else - local_flush_tlb_range_threshold_asid(start, size, stride, asid); + } else if (size == FLUSH_TLB_MAX_SIZE) { + local_flush_tlb_mm(mm); + } else { + local_flush_tlb_range_threshold_asid(mm, start, size, stride, + asid); + } } /* Flush a range of kernel pages without broadcasting */ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) { - local_flush_tlb_range_asid(start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID); + local_flush_tlb_range_asid(NULL, start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID); } static void 
__ipi_flush_tlb_all(void *info) @@ -204,7 +210,7 @@ static void __ipi_flush_tlb_range_asid(void *info) { struct flush_tlb_range_data *d = info; - local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid); + local_flush_tlb_range_asid(d->mm, d->start, d->size, d->stride, d->asid); } static void __flush_tlb_range(struct mm_struct *mm, @@ -222,7 +228,7 @@ static void __flush_tlb_range(struct mm_struct *mm, /* Check if the TLB flush needs to be sent to other CPUs. */ if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) { - local_flush_tlb_range_asid(start, size, stride, asid); + local_flush_tlb_range_asid(mm, start, size, stride, asid); } else if (riscv_use_sbi_for_rfence()) { sbi_remote_sfence_vma_asid(cmask, start, size, asid); } else { @@ -410,7 +416,7 @@ void local_load_tlb_mm(struct mm_struct *mm) start = queue->tasks[i].start; size = queue->tasks[i].size; stride = queue->tasks[i].stride; - local_flush_tlb_range_asid(start, size, + local_flush_tlb_range_asid(mm, start, size, stride, asid); } } -- 2.20.1 Now that we maintain an array of active mms on each CPU, when local_flush_tlb_all() is called, we can clear current CPU in the mm_cpumask of all active mms on current CPU. 
Signed-off-by: Xu Lu --- arch/riscv/include/asm/tlbflush.h | 6 ++++++ arch/riscv/mm/context.c | 2 +- arch/riscv/mm/tlbflush.c | 31 +++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index c9630267c58cd..fd62b27172d4a 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -108,6 +108,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo); void local_load_tlb_mm(struct mm_struct *mm); void local_flush_tlb_mm(struct mm_struct *mm); +void local_flush_tlb_all_mm(void); void __init lazy_tlb_flush_init(void); #else /* CONFIG_RISCV_LAZY_TLB_FLUSH */ @@ -119,6 +120,11 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) local_flush_tlb_all_asid(get_mm_asid(mm)); } +static inline void local_flush_tlb_all_mm(void) +{ + local_flush_tlb_all(); +} + static inline void lazy_tlb_flush_init(void) {} #endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index c381c4ed46bfb..b6657681948f9 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -194,7 +194,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) satp_mode); if (need_flush_tlb) - local_flush_tlb_all(); + local_flush_tlb_all_mm(); } static void set_mm_noasid(struct mm_struct *mm) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 88a1e45bcf508..73c0a7ef61cb1 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -89,13 +89,13 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) static void __ipi_flush_tlb_all(void *info) { - local_flush_tlb_all(); + local_flush_tlb_all_mm(); } void flush_tlb_all(void) { if (num_online_cpus() < 2) - local_flush_tlb_all(); + local_flush_tlb_all_mm(); else if (riscv_use_sbi_for_rfence()) sbi_remote_sfence_vma_asid(NULL, 0, FLUSH_TLB_MAX_SIZE, FLUSH_TLB_NO_ASID); else @@ -461,6 +461,33 @@ void 
local_flush_tlb_mm(struct mm_struct *mm) local_flush_tlb_all_asid(asid); } +void local_flush_tlb_all_mm(void) +{ + struct tlb_info *info = this_cpu_ptr(&tlbinfo); + struct tlb_context *contexts = info->contexts; + struct mm_struct *mms[MAX_LOADED_MM]; + unsigned int cpu = raw_smp_processor_id(); + unsigned int i, num = 0; + + write_lock(&info->rwlock); + for (i = 0; i < MAX_LOADED_MM; i++) { + if (!contexts[i].mm || contexts[i].mm == info->active_mm) + continue; + + mms[num++] = contexts[i].mm; + contexts[i].mm = NULL; + contexts[i].gen = 0; + } + write_unlock(&info->rwlock); + + for (i = 0; i < num; i++) { + cpumask_clear_cpu(cpu, mm_cpumask(mms[i])); + mmdrop_lazy_mm(mms[i]); + } + + local_flush_tlb_all(); +} + void __init lazy_tlb_flush_init(void) { struct tlb_flush_queue *queue; -- 2.20.1