From: Wanpeng Li From: Wanpeng Li Introduce foundational infrastructure for the vCPU debooster mechanism to improve yield_to() effectiveness in virtualization workloads. Add per-rq tracking fields for rate limiting (yield_deboost_last_time_ns) and debouncing (yield_deboost_last_src/dst_pid, last_pair_time_ns). Introduce global sysctl knob sysctl_sched_vcpu_debooster_enabled for runtime control, defaulting to enabled. Add debugfs interface for observability and initialization in sched_init(). The infrastructure is inert at this stage as no deboost logic is implemented yet, allowing independent verification that existing behavior remains unchanged. Signed-off-by: Wanpeng Li --- kernel/sched/core.c | 7 +++++-- kernel/sched/debug.c | 3 +++ kernel/sched/fair.c | 5 +++++ kernel/sched/sched.h | 9 +++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f754a60de848..03380790088b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8706,9 +8706,12 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { - struct rq *rq; + struct rq *rq = cpu_rq(i); + /* init per-rq debounce tracking */ + rq->yield_deboost_last_src_pid = -1; + rq->yield_deboost_last_dst_pid = -1; + rq->yield_deboost_last_pair_time_ns = 0; - rq = cpu_rq(i); raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a790..905f303af752 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -508,6 +508,9 @@ static __init int sched_init_debug(void) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + debugfs_create_u32("sched_vcpu_debooster_enabled", 0644, debugfs_sched, + &sysctl_sched_vcpu_debooster_enabled); + sched_domains_mutex_lock(); update_sched_domain_debugfs(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b752324270b..5b7fcc86ccff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -81,6 +81,11 @@ static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; +/* + * vCPU debooster sysctl control + */ +unsigned int sysctl_sched_vcpu_debooster_enabled __read_mostly = 1; + static int __init setup_sched_thermal_decay_shift(char *str) { pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d7..e9b4be024f89 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1292,6 +1292,13 @@ struct rq { unsigned int push_busy; struct cpu_stop_work push_work; + /* vCPU debooster rate-limit */ + u64 yield_deboost_last_time_ns; + /* per-rq debounce state to avoid cross-CPU races */ + pid_t yield_deboost_last_src_pid; + pid_t yield_deboost_last_dst_pid; + u64 yield_deboost_last_pair_time_ns; + #ifdef CONFIG_SCHED_CORE /* per rq */ struct rq *core; @@ -2816,6 +2823,8 @@ extern int sysctl_resched_latency_warn_once; extern unsigned int sysctl_sched_tunable_scaling; +extern unsigned int sysctl_sched_vcpu_debooster_enabled; + extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; -- 2.43.0 From: Wanpeng Li From: Wanpeng Li 
Implement core safety mechanisms for yield deboost operations. Add yield_deboost_rate_limit() for high-frequency gating to prevent excessive overhead on compute-intensive workloads. Use 6ms threshold with lockless READ_ONCE/WRITE_ONCE to minimize cache line contention while providing effective rate limiting. Add yield_deboost_validate_tasks() for comprehensive validation ensuring feature is enabled via sysctl, both tasks are valid and distinct, both belong to fair_sched_class, entities are on the same runqueue, and tasks are runnable. The rate limiter prevents pathological high-frequency cases while validation ensures only appropriate task pairs proceed. Both functions are static and will be integrated in subsequent patches. Signed-off-by: Wanpeng Li --- kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b7fcc86ccff..a7dc21c2dbdb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8990,6 +8990,74 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t } } +/* + * High-frequency yield gating to reduce overhead on compute-intensive workloads. + * Returns true if the yield should be skipped due to frequency limits. + * + * Optimized: single threshold with READ_ONCE/WRITE_ONCE, refresh timestamp on every call. + */ +static bool yield_deboost_rate_limit(struct rq *rq, u64 now_ns) +{ + u64 last = READ_ONCE(rq->yield_deboost_last_time_ns); + bool limited = false; + + if (last) { + u64 delta = now_ns - last; + limited = (delta <= 6000ULL * NSEC_PER_USEC); + } + + WRITE_ONCE(rq->yield_deboost_last_time_ns, now_ns); + return limited; +} + +/* + * Validate tasks and basic parameters for yield deboost operation. + * Performs comprehensive safety checks including feature enablement, + * NULL pointer validation, task state verification, and same-rq requirement. + * Returns false with appropriate debug logging if any validation fails, + * ensuring only safe and meaningful yield operations proceed. + */ +static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target, + struct task_struct **p_yielding_out, + struct sched_entity **se_y_out, + struct sched_entity **se_t_out) +{ + struct task_struct *p_yielding; + struct sched_entity *se_y, *se_t; + u64 now_ns; + + if (!sysctl_sched_vcpu_debooster_enabled) + return false; + + if (!rq || !p_target) + return false; + + now_ns = rq->clock; + + if (yield_deboost_rate_limit(rq, now_ns)) + return false; + + p_yielding = rq->curr; + if (!p_yielding || p_yielding == p_target || + p_target->sched_class != &fair_sched_class || + p_yielding->sched_class != &fair_sched_class) + return false; + + se_y = &p_yielding->se; + se_t = &p_target->se; + + if (!se_t || !se_y || !se_t->on_rq || !se_y->on_rq) + return false; + + if (task_rq(p_yielding) != rq || task_rq(p_target) != rq) + return false; + + *p_yielding_out = p_yielding; + *se_y_out = se_y; + *se_t_out = se_t; + return true; +} + /* * sched_yield() is very simple */ -- 2.43.0 From: Wanpeng Li From: Wanpeng Li Implement yield_deboost_find_lca() to locate the lowest common ancestor (LCA) in the cgroup hierarchy for EEVDF-aware yield operations. The LCA represents the appropriate hierarchy level where vruntime adjustments should be applied to ensure fairness is maintained across cgroup boundaries. This is critical for virtualization workloads where vCPUs may be organized in nested cgroups. 
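To make the LCA search concrete, here is a minimal userspace sketch of the same walk (a toy 'entity' struct stands in for sched_entity, and comparing parent pointers stands in for comparing cfs_rq_of(); the real helper hands back the pair of entities sitting on that common runqueue rather than the level itself):

#include <stdio.h>

/* Toy stand-in for sched_entity: just the fields the walk needs. */
struct entity {
	struct entity *parent;	/* NULL at the root level */
	int depth;		/* 0 at the root, +1 per nesting level */
	const char *name;
};

/* Depth-align both cursors, then ascend in lock-step until they meet. */
static struct entity *find_lca(struct entity *a, struct entity *b)
{
	while (a && b && a->depth != b->depth) {
		if (a->depth > b->depth)
			a = a->parent;
		else
			b = b->parent;
	}
	while (a && b) {
		if (a->parent == b->parent)	/* "same cfs_rq" in the real code */
			return a->parent;
		a = a->parent;
		b = b->parent;
	}
	return NULL;
}

int main(void)
{
	/* vm1 (depth 0) -> vcpus (depth 1) -> vcpu0 / vcpu1 (depth 2) */
	struct entity vm1   = { NULL,   0, "vm1"   };
	struct entity vcpus = { &vm1,   1, "vcpus" };
	struct entity vcpu0 = { &vcpus, 2, "vcpu0" };
	struct entity vcpu1 = { &vcpus, 2, "vcpu1" };
	struct entity *lca  = find_lca(&vcpu0, &vcpu1);

	printf("LCA level: %s\n", lca ? lca->name : "(root)");	/* prints "vcpus" */
	return 0;
}
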
For CONFIG_FAIR_GROUP_SCHED, walk up both entity hierarchies by aligning depths, then ascend together until a common cfs_rq is found. For flat hierarchy, verify both entities share the same cfs_rq. Validate that meaningful contention exists (nr_queued > 1) and ensure the yielding entity has non-zero slice for safe penalty calculation. The function operates under rq->lock protection. This static helper will be integrated in subsequent patches. Signed-off-by: Wanpeng Li --- kernel/sched/fair.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7dc21c2dbdb..740c002b8f1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9058,6 +9058,66 @@ static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct ta return true; } +/* + * Find the lowest common ancestor (LCA) in the cgroup hierarchy for EEVDF. + * We walk up both entity hierarchies under rq->lock protection. + * Task migration requires task_rq_lock, ensuring parent chains remain stable. + * We locate the first common cfs_rq where both entities coexist, representing + * the appropriate level for vruntime adjustments and EEVDF field updates + * (deadline, vlag) to maintain scheduler consistency. + */ +static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t, + struct sched_entity **se_y_lca_out, + struct sched_entity **se_t_lca_out, + struct cfs_rq **cfs_rq_common_out) +{ + struct sched_entity *se_y_lca, *se_t_lca; + struct cfs_rq *cfs_rq_common; + +#ifdef CONFIG_FAIR_GROUP_SCHED + se_t_lca = se_t; + se_y_lca = se_y; + + while (se_t_lca && se_y_lca && se_t_lca->depth != se_y_lca->depth) { + if (se_t_lca->depth > se_y_lca->depth) + se_t_lca = se_t_lca->parent; + else + se_y_lca = se_y_lca->parent; + } + + while (se_t_lca && se_y_lca) { + if (cfs_rq_of(se_t_lca) == cfs_rq_of(se_y_lca)) { + cfs_rq_common = cfs_rq_of(se_t_lca); + goto found_lca; + } + se_t_lca = se_t_lca->parent; + se_y_lca = se_y_lca->parent; + } + return false; +#else + if (cfs_rq_of(se_y) != cfs_rq_of(se_t)) + return false; + cfs_rq_common = cfs_rq_of(se_y); + se_y_lca = se_y; + se_t_lca = se_t; +#endif + +found_lca: + if (!se_y_lca || !se_t_lca) + return false; + + if (cfs_rq_common->nr_queued <= 1) + return false; + + if (!se_y_lca->slice) + return false; + + *se_y_lca_out = se_y_lca; + *se_t_lca_out = se_t_lca; + *cfs_rq_common_out = cfs_rq_common; + return true; +} + /* * sched_yield() is very simple */ -- 2.43.0 From: Wanpeng Li From: Wanpeng Li Implement core penalty calculation and application mechanisms for yield deboost operations. Add yield_deboost_apply_debounce() for reverse-pair debouncing to prevent ping-pong behavior. When A→B then B→A occurs within ~600us, downscale the penalty. Add yield_deboost_calculate_penalty() to calculate vruntime penalty based on the fairness gap (vruntime delta between yielding and target tasks), scheduling granularity with safety floor for abnormal values, and queue-size-based caps (2 tasks: 6.0×gran, 3: 4.0×, 4-6: 2.5×, 7-8: 2.0×, 9-12: 1.5×, >12: 1.0×). Apply special handling for zero gap with refined multipliers and 10% boost weighting on positive gaps. Add yield_deboost_apply_penalty() to apply the penalty with overflow protection and update EEVDF fields (deadline, vlag) and min_vruntime. The penalty is tuned to provide meaningful preference while avoiding starvation, scales with queue depth, and prevents oscillation through debouncing. 
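For a concrete feel of the numbers (illustrative values only, and assuming the reverse-pair debounce does not kick in): if the weighted granularity gran comes out to about 3ms of vruntime and the target trails the yielder by need = 1ms, the raw penalty is gran + 1.1 * need = 4.1ms; on a two-task queue that is well under the 6.0 * gran (18ms) cap and is applied unchanged, whereas on a queue deeper than 12 tasks the same inputs would be clamped to the 1.0 * gran cap of 3ms.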
These static functions will be integrated in the next patch. Signed-off-by: Wanpeng Li --- kernel/sched/fair.c | 153 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 740c002b8f1c..4bad324f3662 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9118,6 +9118,159 @@ static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, str return true; } +/* + * Apply debounce for reverse pair within ~600us to reduce ping-pong. + * Downscales penalty to max(need, gran) when the previous pair was target->source, + * and updates per-rq debounce tracking fields to avoid cross-CPU races. + */ +static u64 yield_deboost_apply_debounce(struct rq *rq, struct sched_entity *se_t, + u64 penalty, u64 need, u64 gran) +{ + u64 now_ns = rq->clock; + struct task_struct *p_yielding = rq->curr; + struct task_struct *p_target = task_of(se_t); + + if (p_yielding && p_target) { + pid_t src_pid = p_yielding->pid; + pid_t dst_pid = p_target->pid; + pid_t last_src = rq->yield_deboost_last_src_pid; + pid_t last_dst = rq->yield_deboost_last_dst_pid; + u64 last_ns = rq->yield_deboost_last_pair_time_ns; + + if (last_src == dst_pid && last_dst == src_pid && + (now_ns - last_ns) <= (600ULL * NSEC_PER_USEC)) { + u64 alt = need; + if (alt < gran) + alt = gran; + if (penalty > alt) + penalty = alt; + } + + /* Update per-rq tracking */ + rq->yield_deboost_last_src_pid = src_pid; + rq->yield_deboost_last_dst_pid = dst_pid; + rq->yield_deboost_last_pair_time_ns = now_ns; + } + + return penalty; +} + +/* + * Calculate penalty with debounce logic for EEVDF yield deboost. + * Computes vruntime penalty based on fairness gap (need) plus granularity, + * applies queue-size-based caps to prevent excessive penalties in small queues, + * and implements reverse-pair debounce (~300us) to reduce ping-pong effects. + * Returns 0 if no penalty needed, otherwise returns clamped penalty value. 
+ */ +static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca, + struct sched_entity *se_t_lca, struct sched_entity *se_t, + int nr_queued) +{ + u64 gran, need, penalty, maxp; + u64 gran_floor; + u64 weighted_need, base; + + gran = calc_delta_fair(sysctl_sched_base_slice, se_y_lca); + /* Low-bound safeguard for gran when slice is abnormally small */ + gran_floor = calc_delta_fair(sysctl_sched_base_slice >> 1, se_y_lca); + if (gran < gran_floor) + gran = gran_floor; + + need = 0; + if (se_t_lca->vruntime > se_y_lca->vruntime) + need = se_t_lca->vruntime - se_y_lca->vruntime; + + /* Apply 10% boost to need when positive (weighted_need = need * 1.10) */ + penalty = gran; + if (need) { + /* weighted_need = need + 10% */ + weighted_need = need + need / 10; + /* clamp to avoid overflow when adding to gran (still capped later) */ + if (weighted_need > U64_MAX - penalty) + weighted_need = U64_MAX - penalty; + penalty += weighted_need; + } + + /* Apply debounce via helper to avoid ping-pong */ + penalty = yield_deboost_apply_debounce(rq, se_t, penalty, need, gran); + + /* Upper bound (cap): slightly more aggressive for mid-size queues */ + if (nr_queued == 2) + maxp = gran * 6; /* Strongest push for 2-task ping-pong */ + else if (nr_queued == 3) + maxp = gran * 4; /* 4.0 * gran */ + else if (nr_queued <= 6) + maxp = (gran * 5) / 2; /* 2.5 * gran */ + else if (nr_queued <= 8) + maxp = gran * 2; /* 2.0 * gran */ + else if (nr_queued <= 12) + maxp = (gran * 3) / 2; /* 1.5 * gran */ + else + maxp = gran; /* 1.0 * gran */ + + if (penalty < gran) + penalty = gran; + if (penalty > maxp) + penalty = maxp; + + /* If no need, apply refined baseline push (low risk + mid risk combined). */ + if (need == 0) { + /* + * Baseline multiplier for need==0: + * 2 -> 1.00 * gran + * 3 -> 0.9375 * gran + * 4–6 -> 0.625 * gran + * 7–8 -> 0.50 * gran + * 9–12 -> 0.375 * gran + * >12 -> 0.25 * gran + */ + base = gran; + if (nr_queued == 3) + base = (gran * 15) / 16; /* 0.9375 */ + else if (nr_queued >= 4 && nr_queued <= 6) + base = (gran * 5) / 8; /* 0.625 */ + else if (nr_queued >= 7 && nr_queued <= 8) + base = gran / 2; /* 0.5 */ + else if (nr_queued >= 9 && nr_queued <= 12) + base = (gran * 3) / 8; /* 0.375 */ + else if (nr_queued > 12) + base = gran / 4; /* 0.25 */ + + if (penalty < base) + penalty = base; + } + + return penalty; +} + +/* + * Apply penalty and update EEVDF fields for scheduler consistency. + * Safely applies vruntime penalty with overflow protection, then updates + * EEVDF-specific fields (deadline, vlag) and cfs_rq min_vruntime to maintain + * scheduler state consistency. Returns true on successful application, + * false if penalty cannot be safely applied. 
+ */ +static void __maybe_unused yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca, + struct cfs_rq *cfs_rq_common, u64 penalty) +{ + u64 new_vruntime; + + /* Overflow protection */ + if (se_y_lca->vruntime > (U64_MAX - penalty)) + return; + + new_vruntime = se_y_lca->vruntime + penalty; + + /* Validity check */ + if (new_vruntime <= se_y_lca->vruntime) + return; + + se_y_lca->vruntime = new_vruntime; + se_y_lca->deadline = se_y_lca->vruntime + calc_delta_fair(se_y_lca->slice, se_y_lca); + se_y_lca->vlag = avg_vruntime(cfs_rq_common) - se_y_lca->vruntime; + update_min_vruntime(cfs_rq_common); +} + /* * sched_yield() is very simple */ -- 2.43.0 From: Wanpeng Li From: Wanpeng Li Integrate the yield deboost mechanism into yield_to_task_fair() to improve yield_to() effectiveness for virtualization workloads. Add yield_to_deboost() as the main entry point that validates tasks, finds cgroup LCA, updates rq clock and accounting, calculates penalty, and applies EEVDF field adjustments. The integration point after set_next_buddy() and before yield_task_fair() works in concert with the existing buddy mechanism: set_next_buddy() provides immediate preference, yield_to_deboost() applies bounded vruntime penalty for sustained advantage, and yield_task_fair() completes the standard yield path. This is particularly beneficial for vCPU workloads where lock holder detection triggers yield_to(), the holder needs sustained preference to make progress, vCPUs may be organized in nested cgroups, high-frequency yields require rate limiting, and ping-pong patterns need debouncing. Operation occurs under rq->lock with bounded penalties. The feature can be disabled at runtime via /sys/kernel/debug/sched/sched_vcpu_debooster_enabled. Dbench workload in a virtualized environment (16 pCPUs host, 16 vCPUs per VM running dbench-16 benchmark) shows consistent gains: 2 VMs: +14.4% throughput 3 VMs: +9.8% throughput 4 VMs: +6.7% throughput Performance gains stem from more effective yield_to() behavior, enabling lock holders to make faster progress and reducing contention overhead in overcommitted scenarios. Signed-off-by: Wanpeng Li --- kernel/sched/fair.c | 58 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4bad324f3662..619af60b7ce6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9017,7 +9017,7 @@ static bool yield_deboost_rate_limit(struct rq *rq, u64 now_ns) * Returns false with appropriate debug logging if any validation fails, * ensuring only safe and meaningful yield operations proceed. */ -static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target, +static bool yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target, struct task_struct **p_yielding_out, struct sched_entity **se_y_out, struct sched_entity **se_t_out) @@ -9066,7 +9066,7 @@ static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct ta * the appropriate level for vruntime adjustments and EEVDF field updates * (deadline, vlag) to maintain scheduler consistency. 
*/ -static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t, +static bool yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t, struct sched_entity **se_y_lca_out, struct sched_entity **se_t_lca_out, struct cfs_rq **cfs_rq_common_out) @@ -9162,7 +9162,7 @@ static u64 yield_deboost_apply_debounce(struct rq *rq, struct sched_entity *se_t * and implements reverse-pair debounce (~300us) to reduce ping-pong effects. * Returns 0 if no penalty needed, otherwise returns clamped penalty value. */ -static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca, +static u64 yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca, struct sched_entity *se_t_lca, struct sched_entity *se_t, int nr_queued) { @@ -9250,7 +9250,7 @@ static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct * scheduler state consistency. Returns true on successful application, * false if penalty cannot be safely applied. */ -static void __maybe_unused yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca, +static void yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca, struct cfs_rq *cfs_rq_common, u64 penalty) { u64 new_vruntime; @@ -9303,6 +9303,52 @@ static void yield_task_fair(struct rq *rq) se->deadline += calc_delta_fair(se->slice, se); } +/* + * yield_to_deboost - deboost the yielding task to favor the target on the same rq + * @rq: runqueue containing both tasks; rq->lock must be held + * @p_target: task to favor in scheduling + * + * Cooperates with yield_to_task_fair(): buddy provides immediate preference; + * this routine applies a bounded vruntime penalty at the cgroup LCA so the + * target keeps advantage beyond the buddy effect. EEVDF fields are updated + * to keep scheduler state consistent. + * + * Only operates on tasks resident on the same rq; throttled hierarchies are + * rejected early. Penalty is bounded by granularity and queue-size caps. + * + * Intended primarily for virtualization workloads where a yielding vCPU + * should defer to a target vCPU within the same runqueue. + * Does not change runnable order directly; complements buddy selection with + * a bounded fairness adjustment. + */ +static void yield_to_deboost(struct rq *rq, struct task_struct *p_target) +{ + struct task_struct *p_yielding; + struct sched_entity *se_y, *se_t, *se_y_lca, *se_t_lca; + struct cfs_rq *cfs_rq_common; + u64 penalty; + + /* Step 1: validate tasks and inputs */ + if (!yield_deboost_validate_tasks(rq, p_target, &p_yielding, &se_y, &se_t)) + return; + + /* Step 2: find LCA in cgroup hierarchy */ + if (!yield_deboost_find_lca(se_y, se_t, &se_y_lca, &se_t_lca, &cfs_rq_common)) + return; + + /* Step 3: update clock and current accounting */ + update_rq_clock(rq); + if (se_y_lca != cfs_rq_common->curr) + update_curr(cfs_rq_common); + + /* Step 4: calculate penalty (caps + debounce) */ + penalty = yield_deboost_calculate_penalty(rq, se_y_lca, se_t_lca, se_t, + cfs_rq_common->nr_queued); + + /* Step 5: apply penalty and update EEVDF fields */ + yield_deboost_apply_penalty(rq, se_y_lca, cfs_rq_common, penalty); +} + static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -9314,6 +9360,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); + /* Apply deboost under rq lock. 
*/ + yield_to_deboost(rq, p); + + /* Complete the standard yield path. */ yield_task_fair(rq); return true; -- 2.43.0 From: Wanpeng Li From: Wanpeng Li In kvm_vcpu_on_spin(), the loop counter 'i' is incorrectly written to last_boosted_vcpu instead of the actual vCPU index 'idx'. This causes last_boosted_vcpu to store the loop iteration count rather than the vCPU index, leading to incorrect round-robin behavior in subsequent directed yield operations. Fix this by using 'idx' instead of 'i' in the assignment. Signed-off-by: Wanpeng Li --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..cde1eddbaa91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4026,7 +4026,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) yielded = kvm_vcpu_yield_to(vcpu); if (yielded > 0) { - WRITE_ONCE(kvm->last_boosted_vcpu, i); + WRITE_ONCE(kvm->last_boosted_vcpu, idx); break; } else if (yielded < 0 && !--try) { break; -- 2.43.0 From: Wanpeng Li From: Wanpeng Li Introduce IPI tracking infrastructure for directed yield optimization. Add per-vCPU IPI tracking context in kvm_vcpu_arch with last_ipi_sender/receiver to track IPI communication pairs, pending_ipi flag to indicate awaiting IPI response, and ipi_time_ns monotonic timestamp for recency validation. Add module parameters ipi_tracking_enabled (global toggle, default true) and ipi_window_ns (recency window, default 50ms). Add core helper functions: kvm_track_ipi_communication() to record sender/receiver pairs, kvm_vcpu_is_ipi_receiver() to validate recent IPI relationship, and kvm_vcpu_clear/reset_ipi_context() for lifecycle management. Use lockless READ_ONCE/WRITE_ONCE for minimal overhead. The short time window prevents stale IPI information from affecting throughput workloads. The infrastructure is inert until integrated with interrupt delivery in subsequent patches. 
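For a sense of the window arithmetic: with the default ipi_window_ns of 50ms, a directed-yield decision taken 10ms after the IPI was recorded still sees kvm_vcpu_is_ipi_receiver() return true, while one taken 80ms later does not. Both knobs are writable module parameters (0644); on the usual x86 build, where lapic.c is linked into kvm.ko, they are expected to show up under /sys/module/kvm/parameters/, e.g. echo 25000000 > /sys/module/kvm/parameters/ipi_window_ns for a 25ms window (the sysfs path is an assumption based on the permissions, not something the patch spells out).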
Signed-off-by: Wanpeng Li --- arch/x86/include/asm/kvm_host.h | 8 ++++ arch/x86/kvm/lapic.c | 65 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 6 +++ arch/x86/kvm/x86.h | 4 ++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 5 +++ 6 files changed, 89 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48598d017d6f..b5bdc115ff45 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1052,6 +1052,14 @@ struct kvm_vcpu_arch { int pending_external_vector; int highest_stale_pending_ioapic_eoi; + /* IPI tracking for directed yield (x86 only) */ + struct { + int last_ipi_sender; /* vCPU ID of last IPI sender */ + int last_ipi_receiver; /* vCPU ID of last IPI receiver */ + bool pending_ipi; /* Pending IPI response */ + u64 ipi_time_ns; /* Monotonic ns when IPI was sent */ + } ipi_context; + /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0ae7f913d782..98ec2b18b02c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -75,6 +75,12 @@ module_param(lapic_timer_advance, bool, 0444); /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +/* IPI tracking window and runtime toggle (runtime-adjustable) */ +static bool ipi_tracking_enabled = true; +static unsigned long ipi_window_ns = 50 * NSEC_PER_MSEC; +module_param(ipi_tracking_enabled, bool, 0644); +module_param(ipi_window_ns, ulong, 0644); + static bool __read_mostly vector_hashing_enabled = true; module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); @@ -1113,6 +1119,65 @@ static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +/* + * Track IPI communication for directed yield when a unique receiver exists. + * This only writes sender/receiver context and timestamp; ignores self-IPI. + */ +void kvm_track_ipi_communication(struct kvm_vcpu *sender, struct kvm_vcpu *receiver) +{ + if (!sender || !receiver || sender == receiver) + return; + if (unlikely(!READ_ONCE(ipi_tracking_enabled))) + return; + + WRITE_ONCE(sender->arch.ipi_context.last_ipi_receiver, receiver->vcpu_idx); + WRITE_ONCE(sender->arch.ipi_context.pending_ipi, true); + WRITE_ONCE(sender->arch.ipi_context.ipi_time_ns, ktime_get_mono_fast_ns()); + + WRITE_ONCE(receiver->arch.ipi_context.last_ipi_sender, sender->vcpu_idx); +} + +/* + * Check if 'receiver' is the recent IPI target of 'sender'. + * + * Rationale: + * - Use a short window to avoid stale IPI inflating boost priority + * on throughput-sensitive workloads. + */ +bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, struct kvm_vcpu *receiver) +{ + u64 then, now; + + if (unlikely(!READ_ONCE(ipi_tracking_enabled))) + return false; + + then = READ_ONCE(sender->arch.ipi_context.ipi_time_ns); + now = ktime_get_mono_fast_ns(); + if (READ_ONCE(sender->arch.ipi_context.pending_ipi) && + READ_ONCE(sender->arch.ipi_context.last_ipi_receiver) == + receiver->vcpu_idx && + now - then <= ipi_window_ns) + return true; + + return false; +} + +void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu) +{ + WRITE_ONCE(vcpu->arch.ipi_context.pending_ipi, false); + WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_sender, -1); + WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_receiver, -1); +} + +/* + * Reset helper: clear ipi_context and zero ipi_time for hard reset paths. 
+ */ +void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu) +{ + kvm_vcpu_clear_ipi_context(vcpu); + WRITE_ONCE(vcpu->arch.ipi_context.ipi_time_ns, 0); +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..649e016c131f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12708,6 +12708,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_guest_fpu; kvm_xen_init_vcpu(vcpu); + /* Initialize IPI tracking */ + kvm_vcpu_reset_ipi_context(vcpu); vcpu_load(vcpu); kvm_vcpu_after_set_cpuid(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); @@ -12781,6 +12783,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + /* Clear IPI tracking context */ + kvm_vcpu_reset_ipi_context(vcpu); kvfree(vcpu->arch.cpuid_entries); } @@ -12846,6 +12850,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_leave_nested(vcpu); kvm_lapic_reset(vcpu, init_event); + /* Clear IPI tracking context on reset */ + kvm_vcpu_clear_ipi_context(vcpu); WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); vcpu->arch.hflags = 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f3dc77f006f9..86a10c653eac 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -451,6 +451,10 @@ fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); +void kvm_track_ipi_communication(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver); +void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu); +void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..5ae8327fdf21 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1532,6 +1532,7 @@ static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu) } #endif +bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, struct kvm_vcpu *receiver); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cde1eddbaa91..495e769c7ddf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3963,6 +3963,11 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) return false; } +bool __weak kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, struct kvm_vcpu *receiver) +{ + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { int nr_vcpus, start, i, idx, yielded; -- 2.43.0 From: Wanpeng Li From: Wanpeng Li Integrate IPI tracking with LAPIC interrupt delivery and EOI handling. Hook into kvm_irq_delivery_to_apic() after destination resolution to record sender/receiver pairs when the interrupt is LAPIC-originated, APIC_DM_FIXED mode, with exactly one destination vCPU. Use counting for efficient single-destination detection. Add kvm_clear_ipi_on_eoi() called from both EOI paths to ensure complete IPI context cleanup: 1. apic_set_eoi(): Software-emulated EOI path (traditional/non-APICv) 2. 
kvm_apic_set_eoi_accelerated(): Hardware-accelerated EOI path (APICv/AVIC) Without dual-path cleanup, APICv/AVIC-enabled guests would retain stale IPI state, causing directed yield to rely on obsolete sender/ receiver information and potentially boosting the wrong vCPU. Both paths must call kvm_clear_ipi_on_eoi() to maintain consistency across different virtual interrupt delivery modes. The cleanup implements two-stage logic to avoid premature clearing: unconditionally clear the receiver's IPI context, and conditionally clear the sender's pending flag only when the sender exists, last_ipi_receiver matches, and the IPI is recent. This prevents unrelated EOIs from disrupting valid IPI tracking state. Use lockless accessors for minimal overhead. The tracking only activates for unicast fixed IPIs where directed yield provides value. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 107 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 98ec2b18b02c..d38e64691b78 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1178,6 +1178,47 @@ void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu) WRITE_ONCE(vcpu->arch.ipi_context.ipi_time_ns, 0); } +/* + * Clear IPI context on EOI at receiver side; clear sender's pending + * only when matches and is fresh. + * + * This function implements precise cleanup to avoid stale IPI boosts: + * 1) Always clear the receiver's IPI context (unconditional cleanup) + * 2) Conditionally clear the sender's pending flag only when: + * - The sender vCPU still exists and is valid + * - The sender's last_ipi_receiver matches this receiver + * - The IPI was sent recently (within ~window) + */ +static void kvm_clear_ipi_on_eoi(struct kvm_lapic *apic) +{ + struct kvm_vcpu *receiver; + int sender_idx; + u64 then, now; + + if (unlikely(!READ_ONCE(ipi_tracking_enabled))) + return; + + receiver = apic->vcpu; + sender_idx = READ_ONCE(receiver->arch.ipi_context.last_ipi_sender); + + /* Step 1: Always clear receiver's IPI context */ + kvm_vcpu_clear_ipi_context(receiver); + + /* Step 2: Conditionally clear sender's pending flag */ + if (sender_idx >= 0) { + struct kvm_vcpu *sender = kvm_get_vcpu(receiver->kvm, sender_idx); + + if (sender && + READ_ONCE(sender->arch.ipi_context.last_ipi_receiver) == + receiver->vcpu_idx) { + then = READ_ONCE(sender->arch.ipi_context.ipi_time_ns); + now = ktime_get_mono_fast_ns(); + if (now - then <= ipi_window_ns) + WRITE_ONCE(sender->arch.ipi_context.pending_ipi, false); + } + } +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. @@ -1259,6 +1300,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic **dst = NULL; int i; bool ret; + /* Count actual delivered targets to identify a unique recipient. */ + int targets = 0; + int delivered = 0; + struct kvm_vcpu *unique = NULL; *r = -1; @@ -1280,8 +1325,26 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); + delivered = kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); + *r += delivered; + /* Fast path may still fan out; count delivered targets. 
*/ + if (delivered > 0) { + targets++; + unique = dst[i]->vcpu; + } } + + /* + * Record unique recipient for IPI-aware boost: + * only for LAPIC-originated APIC_DM_FIXED without + * shorthand, and when exactly one recipient was + * delivered; ignore self-IPI. + */ + if (src && + irq->delivery_mode == APIC_DM_FIXED && + irq->shorthand == APIC_DEST_NOSHORT && + targets == 1 && unique && unique != src->vcpu) + kvm_track_ipi_communication(src->vcpu, unique); } rcu_read_unlock(); @@ -1366,6 +1429,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_vcpu *vcpu, *lowest = NULL; unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; + /* + * Count actual delivered targets to identify a unique recipient + * for IPI tracking in the slow path. + */ + int targets = 0; + int delivered = 0; + struct kvm_vcpu *unique = NULL; if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; @@ -1389,7 +1459,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); + delivered = kvm_apic_set_irq(vcpu, irq, dest_map); + r += delivered; + /* Slow path can deliver to multiple vCPUs; count them. */ + if (delivered > 0) { + targets++; + unique = vcpu; + } } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { if (!vector_hashing_enabled) { if (!lowest) @@ -1410,8 +1486,28 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, lowest = kvm_get_vcpu(kvm, idx); } - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); + if (lowest) { + delivered = kvm_apic_set_irq(lowest, irq, dest_map); + r = delivered; + /* + * Lowest-priority / vector-hashing paths ultimately deliver to + * a single vCPU. + */ + if (delivered > 0) { + targets = 1; + unique = lowest; + } + } + + /* + * Record unique recipient for IPI-aware boost only for LAPIC- + * originated APIC_DM_FIXED without shorthand, and when exactly + * one recipient was delivered; ignore self-IPI. + */ + if (src && irq->delivery_mode == APIC_DM_FIXED && + irq->shorthand == APIC_DEST_NOSHORT && + targets == 1 && unique && unique != src->vcpu) + kvm_track_ipi_communication(src->vcpu, unique); return r; } @@ -1632,6 +1728,7 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) trace_kvm_eoi(apic, vector); kvm_ioapic_send_eoi(apic, vector); + kvm_clear_ipi_on_eoi(apic); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_eoi_accelerated); @@ -2424,6 +2521,8 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_EOI: apic_set_eoi(apic); + /* Precise cleanup for IPI-aware boost */ + kvm_clear_ipi_on_eoi(apic); break; case APIC_LDR: -- 2.43.0 From: Wanpeng Li Integrate IPI tracking with directed yield to improve scheduling when vCPUs spin waiting for IPI responses. Implement priority-based candidate selection in kvm_vcpu_on_spin() with three tiers: Priority 1 uses kvm_vcpu_is_ipi_receiver() to identify confirmed IPI targets within the recency window, addressing lock holders spinning on IPI acknowledgment. Priority 2 leverages existing kvm_arch_dy_has_pending_interrupt() for compatibility with arch-specific fast paths. Priority 3 falls back to conventional preemption-based logic when yield_to_kernel_mode is requested, providing a safety net for non-IPI scenarios. 
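As a concrete scenario: vCPU0 sends a unicast fixed IPI to vCPU2 and then spins on the acknowledgement, entering kvm_vcpu_on_spin(). Because the earlier patches recorded the pair, kvm_vcpu_is_ipi_receiver(vCPU0, vCPU2) returns true within the recency window, so vCPU2 is selected at priority 1 without having to satisfy the preempted-in-kernel requirement; an unrelated vCPU with no pending interrupt still has to pass the traditional preemption checks.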
Add kvm_vcpu_is_good_yield_candidate() helper to consolidate these checks, preventing over-aggressive boosting while enabling targeted optimization when IPI patterns are detected. Performance testing (16 pCPUs host, 16 vCPUs/VM): Dedup (simlarge): 2 VMs: +47.1% throughput 3 VMs: +28.1% throughput 4 VMs: +1.7% throughput VIPS (simlarge): 2 VMs: +26.2% throughput 3 VMs: +12.7% throughput 4 VMs: +6.0% throughput Gains stem from effective directed yield when vCPUs spin on IPI delivery, reducing synchronization overhead. The improvement is most pronounced at moderate overcommit (2-3 VMs) where contention reduction outweighs context switching cost. Signed-off-by: Wanpeng Li --- virt/kvm/kvm_main.c | 52 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 495e769c7ddf..9cf44b6b396d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3968,6 +3968,47 @@ bool __weak kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, struct kvm_vcpu *r return false; } +/* + * IPI-aware candidate selection for directed yield + * + * Priority order: + * 1) Confirmed IPI receiver of 'me' within a short window (always boost) + * 2) Arch-provided fast pending interrupt (user-mode boost) + * 3) Kernel-mode yield: preempted-in-kernel vCPU (traditional boost) + * 4) Otherwise, be conservative + */ +static bool kvm_vcpu_is_good_yield_candidate(struct kvm_vcpu *me, struct kvm_vcpu *vcpu, + bool yield_to_kernel_mode) +{ + /* Priority 1: recently targeted IPI receiver */ + if (kvm_vcpu_is_ipi_receiver(me, vcpu)) + return true; + + /* Priority 2: fast pending-interrupt hint (arch-specific). */ + if (kvm_arch_dy_has_pending_interrupt(vcpu)) + return true; + + /* + * Minimal preempted gate for remaining cases: + * - If the target is neither a confirmed IPI receiver nor has a fast + * pending interrupt, require that the target has been preempted. + * - If yielding to kernel mode is requested, additionally require + * that the target was preempted while in kernel mode. + * + * This avoids expanding the candidate set too aggressively and helps + * prevent overboost in workloads where the IPI context is not + * involved. + */ + if (!READ_ONCE(vcpu->preempted)) + return false; + + if (yield_to_kernel_mode && + !kvm_arch_vcpu_preempted_in_kernel(vcpu)) + return false; + + return true; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { int nr_vcpus, start, i, idx, yielded; @@ -4015,15 +4056,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) continue; - /* - * Treat the target vCPU as being in-kernel if it has a pending - * interrupt, as the vCPU trying to yield may be spinning - * waiting on IPI delivery, i.e. the target vCPU is in-kernel - * for the purposes of directed yield. - */ - if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && - !kvm_arch_dy_has_pending_interrupt(vcpu) && - !kvm_arch_vcpu_preempted_in_kernel(vcpu)) + /* IPI-aware candidate selection */ + if (!kvm_vcpu_is_good_yield_candidate(me, vcpu, yield_to_kernel_mode)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) -- 2.43.0 From: Wanpeng Li Add a minimal two-round fallback mechanism in kvm_vcpu_on_spin() to avoid pathological stalls when the first round finds no eligible target. Round 1 applies strict IPI-aware candidate selection (existing behavior). 
Round 2 provides a relaxed scan gated only by preempted state as a safety net, addressing cases where IPI context is missed or the runnable set is transient. The second round is controlled by module parameter enable_relaxed_boost (bool, 0644, default on) to allow easy disablement by distributions if needed. Introduce the enable_relaxed_boost parameter, add a first_round flag, retry label, and reset of yielded counter. Gate the IPI-aware check in round 1 and use preempted-only gating in round 2. Keep churn minimal by reusing the same scan logic while preserving all existing heuristics, tracing, and bookkeeping. Signed-off-by: Wanpeng Li --- virt/kvm/kvm_main.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9cf44b6b396d..b03be8d9ae4c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -101,6 +101,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink); static bool allow_unsafe_mappings; module_param(allow_unsafe_mappings, bool, 0444); +static bool enable_relaxed_boost = true; +module_param(enable_relaxed_boost, bool, 0644); + /* * Ordering of locks: * @@ -4015,6 +4018,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; int try = 3; + bool first_round = true; nr_vcpus = atomic_read(&kvm->online_vcpus); if (nr_vcpus < 2) @@ -4025,6 +4029,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) kvm_vcpu_set_in_spin_loop(me, true); +retry: + yielded = 0; + /* * The current vCPU ("me") is spinning in kernel mode, i.e. is likely * waiting for a resource to become available. Attempt to yield to a @@ -4057,7 +4064,12 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; /* IPI-aware candidate selection */ - if (!kvm_vcpu_is_good_yield_candidate(me, vcpu, yield_to_kernel_mode)) + if (first_round && + !kvm_vcpu_is_good_yield_candidate(me, vcpu, yield_to_kernel_mode)) + continue; + + /* Minimal preempted gate for second round */ + if (!first_round && !READ_ONCE(vcpu->preempted)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) @@ -4071,6 +4083,16 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) break; } } + + /* + * Second round: relaxed boost as safety net, with preempted gate. + * Only execute when enabled and when the first round yielded nothing. + */ + if (enable_relaxed_boost && first_round && yielded <= 0) { + first_round = false; + goto retry; + } + kvm_vcpu_set_in_spin_loop(me, false); /* Ensure vcpu is not eligible during next spinloop */ -- 2.43.0
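Runtime control summary for the series: the scheduler-side deboost can be disabled with echo 0 > /sys/kernel/debug/sched/sched_vcpu_debooster_enabled, and, assuming the usual x86 build where kvm_main.c and lapic.c are linked into kvm.ko, the KVM-side behavior can be tuned via /sys/module/kvm/parameters/ipi_tracking_enabled, ipi_window_ns and enable_relaxed_boost (the module-parameter paths are inferred from the 0644 permissions rather than stated explicitly in the patches).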