From: Wanpeng Li

Integrate the yield deboost mechanism into yield_to_task_fair() to
improve yield_to() effectiveness for virtualization workloads.

Add yield_to_deboost() as the main entry point: it validates the tasks,
finds the cgroup LCA, updates the rq clock and accounting, calculates
the penalty, and applies the EEVDF field adjustments.

The integration point, after set_next_buddy() and before
yield_task_fair(), works in concert with the existing buddy mechanism:
set_next_buddy() provides immediate preference, yield_to_deboost()
applies a bounded vruntime penalty for sustained advantage, and
yield_task_fair() completes the standard yield path.

This is particularly beneficial for vCPU workloads where:
 - lock holder detection triggers yield_to()
 - the holder needs sustained preference to make progress
 - vCPUs may be organized in nested cgroups
 - high-frequency yields require rate limiting
 - ping-pong patterns need debouncing

Operation occurs under rq->lock with bounded penalties. The feature can
be disabled at runtime via
/sys/kernel/debug/sched/sched_vcpu_debooster_enabled.

A dbench workload in a virtualized environment (host with 16 pCPUs,
16 vCPUs per VM, each VM running the dbench-16 benchmark) shows
consistent gains:

  2 VMs: +14.4% throughput
  3 VMs: +9.8% throughput
  4 VMs: +6.7% throughput

The performance gains stem from more effective yield_to() behavior:
lock holders make faster progress, which reduces contention overhead
in overcommitted scenarios.

Signed-off-by: Wanpeng Li
---
 kernel/sched/fair.c | 58 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bad324f3662..619af60b7ce6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9017,7 +9017,7 @@ static bool yield_deboost_rate_limit(struct rq *rq, u64 now_ns)
  * Returns false with appropriate debug logging if any validation fails,
  * ensuring only safe and meaningful yield operations proceed.
  */
-static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target,
+static bool yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target,
 					struct task_struct **p_yielding_out,
 					struct sched_entity **se_y_out,
 					struct sched_entity **se_t_out)
@@ -9066,7 +9066,7 @@ static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct ta
  * the appropriate level for vruntime adjustments and EEVDF field updates
  * (deadline, vlag) to maintain scheduler consistency.
  */
-static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t,
+static bool yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t,
 					struct sched_entity **se_y_lca_out,
 					struct sched_entity **se_t_lca_out,
 					struct cfs_rq **cfs_rq_common_out)
@@ -9162,7 +9162,7 @@ static u64 yield_deboost_apply_debounce(struct rq *rq, struct sched_entity *se_t
  * and implements reverse-pair debounce (~300us) to reduce ping-pong effects.
  * Returns 0 if no penalty needed, otherwise returns clamped penalty value.
  */
-static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+static u64 yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
 					struct sched_entity *se_t_lca,
 					struct sched_entity *se_t, int nr_queued)
 {
@@ -9250,7 +9250,7 @@ static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct
  * scheduler state consistency. Returns true on successful application,
  * false if penalty cannot be safely applied.
  */
-static void __maybe_unused yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+static void yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
 					struct cfs_rq *cfs_rq_common, u64 penalty)
 {
 	u64 new_vruntime;
@@ -9303,6 +9303,52 @@ static void yield_task_fair(struct rq *rq)
 	se->deadline += calc_delta_fair(se->slice, se);
 }
 
+/*
+ * yield_to_deboost - deboost the yielding task to favor the target on the same rq
+ * @rq: runqueue containing both tasks; rq->lock must be held
+ * @p_target: task to favor in scheduling
+ *
+ * Cooperates with yield_to_task_fair(): buddy provides immediate preference;
+ * this routine applies a bounded vruntime penalty at the cgroup LCA so the
+ * target keeps advantage beyond the buddy effect. EEVDF fields are updated
+ * to keep scheduler state consistent.
+ *
+ * Only operates on tasks resident on the same rq; throttled hierarchies are
+ * rejected early. Penalty is bounded by granularity and queue-size caps.
+ *
+ * Intended primarily for virtualization workloads where a yielding vCPU
+ * should defer to a target vCPU within the same runqueue.
+ * Does not change runnable order directly; complements buddy selection with
+ * a bounded fairness adjustment.
+ */
+static void yield_to_deboost(struct rq *rq, struct task_struct *p_target)
+{
+	struct task_struct *p_yielding;
+	struct sched_entity *se_y, *se_t, *se_y_lca, *se_t_lca;
+	struct cfs_rq *cfs_rq_common;
+	u64 penalty;
+
+	/* Step 1: validate tasks and inputs */
+	if (!yield_deboost_validate_tasks(rq, p_target, &p_yielding, &se_y, &se_t))
+		return;
+
+	/* Step 2: find LCA in cgroup hierarchy */
+	if (!yield_deboost_find_lca(se_y, se_t, &se_y_lca, &se_t_lca, &cfs_rq_common))
+		return;
+
+	/* Step 3: update clock and current accounting */
+	update_rq_clock(rq);
+	if (se_y_lca != cfs_rq_common->curr)
+		update_curr(cfs_rq_common);
+
+	/* Step 4: calculate penalty (caps + debounce) */
+	penalty = yield_deboost_calculate_penalty(rq, se_y_lca, se_t_lca, se_t,
+						  cfs_rq_common->nr_queued);
+
+	/* Step 5: apply penalty and update EEVDF fields */
+	yield_deboost_apply_penalty(rq, se_y_lca, cfs_rq_common, penalty);
+}
+
 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
@@ -9314,6 +9360,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 	/* Tell the scheduler that we'd really like se to run next. */
 	set_next_buddy(se);
 
+	/* Apply deboost under rq lock. */
+	yield_to_deboost(rq, p);
+
+	/* Complete the standard yield path. */
 	yield_task_fair(rq);
 
 	return true;
-- 
2.43.0
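
The penalty bounding and debouncing referenced above are implemented by
yield_deboost_calculate_penalty()/yield_deboost_apply_debounce() in the
earlier patches of this series. For illustration only, below is a minimal
standalone C sketch of the idea (clamp a raw penalty by a cap, and suppress
it inside a reverse-pair debounce window). The function name, the
granularity cap constant and the queue scaling rule are hypothetical
placeholders, not the kernel code; only the ~300us debounce window and the
"bounded by granularity and queue-size caps" behaviour come from the
description above.

#include <stdint.h>

#define NSEC_PER_USEC		1000ULL
/* ~300us reverse-pair debounce window (from the patch description). */
#define DEBOUNCE_WINDOW_NS	(300ULL * NSEC_PER_USEC)
/* Hypothetical stand-in for the granularity-based cap. */
#define GRAN_CAP_NS		(3000ULL * NSEC_PER_USEC)

/*
 * Return 0 while still inside the debounce window (suppress ping-pong
 * deboosting); otherwise clamp the raw penalty to a cap that shrinks as
 * the runqueue gets busier (hypothetical scaling rule).
 */
uint64_t bounded_penalty(uint64_t now_ns, uint64_t last_reverse_yield_ns,
			 uint64_t raw_penalty_ns, unsigned int nr_queued)
{
	uint64_t cap;

	if (now_ns - last_reverse_yield_ns < DEBOUNCE_WINDOW_NS)
		return 0;

	cap = GRAN_CAP_NS / (nr_queued ? nr_queued : 1);

	return raw_penalty_ns < cap ? raw_penalty_ns : cap;
}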