From: Wanpeng Li

Integrate the yield deboost mechanism into yield_to_task_fair() to
improve yield_to() effectiveness for virtualization workloads.

Add yield_to_deboost() as the main entry point: it validates the tasks,
finds the cgroup LCA, updates the rq clock and accounting, calculates
the penalty, and applies the EEVDF field adjustments.

The integration point, after set_next_buddy() and before
yield_task_fair(), works in concert with the existing buddy mechanism:
set_next_buddy() provides immediate preference, yield_to_deboost()
applies a bounded vruntime penalty for sustained advantage, and
yield_task_fair() completes the standard yield path.

This is particularly beneficial for vCPU workloads where:
 - lock holder detection triggers yield_to()
 - the holder needs sustained preference to make progress
 - vCPUs may be organized in nested cgroups
 - high-frequency yields require rate limiting
 - ping-pong patterns need debouncing

Operation occurs under rq->lock with bounded penalties. The feature can
be disabled at runtime via
/sys/kernel/debug/sched/sched_vcpu_debooster_enabled.

A dbench workload in a virtualized environment (host with 16 pCPUs,
16 vCPUs per VM, each VM running the dbench-16 benchmark) shows
consistent gains:

  2 VMs: +14.4% throughput
  3 VMs: +9.8% throughput
  4 VMs: +6.7% throughput

The performance gains stem from more effective yield_to() behavior:
lock holders make faster progress, which reduces contention overhead
in overcommitted scenarios.

Signed-off-by: Wanpeng Li
---
 kernel/sched/fair.c | 58 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bad324f3662..619af60b7ce6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9017,7 +9017,7 @@ static bool yield_deboost_rate_limit(struct rq *rq, u64 now_ns)
  * Returns false with appropriate debug logging if any validation fails,
  * ensuring only safe and meaningful yield operations proceed.
  */
-static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target,
+static bool yield_deboost_validate_tasks(struct rq *rq, struct task_struct *p_target,
 					struct task_struct **p_yielding_out,
 					struct sched_entity **se_y_out,
 					struct sched_entity **se_t_out)
@@ -9066,7 +9066,7 @@ static bool __maybe_unused yield_deboost_validate_tasks(struct rq *rq, struct ta
  * the appropriate level for vruntime adjustments and EEVDF field updates
  * (deadline, vlag) to maintain scheduler consistency.
  */
-static bool __maybe_unused yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t,
+static bool yield_deboost_find_lca(struct sched_entity *se_y, struct sched_entity *se_t,
 					struct sched_entity **se_y_lca_out,
 					struct sched_entity **se_t_lca_out,
 					struct cfs_rq **cfs_rq_common_out)
@@ -9162,7 +9162,7 @@ static u64 yield_deboost_apply_debounce(struct rq *rq, struct sched_entity *se_t
  * and implements reverse-pair debounce (~300us) to reduce ping-pong effects.
  * Returns 0 if no penalty needed, otherwise returns clamped penalty value.
  */
-static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+static u64 yield_deboost_calculate_penalty(struct rq *rq, struct sched_entity *se_y_lca,
 					struct sched_entity *se_t_lca,
 					struct sched_entity *se_t, int nr_queued)
 {
@@ -9250,7 +9250,7 @@ static u64 __maybe_unused yield_deboost_calculate_penalty(struct rq *rq, struct
  * scheduler state consistency. Returns true on successful application,
  * false if penalty cannot be safely applied.
  */
-static void __maybe_unused yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
+static void yield_deboost_apply_penalty(struct rq *rq, struct sched_entity *se_y_lca,
 					struct cfs_rq *cfs_rq_common, u64 penalty)
 {
 	u64 new_vruntime;
@@ -9303,6 +9303,52 @@ static void yield_task_fair(struct rq *rq)
 	se->deadline += calc_delta_fair(se->slice, se);
 }
 
+/*
+ * yield_to_deboost - deboost the yielding task to favor the target on the same rq
+ * @rq: runqueue containing both tasks; rq->lock must be held
+ * @p_target: task to favor in scheduling
+ *
+ * Cooperates with yield_to_task_fair(): buddy provides immediate preference;
+ * this routine applies a bounded vruntime penalty at the cgroup LCA so the
+ * target keeps advantage beyond the buddy effect. EEVDF fields are updated
+ * to keep scheduler state consistent.
+ *
+ * Only operates on tasks resident on the same rq; throttled hierarchies are
+ * rejected early. Penalty is bounded by granularity and queue-size caps.
+ *
+ * Intended primarily for virtualization workloads where a yielding vCPU
+ * should defer to a target vCPU within the same runqueue.
+ * Does not change runnable order directly; complements buddy selection with
+ * a bounded fairness adjustment.
+ */
+static void yield_to_deboost(struct rq *rq, struct task_struct *p_target)
+{
+	struct task_struct *p_yielding;
+	struct sched_entity *se_y, *se_t, *se_y_lca, *se_t_lca;
+	struct cfs_rq *cfs_rq_common;
+	u64 penalty;
+
+	/* Step 1: validate tasks and inputs */
+	if (!yield_deboost_validate_tasks(rq, p_target, &p_yielding, &se_y, &se_t))
+		return;
+
+	/* Step 2: find LCA in cgroup hierarchy */
+	if (!yield_deboost_find_lca(se_y, se_t, &se_y_lca, &se_t_lca, &cfs_rq_common))
+		return;
+
+	/* Step 3: update clock and current accounting */
+	update_rq_clock(rq);
+	if (se_y_lca != cfs_rq_common->curr)
+		update_curr(cfs_rq_common);
+
+	/* Step 4: calculate penalty (caps + debounce) */
+	penalty = yield_deboost_calculate_penalty(rq, se_y_lca, se_t_lca, se_t,
+						  cfs_rq_common->nr_queued);
+
+	/* Step 5: apply penalty and update EEVDF fields */
+	yield_deboost_apply_penalty(rq, se_y_lca, cfs_rq_common, penalty);
+}
+
 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
@@ -9314,6 +9360,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 	/* Tell the scheduler that we'd really like se to run next. */
 	set_next_buddy(se);
 
+	/* Apply deboost under rq lock. */
+	yield_to_deboost(rq, p);
+
+	/* Complete the standard yield path. */
 	yield_task_fair(rq);
 
 	return true;
-- 
2.43.0
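
The penalty bounding and debouncing referenced above are implemented by
yield_deboost_calculate_penalty()/yield_deboost_apply_debounce() in the
earlier patches of this series. For illustration only, below is a minimal
standalone C sketch of the idea (clamp a raw penalty by a cap, and suppress
it inside a reverse-pair debounce window). The function name, the
granularity cap constant and the queue scaling rule are hypothetical
placeholders, not the kernel code; only the ~300us debounce window and the
"bounded by granularity and queue-size caps" behaviour come from the
description above.

#include <stdint.h>

#define NSEC_PER_USEC		1000ULL
/* ~300us reverse-pair debounce window (from the patch description). */
#define DEBOUNCE_WINDOW_NS	(300ULL * NSEC_PER_USEC)
/* Hypothetical stand-in for the granularity-based cap. */
#define GRAN_CAP_NS		(3000ULL * NSEC_PER_USEC)

/*
 * Return 0 while still inside the debounce window (suppress ping-pong
 * deboosting); otherwise clamp the raw penalty to a cap that shrinks as
 * the runqueue gets busier (hypothetical scaling rule).
 */
uint64_t bounded_penalty(uint64_t now_ns, uint64_t last_reverse_yield_ns,
			 uint64_t raw_penalty_ns, unsigned int nr_queued)
{
	uint64_t cap;

	if (now_ns - last_reverse_yield_ns < DEBOUNCE_WINDOW_NS)
		return 0;

	cap = GRAN_CAP_NS / (nr_queued ? nr_queued : 1);

	return raw_penalty_ns < cap ? raw_penalty_ns : cap;
}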