Now that we have the event stream and SEV/SEVL implemented, we can finally enable WFET for AArch64. To avoid issues with QEMU's incomplete ldst exclusive handling causing potential deadlocks in common WFE-enabled locking patterns, we take advantage of the architecture's flexibility and treat being in the exclusive region as a reason to exit. Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée --- v2 - fix exception syndrome by using enum value - use env->halt_reason v3 - fix check_wfx_trap(s/false/true/) as it is a WFE v4 - defer expensive calculations until needed - treat env->exclusive_addr as an IMPDEF WFE exit - update commit message v5 - use qatomic_xchg to consume event_register --- target/arm/tcg/helper-defs.h | 1 + target/arm/tcg/op_helper.c | 94 ++++++++++++++++++++++++++++++++++ target/arm/tcg/translate-a64.c | 15 +++--- 3 files changed, 103 insertions(+), 7 deletions(-) diff --git a/target/arm/tcg/helper-defs.h b/target/arm/tcg/helper-defs.h index ebdf09be38a..5e4d828dd55 100644 --- a/target/arm/tcg/helper-defs.h +++ b/target/arm/tcg/helper-defs.h @@ -56,6 +56,7 @@ DEF_HELPER_1(setend, void, env) DEF_HELPER_2(wfi, void, env, i32) DEF_HELPER_2(wfe, void, env, i32) DEF_HELPER_2(wfit, void, env, i32) +DEF_HELPER_2(wfet, void, env, i32) DEF_HELPER_1(yield, void, env) DEF_HELPER_1(pre_hvc, void, env) DEF_HELPER_2(pre_smc, void, env, i32) diff --git a/target/arm/tcg/op_helper.c b/target/arm/tcg/op_helper.c index 060b155d559..b64b80fa653 100644 --- a/target/arm/tcg/op_helper.c +++ b/target/arm/tcg/op_helper.c @@ -640,6 +640,100 @@ void HELPER(wfe)(CPUARMState *env, uint32_t insn_len) #endif } +void HELPER(wfet)(CPUARMState *env, uint32_t rd) +{ +#ifdef CONFIG_USER_ONLY + /* + * As for WFIT make it NOP here, because trying to raise EXCP_HLT + * would trigger an abort. 
+ */ + return; +#else + CPUState *cs = env_cpu(env); + uint32_t excp; + int target_el; + ARMCPU *cpu; + uint64_t cntval, timeout, offset, cntvct, nexttick; + int64_t next_event; + + /* + * As for WFE if the event register is already set we can consume + * the event and return immediately. + */ + if (qatomic_xchg(&env->event_register, false)) { + return; + } + + /* + * Don't bother to go into our "low power state" if + * we would just wake up immediately. + * + * We want the value that we would get if we read CNTVCT_EL0 from + * the current exception level, so the direct_access offset, not + * the indirect_access one. Compare the pseudocode LocalTimeoutEvent(), + * which calls VirtualCounterTimer(). + */ + cntval = gt_get_countervalue(env); + offset = gt_direct_access_timer_offset(env, GTIMER_VIRT); + cntvct = cntval - offset; + timeout = env->xregs[rd]; + if (cpu_has_work(cs) || cntvct >= timeout) { + return; + } + + /* We might sleep, so now we check to see if we should trap */ + target_el = check_wfx_trap(env, true, &excp); + if (target_el) { + env->pc -= 4; + raise_exception(env, excp, syn_wfx(1, 0xe, rd, true, WFET, false), target_el); + } + + /* + * If the CPU has entered the exclusive region we could sleep + * until the global monitor moves from Exclusive to Open Access. + * However it would be expensive for QEMU to fully model the + * global monitor and not doing so would potentially trigger + * deadlocks in WFE enabled locking code. However as WFE is a hint + * instruction the architecture allows for the PE to leave + * low-power state for any reason. QEMU chooses to treat being in + * an exclusive region as such and return directly. + */ + if (env->exclusive_addr != -1) { + return; + } + + /* + * Finally work out if the timeout or event stream will kick in + * earlier. + * + * The WFET should time out when CNTVCT_EL0 >= the specified value. 
+ */ + cpu = env_archcpu(env); + if (uadd64_overflow(timeout, offset, &nexttick)) { + nexttick = UINT64_MAX; + } + if (nexttick > INT64_MAX / gt_cntfrq_period_ns(cpu)) { + nexttick = INT64_MAX; + } + + next_event = gt_calc_next_event_stream(env); + if (next_event > 0 && next_event < nexttick) { + timer_mod(cpu->wfxt_timer, next_event); + } else { + if (nexttick == INT64_MAX) { + timer_mod_ns(cpu->wfxt_timer, INT64_MAX); + } else { + timer_mod(cpu->wfxt_timer, nexttick); + } + } + + env->halt_reason = HALT_WFE; + cs->exception_index = EXCP_HLT; + cs->halted = 1; + cpu_loop_exit(cs); +#endif +} + void HELPER(yield)(CPUARMState *env) { CPUState *cs = env_cpu(env); diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index a4603e1a5c7..fb9a212df4b 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -2086,14 +2086,15 @@ static bool trans_WFET(DisasContext *s, arg_WFET *a) return false; } - /* - * We rely here on our WFE implementation being a NOP, so we - * don't need to do anything different to handle the WFET timeout - * from what trans_WFE does. - */ - if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { - s->base.is_jmp = DISAS_WFE; + if (s->ss_active) { + /* Act like a NOP under architectural singlestep */ + return true; } + + gen_a64_update_pc(s, 4); + gen_helper_wfet(tcg_env, tcg_constant_i32(a->rd)); + /* Go back to the main loop to check for interrupts */ + s->base.is_jmp = DISAS_EXIT; return true; } -- 2.47.3