From: Ashish Kalra When SEV-SNP is enabled, all writes to memory are checked to ensure integrity of SNP guest memory. This imposes performance overhead on the whole system. RMPOPT is a new instruction that minimizes the performance overhead of RMP checks on the hypervisor and on non-SNP guests by allowing RMP checks to be skipped for 1GB regions of memory that are known not to contain any SEV-SNP guest memory. Add support for performing RMP optimizations asynchronously using a dedicated workqueue. Enable RMPOPT optimizations for up to 2TB of system RAM starting from the lowest physical memory address aligned down to a 1GB boundary at RMP initialization time. RMP checks can initially be skipped for 1GB memory ranges that do not contain SEV-SNP guest memory (excluding preassigned pages such as the RMP table and firmware pages). As SNP guests are launched, RMPUPDATE will disable the corresponding RMPOPT optimizations. Suggested-by: Thomas Lendacky Suggested-by: Dave Hansen Reviewed-by: Ackerley Tng Signed-off-by: Ashish Kalra --- arch/x86/virt/svm/sev.c | 196 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 3 deletions(-) diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 089c9a14edc7..d7e40a5fe5ca 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -125,7 +126,18 @@ static void *rmp_bookkeeping __ro_after_init; static u64 probed_rmp_base, probed_rmp_size; static cpumask_t rmpopt_cpumask; -static phys_addr_t rmpopt_pa_start; +static phys_addr_t rmpopt_pa_start, rmpopt_pa_end; + +enum rmpopt_function { + RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS, + RMPOPT_FUNC_REPORT_STATUS +}; + +#define RMPOPT_WORK_TIMEOUT 10000 + +static struct workqueue_struct *rmpopt_wq; +static struct delayed_work rmpopt_delayed_work; +static DEFINE_MUTEX(rmpopt_wq_mutex); static LIST_HEAD(snp_leaked_pages_list); static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); @@ 
-566,6 +578,14 @@ static void rmpopt_cleanup(void) { int cpu; + guard(mutex)(&rmpopt_wq_mutex); + + if (!rmpopt_wq) + return; + + cancel_delayed_work_sync(&rmpopt_delayed_work); + destroy_workqueue(rmpopt_wq); + cpus_read_lock(); for_each_cpu(cpu, &rmpopt_cpumask) @@ -574,7 +594,8 @@ static void rmpopt_cleanup(void) cpus_read_unlock(); cpumask_clear(&rmpopt_cpumask); - rmpopt_pa_start = 0; + rmpopt_pa_start = rmpopt_pa_end = 0; + rmpopt_wq = NULL; } void snp_shutdown(void) @@ -592,6 +613,134 @@ void snp_shutdown(void) } EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp"); +static inline bool __rmpopt(u64 pa_start, u64 op_type) +{ + bool optimized; + + asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc" + : "=@ccc" (optimized) + : "a" (pa_start), "c" (op_type) + : "memory", "cc"); + + return optimized; +} + +static void rmpopt(u64 pa) +{ + u64 pa_start = ALIGN_DOWN(pa, SZ_1G); + u64 op_type = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS; + + __rmpopt(pa_start, op_type); +} + +/* + * 'val' is a system physical address. + */ +static void rmpopt_smp(void *val) +{ + rmpopt((u64)val); +} + +/* + * RMPOPT optimizations skip RMP checks at 1GB granularity if this + * range of memory does not contain any SNP guest memory. + */ +static void rmpopt_work_handler(struct work_struct *work) +{ + cpumask_var_t follower_mask; + phys_addr_t pa; + int this_cpu; + + pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n", + rmpopt_pa_start, rmpopt_pa_end); + + if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL)) + return; + + /* + * RMPOPT scans the RMP table and stores the result of the scan in + * reserved processor memory. The RMP scan is the most expensive + * part. If a second RMPOPT occurs, it can skip the expensive scan + * if it can see a cached result in the reserved processor memory. + * + * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT + * on every other primary thread. 
Followers are "designed to" + * skip the scan if they see the "cached" scan results. + */ + cpumask_copy(follower_mask, &rmpopt_cpumask); + + /* + * Pin the worker to the current CPU for the leader loop so that + * this_cpu remains valid and the RMPOPT instruction executes on + * the correct CPU. + * + * Use migrate_disable() rather than get_cpu() to prevent + * migration while still allowing preemption. + */ + migrate_disable(); + this_cpu = smp_processor_id(); + + if (cpumask_test_cpu(this_cpu, follower_mask)) { + /* + * Current CPU is a primary thread in rmpopt_cpumask. + * Run the leader locally and remove it from the follower mask. + */ + cpumask_clear_cpu(this_cpu, follower_mask); + + for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) + rmpopt(pa); + } else if (cpumask_intersects(topology_sibling_cpumask(this_cpu), + follower_mask)) { + /* + * Current CPU is a sibling thread whose primary is in + * rmpopt_cpumask. RMPOPT_BASE MSR is per-core, so it + * is safe to run the leader locally. Remove this core's + * primary from the follower mask as this core is already + * covered by the leader. + */ + cpumask_andnot(follower_mask, follower_mask, + topology_sibling_cpumask(this_cpu)); + + for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) + rmpopt(pa); + } else { + /* + * Current CPU does not have RMPOPT_BASE MSR programmed. + * Pick an explicit leader from the cpumask to avoid #UD. 
+ */ + int leader_cpu = cpumask_first(follower_mask); + + if (WARN_ON_ONCE(leader_cpu >= nr_cpu_ids)) { + migrate_enable(); + goto out; + } + + cpumask_clear_cpu(leader_cpu, follower_mask); + + cpus_read_lock(); + for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) + smp_call_function_single(leader_cpu, rmpopt_smp, + (void *)pa, true); + cpus_read_unlock(); + } + + migrate_enable(); + + /* Followers: run RMPOPT on remaining cores */ + cpus_read_lock(); + for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) { + on_each_cpu_mask(follower_mask, rmpopt_smp, + (void *)pa, true); + + /* Give other threads a chance to run */ + cond_resched(); + } + cpus_read_unlock(); + +out: + free_cpumask_var(follower_mask); +} + void snp_setup_rmpopt(void) { u64 rmpopt_base; @@ -600,11 +749,35 @@ void snp_setup_rmpopt(void) if (!cpu_feature_enabled(X86_FEATURE_RMPOPT)) return; + guard(mutex)(&rmpopt_wq_mutex); + + /* + * Guard against re-initialization. When SNP_SHUTDOWN_EX is issued + * with x86_snp_shutdown=0, snp_shutdown() is not called and + * rmpopt_cleanup() is skipped, but snp_initialized is still cleared. + * A subsequent __sev_snp_init_locked() would call snp_setup_rmpopt() + * again, leaking the existing workqueue, delayed work, debugfs + * entries, and cpumask state. + */ + if (rmpopt_wq) + return; + + /* + * Create an RMPOPT-specific workqueue to avoid scheduling the + * RMPOPT work item on the global system workqueue. + */ + rmpopt_wq = alloc_workqueue("rmpopt_wq", WQ_UNBOUND, 1); + if (!rmpopt_wq) { + pr_err("Failed to allocate RMPOPT workqueue\n"); + return; + } + cpus_read_lock(); /* * The RMPOPT_BASE MSR is per-core, so only one thread per core needs - * to set up the RMPOPT_BASE MSR. + * to set up the RMPOPT_BASE MSR. Likewise, only one thread per core + * needs to issue the RMPOPT instruction. * * Note: only online primary threads are included. If a core's * primary thread is offline, that core is not covered. 
CPU hotplug @@ -628,6 +801,23 @@ void snp_setup_rmpopt(void) wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base); cpus_read_unlock(); + + INIT_DELAYED_WORK(&rmpopt_delayed_work, rmpopt_work_handler); + + rmpopt_pa_end = ALIGN(PFN_PHYS(max_pfn), SZ_1G); + + /* Limit memory scanning to 2TB of RAM */ + if ((rmpopt_pa_end - rmpopt_pa_start) > SZ_2T) { + pr_info("RMPOPT coverage limited to 2TB; memory above 0x%llx not optimized\n", + rmpopt_pa_start + SZ_2T); + rmpopt_pa_end = rmpopt_pa_start + SZ_2T; + } + + /* + * Once RMPOPT_BASE has been programmed on the selected CPUs, queue the + * work that attempts RMPOPT on [rmpopt_pa_start, rmpopt_pa_end). + */ + queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work, 0); } EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp"); -- 2.43.0