clear_page_rep() and clear_page_erms() are wrappers around "REP; STOS"
variations. Inlining gets rid of an unnecessary CALL/RET (which isn't
free when using RETHUNK speculative execution mitigations).

Fix up and rename clear_page_orig() to adapt to the changed calling
convention. Also add a comment from Dave Hansen detailing the various
clearing mechanisms used in clear_page().

Signed-off-by: Ankur Arora
Tested-by: Raghavendra K T
Reviewed-by: Borislav Petkov (AMD)
---
 arch/x86/include/asm/page_32.h |  6 +++
 arch/x86/include/asm/page_64.h | 67 ++++++++++++++++++++++++++--------
 arch/x86/lib/clear_page_64.S   | 39 ++++----------------
 3 files changed, 66 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 0c623706cb7e..19fddb002cc9 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -17,6 +17,12 @@ extern unsigned long __phys_addr(unsigned long);
 
 #include <linux/string.h>
 
+/**
+ * clear_page() - clear a page using a kernel virtual address.
+ * @page: address of kernel page
+ *
+ * Does absolutely no exception handling.
+ */
 static inline void clear_page(void *page)
 {
 	memset(page, 0, PAGE_SIZE);
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 2f0e47be79a4..ec3307234a17 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -48,26 +48,63 @@ static inline unsigned long __phys_addr_symbol(unsigned long x)
 
 #define __phys_reloc_hide(x)	(x)
 
-void clear_page_orig(void *page);
-void clear_page_rep(void *page);
-void clear_page_erms(void *page);
-KCFI_REFERENCE(clear_page_orig);
-KCFI_REFERENCE(clear_page_rep);
-KCFI_REFERENCE(clear_page_erms);
+void __clear_pages_unrolled(void *page);
+KCFI_REFERENCE(__clear_pages_unrolled);
 
-static inline void clear_page(void *page)
+/**
+ * clear_page() - clear a page using a kernel virtual address.
+ * @addr: address of kernel page
+ *
+ * Switch between three implementations of page clearing based on CPU
+ * capabilities:
+ *
+ * - __clear_pages_unrolled(): the oldest, slowest and universally
+ *   supported method. Zeroes via 8-byte MOV instructions unrolled 8x
+ *   to write a 64-byte cacheline in each loop iteration.
+ *
+ * - "REP; STOSQ": really old CPUs had crummy REP implementations.
+ *   Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
+ *   trusted. The instruction writes 8 bytes per REP iteration, but
+ *   CPUs can internally batch these together and do larger writes.
+ *
+ * - "REP; STOSB": used on CPUs with "enhanced REP MOVSB/STOSB",
+ *   which enumerate 'ERMS' and provide an implementation which,
+ *   unlike "REP; STOSQ" above, isn't overly picky about alignment.
+ *   The instruction writes 1 byte per REP iteration, with CPUs
+ *   internally batching these together into larger writes; it is
+ *   generally the fastest of the three.
+ *
+ * Note that when running as a guest, features exposed by the CPU
+ * might be mediated by the hypervisor. So, the STOSQ variant might
+ * be in active use on some systems even when the hardware enumerates
+ * ERMS.
+ *
+ * Does absolutely no exception handling.
+ */
+static inline void clear_page(void *addr)
 {
+	u64 len = PAGE_SIZE;
 	/*
 	 * Clean up KMSAN metadata for the page being cleared. The assembly call
-	 * below clobbers @page, so we perform unpoisoning before it.
+	 * below clobbers @addr, so perform unpoisoning before it.
 	 */
-	kmsan_unpoison_memory(page, PAGE_SIZE);
-	alternative_call_2(clear_page_orig,
-			   clear_page_rep, X86_FEATURE_REP_GOOD,
-			   clear_page_erms, X86_FEATURE_ERMS,
-			   "=D" (page),
-			   "D" (page),
-			   "cc", "memory", "rax", "rcx");
+	kmsan_unpoison_memory(addr, len);
+
+	/*
+	 * The inline asm embeds a CALL instruction, which is usually a no-no
+	 * because the compiler does not know about it and thus cannot track
+	 * callee-clobbered registers.
+	 *
+	 * In this case that is fine because the registers clobbered by
+	 * __clear_pages_unrolled() are part of the inline asm register
+	 * specification.
+	 */
+	asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled",
+				   "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD,
+				   "rep stosb", X86_FEATURE_ERMS)
+		     : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
+		     : "a" (0)
+		     : "cc", "memory");
 }
 
 void copy_page(void *to, void *from);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a508e4a8c66a..f7f356e7218b 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -6,30 +6,15 @@
 #include <asm/asm.h>
 
 /*
- * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
- * recommended to use this when possible and we do use them by default.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original.
+ * Zero page aligned region.
+ * %rdi	- dest
+ * %rcx	- length
  */
-
-/*
- * Zero a page.
- * %rdi	- page
- */
-SYM_TYPED_FUNC_START(clear_page_rep)
-	movl $4096/8,%ecx
-	xorl %eax,%eax
-	rep stosq
-	RET
-SYM_FUNC_END(clear_page_rep)
-EXPORT_SYMBOL_GPL(clear_page_rep)
-
-SYM_TYPED_FUNC_START(clear_page_orig)
-	xorl %eax,%eax
-	movl $4096/64,%ecx
+SYM_TYPED_FUNC_START(__clear_pages_unrolled)
+	shrq $6, %rcx
 	.p2align 4
 .Lloop:
-	decl	%ecx
+	decq	%rcx
 #define PUT(x) movq %rax,x*8(%rdi)
 	movq %rax,(%rdi)
 	PUT(1)
@@ -43,16 +28,8 @@ SYM_TYPED_FUNC_START(clear_page_orig)
 	jnz	.Lloop
 	nop
 	RET
-SYM_FUNC_END(clear_page_orig)
-EXPORT_SYMBOL_GPL(clear_page_orig)
-
-SYM_TYPED_FUNC_START(clear_page_erms)
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	RET
-SYM_FUNC_END(clear_page_erms)
-EXPORT_SYMBOL_GPL(clear_page_erms)
+SYM_FUNC_END(__clear_pages_unrolled)
+EXPORT_SYMBOL_GPL(__clear_pages_unrolled)
 
 /*
  * Default clear user-space.
-- 
2.31.1
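
As a rough side-by-side illustration of the three clearing strategies that the
ALTERNATIVE_2 in the new clear_page() patches between, here is a minimal
user-space sketch. It is not kernel code: the function names, the local
PAGE_SIZE macro, and the aligned_alloc()/main() harness are assumptions made
purely for demonstration, and the REP variants assume x86-64 with GCC/Clang
inline asm.

/*
 * User-space approximation of the three page-clearing strategies:
 * unrolled 8-byte stores, "REP; STOSQ", and "REP; STOSB".
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL	/* local stand-in for the kernel constant */

/* Unrolled variant: 8x 8-byte stores, one 64-byte cacheline per iteration. */
static void clear_unrolled(void *page)
{
	uint64_t *p = page;

	for (size_t i = 0; i < PAGE_SIZE / 8; i += 8) {
		p[i + 0] = 0; p[i + 1] = 0; p[i + 2] = 0; p[i + 3] = 0;
		p[i + 4] = 0; p[i + 5] = 0; p[i + 6] = 0; p[i + 7] = 0;
	}
}

/* "REP; STOSQ": RCX = byte count / 8, RDI = destination, RAX = 0. */
static void clear_rep_stosq(void *page)
{
	uint64_t len = PAGE_SIZE;

	asm volatile("shrq $3, %%rcx; rep stosq"
		     : "+c" (len), "+D" (page)
		     : "a" (0UL)
		     : "cc", "memory");
}

/* "REP; STOSB": RCX = byte count; ERMS CPUs batch the 1-byte stores. */
static void clear_rep_stosb(void *page)
{
	uint64_t len = PAGE_SIZE;

	asm volatile("rep stosb"
		     : "+c" (len), "+D" (page)
		     : "a" (0UL)
		     : "cc", "memory");
}

int main(void)
{
	void *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);

	if (!page)
		return 1;

	memset(page, 0xff, PAGE_SIZE);
	clear_unrolled(page);

	memset(page, 0xff, PAGE_SIZE);
	clear_rep_stosq(page);

	memset(page, 0xff, PAGE_SIZE);
	clear_rep_stosb(page);

	free(page);
	return 0;
}

Unlike this sketch, the kernel does not choose a strategy per call: the
ALTERNATIVE_2 in clear_page() is resolved once, when alternatives are patched,
based on X86_FEATURE_REP_GOOD and X86_FEATURE_ERMS.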