clear_page_rep() and clear_page_erms() are wrappers around "REP; STOS"
variations. Inlining gets rid of an unnecessary CALL/RET (which isn't
free when RETHUNK speculative execution mitigations are in use).

Fix up and rename clear_page_orig() to adapt to the changed calling
convention.

Also add a comment from Dave Hansen detailing various clearing
mechanisms used in clear_page().

Signed-off-by: Ankur Arora
---
 arch/x86/include/asm/page_32.h |  6 ++++++
 arch/x86/include/asm/page_64.h | 42 ++++++++++++++++++++++++++--------
 arch/x86/lib/clear_page_64.S   | 39 +++++++------------------------
 3 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 0c623706cb7e..19fddb002cc9 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -17,6 +17,12 @@ extern unsigned long __phys_addr(unsigned long);
 
 #include <linux/string.h>
 
+/**
+ * clear_page() - clear a page using a kernel virtual address.
+ * @page: address of kernel page
+ *
+ * Does absolutely no exception handling.
+ */
 static inline void clear_page(void *page)
 {
 	memset(page, 0, PAGE_SIZE);
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 015d23f3e01f..17b6ae89e211 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,23 +40,45 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 
 #define __phys_reloc_hide(x)	(x)
 
-void clear_page_orig(void *page);
-void clear_page_rep(void *page);
-void clear_page_erms(void *page);
+void memzero_page_aligned_unrolled(void *addr, u64 len);
 
+/**
+ * clear_page() - clear a page using a kernel virtual address.
+ * @page: address of kernel page
+ *
+ * Switch between three implementations of page clearing based on CPU
+ * capabilities:
+ *
+ * - memzero_page_aligned_unrolled(): the oldest, slowest and universally
+ *   supported method. Zeroes via 8-byte MOV instructions unrolled 8x
+ *   to write a 64-byte cacheline in each loop iteration.
+ *
+ * - "rep stosq": really old CPUs had crummy REP implementations.
+ *   Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
+ *   trusted. The instruction writes 8 bytes per REP iteration but
+ *   CPUs can internally batch these together and do larger writes.
+ *
+ * - "rep stosb": CPUs that enumerate 'ERMS' have an improved STOS
+ *   implementation that is less picky about alignment and where
+ *   STOSB (1 byte at a time) is actually faster than STOSQ (8 bytes
+ *   at a time).
+ *
+ * Does absolutely no exception handling.
+ */
 static inline void clear_page(void *page)
 {
+	u64 len = PAGE_SIZE;
 	/*
 	 * Clean up KMSAN metadata for the page being cleared. The assembly call
 	 * below clobbers @page, so we perform unpoisoning before it.
 	 */
-	kmsan_unpoison_memory(page, PAGE_SIZE);
-	alternative_call_2(clear_page_orig,
-			   clear_page_rep, X86_FEATURE_REP_GOOD,
-			   clear_page_erms, X86_FEATURE_ERMS,
-			   "=D" (page),
-			   "D" (page),
-			   "cc", "memory", "rax", "rcx");
+	kmsan_unpoison_memory(page, len);
+	asm volatile(ALTERNATIVE_2("call memzero_page_aligned_unrolled",
+				   "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD,
+				   "rep stosb", X86_FEATURE_ERMS)
+		     : "+c" (len), "+D" (page), ASM_CALL_CONSTRAINT
+		     : "a" (0)
+		     : "cc", "memory");
 }
 
 void copy_page(void *to, void *from);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a508e4a8c66a..27debe0c018c 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -6,30 +6,15 @@
 #include
 
 /*
- * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
- * recommended to use this when possible and we do use them by default.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original.
+ * Zero page aligned region.
+ * %rdi	- dest
+ * %rcx	- length
  */
-
-/*
- * Zero a page.
- * %rdi	- page
- */
-SYM_TYPED_FUNC_START(clear_page_rep)
-	movl $4096/8,%ecx
-	xorl %eax,%eax
-	rep stosq
-	RET
-SYM_FUNC_END(clear_page_rep)
-EXPORT_SYMBOL_GPL(clear_page_rep)
-
-SYM_TYPED_FUNC_START(clear_page_orig)
-	xorl %eax,%eax
-	movl $4096/64,%ecx
+SYM_TYPED_FUNC_START(memzero_page_aligned_unrolled)
+	shrq $6, %rcx
 	.p2align 4
 .Lloop:
-	decl %ecx
+	decq %rcx
 #define PUT(x) movq %rax,x*8(%rdi)
 	movq %rax,(%rdi)
 	PUT(1)
@@ -43,16 +28,8 @@ SYM_TYPED_FUNC_START(clear_page_orig)
 	jnz	.Lloop
 	nop
 	RET
-SYM_FUNC_END(clear_page_orig)
-EXPORT_SYMBOL_GPL(clear_page_orig)
-
-SYM_TYPED_FUNC_START(clear_page_erms)
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	RET
-SYM_FUNC_END(clear_page_erms)
-EXPORT_SYMBOL_GPL(clear_page_erms)
+SYM_FUNC_END(memzero_page_aligned_unrolled)
+EXPORT_SYMBOL_GPL(memzero_page_aligned_unrolled)
 
 /*
  * Default clear user-space.
-- 
2.31.1