From: David Hildenbrand Let's drop all variants that effectively map to clear_page() and provide it in a generic variant instead. We'll use __HAVE_ARCH_CLEAR_USER_PAGE, similar to __HAVE_ARCH_COPY_USER_HIGHPAGE, to indicate whether an architecture provides its own variant. We have to be a bit careful if an architecture provides a custom clear_user_highpage(), because then it's very likely that some special flushing magic is happening behind the scenes. Maybe at some point these should be CONFIG_ options. Note that for parisc, clear_page() and clear_user_page() map to clear_page_asm(), so we can just get rid of the custom clear_user_page() implementation. There is a clear_user_page_asm() function on parisc that seems to be unused. Not sure what's up with that. Signed-off-by: David Hildenbrand --- arch/alpha/include/asm/page.h | 1 - arch/arc/include/asm/page.h | 2 ++ arch/arm/include/asm/page-nommu.h | 1 - arch/arm64/include/asm/page.h | 1 - arch/csky/abiv1/inc/abi/page.h | 1 + arch/csky/abiv2/inc/abi/page.h | 7 ------- arch/hexagon/include/asm/page.h | 1 - arch/loongarch/include/asm/page.h | 1 - arch/m68k/include/asm/page_mm.h | 1 + arch/m68k/include/asm/page_no.h | 1 - arch/microblaze/include/asm/page.h | 1 - arch/mips/include/asm/page.h | 1 + arch/nios2/include/asm/page.h | 1 + arch/openrisc/include/asm/page.h | 1 - arch/parisc/include/asm/page.h | 1 - arch/powerpc/include/asm/page.h | 1 + arch/riscv/include/asm/page.h | 1 - arch/s390/include/asm/page.h | 1 - arch/sparc/include/asm/page_32.h | 2 ++ arch/sparc/include/asm/page_64.h | 1 + arch/um/include/asm/page.h | 1 - arch/x86/include/asm/page.h | 6 ------ arch/xtensa/include/asm/page.h | 1 - include/linux/mm.h | 22 ++++++++++++++++++++++ 24 files changed, 32 insertions(+), 26 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 5ec4c77e432e..d71ef845deca 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -11,7 +11,6 @@ #define STRICT_MM_TYPECHECKS extern void clear_page(void *page); -#define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9720fe6b2c24..cb4d69b473e6 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -32,6 +32,8 @@ struct page; void copy_user_highpage(struct page *to, struct page *from, unsigned long u_vaddr, struct vm_area_struct *vma); + +#define __HAVE_ARCH_CLEAR_USER_PAGE void clear_user_page(void *to, unsigned long u_vaddr, struct page *page); typedef struct { diff --git a/arch/arm/include/asm/page-nommu.h b/arch/arm/include/asm/page-nommu.h index 7c2c72323d17..e74415c959be 100644 --- a/arch/arm/include/asm/page-nommu.h +++ b/arch/arm/include/asm/page-nommu.h @@ -11,7 +11,6 @@ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /* diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 2312e6ee595f..0cb8853c0af4 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -36,7 +36,6 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, void tag_clear_highpage(struct page *to); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE -#define clear_user_page(page, vaddr, pg) clear_page(page) 
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct page *pgtable_t; diff --git a/arch/csky/abiv1/inc/abi/page.h b/arch/csky/abiv1/inc/abi/page.h index 2d2159933b76..08a37f5990cc 100644 --- a/arch/csky/abiv1/inc/abi/page.h +++ b/arch/csky/abiv1/inc/abi/page.h @@ -10,6 +10,7 @@ static inline unsigned long pages_do_alias(unsigned long addr1, return (addr1 ^ addr2) & (SHMLBA-1); } +#define __HAVE_ARCH_CLEAR_USER_PAGE static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) { diff --git a/arch/csky/abiv2/inc/abi/page.h b/arch/csky/abiv2/inc/abi/page.h index cf005f13cd15..a5a255013308 100644 --- a/arch/csky/abiv2/inc/abi/page.h +++ b/arch/csky/abiv2/inc/abi/page.h @@ -1,11 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ - -static inline void clear_user_page(void *addr, unsigned long vaddr, - struct page *page) -{ - clear_page(addr); -} - static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *page) { diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index 137ba7c5de48..f0aed3ed812b 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -113,7 +113,6 @@ static inline void clear_page(void *page) /* * Under assumption that kernel always "sees" user map... */ -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) static inline unsigned long virt_to_pfn(const void *kaddr) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index a3aaf34fba16..b83415fe4ffb 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -30,7 +30,6 @@ extern void clear_page(void *page); extern void copy_page(void *to, void *from); -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) extern unsigned long shm_align_mask; diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index ed782609ca41..10798156121d 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -52,6 +52,7 @@ static inline void clear_page(void *page) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) #endif +#define __HAVE_ARCH_CLEAR_USER_PAGE #define clear_user_page(addr, vaddr, page) \ do { clear_page(addr); \ flush_dcache_page(page); \ diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 39db2026a4b4..d2532bc407ef 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -10,7 +10,6 @@ extern unsigned long memory_end; #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 90ac9f34b4b4..e1e396367ba7 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -45,7 +45,6 @@ typedef unsigned long pte_basic_t; # define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) # define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) -# define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) # define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/mips/include/asm/page.h 
b/arch/mips/include/asm/page.h index bc3e3484c1bf..6b41650c27ab 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -90,6 +90,7 @@ static inline void clear_user_page(void *addr, unsigned long vaddr, if (pages_do_alias((unsigned long) addr, vaddr & PAGE_MASK)) flush_data_cache_page((unsigned long)addr); } +#define __HAVE_ARCH_CLEAR_USER_PAGE struct vm_area_struct; extern void copy_user_highpage(struct page *to, struct page *from, diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 00a51623d38a..ea9cac9e1bc1 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -45,6 +45,7 @@ struct page; +#define __HAVE_ARCH_CLEAR_USER_PAGE extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); extern void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, struct page *to); diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index 85797f94d1d7..d2cdbf3579bb 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -30,7 +30,6 @@ #define clear_page(page) memset((page), 0, PAGE_SIZE) #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /* diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 8f4e51071ea1..3630b36d07da 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -21,7 +21,6 @@ struct vm_area_struct; void clear_page_asm(void *page); void copy_page_asm(void *to, void *from); -#define clear_user_page(vto, vaddr, page) clear_page_asm(vto) void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma); #define __HAVE_ARCH_COPY_USER_HIGHPAGE diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index b28fbb1d57eb..da56e7d42e25 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -271,6 +271,7 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); +#define __HAVE_ARCH_CLEAR_USER_PAGE extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); extern int devmem_is_allowed(unsigned long pfn); diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index ffe213ad65a4..061b60b954ec 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -50,7 +50,6 @@ void clear_page(void *page); #endif #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) #define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 9240a363c893..6635ba56d4b2 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -65,7 +65,6 @@ static inline void copy_page(void *to, void *from) : : "memory", "cc"); } -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index c1bccbedf567..572f62619254 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -17,6 +17,8 @@ #define clear_page(page) 
memset((void *)(page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) + +#define __HAVE_ARCH_CLEAR_USER_PAGE #define clear_user_page(addr, vaddr, page) \ do { clear_page(addr); \ sparc_flush_page_to_ram(page); \ diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index d764d8a8586b..52213c92ee94 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -43,6 +43,7 @@ void _clear_page(void *page); #define clear_page(X) _clear_page((void *)(X)) struct page; void clear_user_page(void *addr, unsigned long vaddr, struct page *page); +#define __HAVE_ARCH_CLEAR_USER_PAGE #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage); #define __HAVE_ARCH_COPY_USER_HIGHPAGE diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 6f54254aaf44..8cea97a9c8f9 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -26,7 +26,6 @@ struct page; #define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) -#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct { unsigned long pte; } pte_t; diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9265f2fca99a..416dc88e35c1 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -22,12 +22,6 @@ struct page; extern struct range pfn_mapped[]; extern int nr_pfn_mapped; -static inline void clear_user_page(void *page, unsigned long vaddr, - struct page *pg) -{ - clear_page(page); -} - static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 20655174b111..059493256765 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -126,7 +126,6 @@ void clear_user_highpage(struct page *page, unsigned long vaddr); void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma); #else -# define clear_user_page(page, vaddr, pg) clear_page(page) # define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..683168b522b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3872,6 +3872,28 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef __HAVE_ARCH_CLEAR_USER_PAGE +/** + * clear_user_page() - clear a page to be mapped to user space + * @addr: the address of the page + * @vaddr: the address of the user mapping + * @page: the page + */ +static inline void clear_user_page(void *addr, unsigned long vaddr, struct page *page) +{ +#ifdef clear_user_highpage + /* + * If an architecture defines its own clear_user_highpage() variant, + * then we have to be a bit more careful here and cannot simply + * rely on clear_page(). + */ + clear_user_highpage(page, vaddr); +#else + clear_page(addr); +#endif +} +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); -- 2.43.5 Introduce clear_pages(), to be overridden by architectures that support more efficient clearing of consecutive pages. 
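As with __HAVE_ARCH_CLEAR_USER_PAGE above, an architecture opts out of the generic loop by providing its own clear_pages() and defining __HAVE_ARCH_CLEAR_PAGES (the x86 patch later in this series does exactly that). A rough sketch of such an opt-in in a hypothetical arch's asm/page.h -- foo_zero_range() is purely an illustrative stand-in for whatever range-zeroing primitive the architecture already has:

  /* Hypothetical arch/foo/include/asm/page.h excerpt (illustration only). */
  void foo_zero_range(void *addr, unsigned long len);  /* arch's existing zeroing primitive */

  static inline void clear_pages(void *addr, unsigned int npages)
  {
          /* Architecture-optimized zeroing of npages * PAGE_SIZE bytes. */
          foo_zero_range(addr, npages * PAGE_SIZE);
  }
  #define __HAVE_ARCH_CLEAR_PAGES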
Also introduce clear_user_pages(), however, we will not expect this function to be overridden anytime soon. We have to place the clear_user_pages() variant that uses clear_user_page() into mm/util.c for now to work around macro magic on sparc and m68k. Signed-off-by: Ankur Arora --- include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ mm/util.c | 13 +++++++++++++ 2 files changed, 54 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 683168b522b3..ecbcb76df9de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3872,6 +3872,26 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef __HAVE_ARCH_CLEAR_PAGES +/** + * clear_pages() - clear a page range for kernel-internal use. + * @addr: start address + * @npages: number of pages + * + * Use clear_user_pages() instead when clearing a page range to be + * mapped to user space. + * + * Does absolutely no exception handling. + */ +static inline void clear_pages(void *addr, unsigned int npages) +{ + do { + clear_page(addr); + addr += PAGE_SIZE; + } while (--npages); +} +#endif + #ifndef __HAVE_ARCH_CLEAR_USER_PAGE /** * clear_user_page() - clear a page to be mapped to user space @@ -3894,6 +3914,27 @@ static inline void clear_user_page(void *addr, unsigned long vaddr, struct page } #endif +/** + * clear_user_pages() - clear a page range to be mapped to user space + * @addr: start address + * @vaddr: start address of the user mapping + * @page: start page + * @npages: number of pages + * + * Assumes that the region (@addr, +@npages) has been validated + * already so this does no exception handling. + */ +#ifdef __HAVE_ARCH_CLEAR_USER_PAGE +void clear_user_pages(void *addr, unsigned long vaddr, + struct page *page, unsigned int npages); +#else /* !__HAVE_ARCH_CLEAR_USER_PAGE */ +static inline void clear_user_pages(void *addr, unsigned long vaddr, + struct page *page, unsigned int npages) +{ + clear_pages(addr, npages); +} +#endif /* __HAVE_ARCH_CLEAR_USER_PAGE */ + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); diff --git a/mm/util.c b/mm/util.c index 8989d5767528..d3b662b71f33 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1344,3 +1344,16 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) } EXPORT_SYMBOL(page_range_contiguous); #endif + +#ifdef __HAVE_ARCH_CLEAR_USER_PAGE +void clear_user_pages(void *addr, unsigned long vaddr, + struct page *page, unsigned int npages) +{ + do { + clear_user_page(addr, vaddr, page); + addr += PAGE_SIZE; + vaddr += PAGE_SIZE; + page++; + } while (--npages); +} +#endif /* __HAVE_ARCH_CLEAR_USER_PAGE */ -- 2.43.5 Define clear_user_highpages() which clears pages sequentially using the single page variant. With !CONFIG_HIGHMEM, pages are contiguous so use the range clearing primitive clear_user_pages(). 
Signed-off-by: Ankur Arora --- include/linux/highmem.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 105cc4c00cc3..c5f8b1556fd7 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -199,6 +199,11 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage +/** + * clear_user_highpage() - clear a page to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + */ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); @@ -207,6 +212,30 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif +/** + * clear_user_highpages() - clear a page range to be mapped to user space + * @page: start page + * @vaddr: start address of the user mapping + * @npages: number of pages + * + * Assumes that all the pages in the region (@page, +@npages) are valid + * so this does no exception handling. + */ +static inline void clear_user_highpages(struct page *page, unsigned long vaddr, + unsigned int npages) +{ + if (!IS_ENABLED(CONFIG_HIGHMEM)) { + clear_user_pages(page_address(page), vaddr, page, npages); + return; + } + + do { + clear_user_highpage(page, vaddr); + vaddr += PAGE_SIZE; + page++; + } while (--npages); +} + #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. -- 2.43.5 clear_page_rep() and clear_page_erms() are wrappers around "REP; STOS" variations. Inlining gets rid of an unnecessary CALL/RET (which isn't free when using RETHUNK speculative execution mitigations). Fixup and rename clear_page_orig() to adapt to the changed calling convention. Also add a comment from Dave Hansen detailing various clearing mechanisms used in clear_page(). Signed-off-by: Ankur Arora Tested-by: Raghavendra K T --- arch/x86/include/asm/page_32.h | 6 +++++ arch/x86/include/asm/page_64.h | 46 +++++++++++++++++++++++++--------- arch/x86/lib/clear_page_64.S | 39 ++++++---------------------- 3 files changed, 48 insertions(+), 43 deletions(-) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 0c623706cb7e..19fddb002cc9 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -17,6 +17,12 @@ extern unsigned long __phys_addr(unsigned long); #include +/** + * clear_page() - clear a page using a kernel virtual address. + * @page: address of kernel page + * + * Does absolutely no exception handling. + */ static inline void clear_page(void *page) { memset(page, 0, PAGE_SIZE); diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 015d23f3e01f..df528cff90ef 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -40,23 +40,45 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -void clear_page_orig(void *page); -void clear_page_rep(void *page); -void clear_page_erms(void *page); +void memzero_page_aligned_unrolled(void *addr, u64 len); -static inline void clear_page(void *page) +/** + * clear_page() - clear a page using a kernel virtual address. 
+ * @addr: address of kernel page + * + * Switch between three implementations of page clearing based on CPU + * capabilities: + * + * - memzero_page_aligned_unrolled(): the oldest, slowest and universally + * supported method. Zeroes via 8-byte MOV instructions unrolled 8x + * to write a 64-byte cacheline in each loop iteration.. + * + * - "rep stosq": really old CPUs had crummy REP implementations. + * Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be + * trusted. The instruction writes 8-byte per REP iteration but + * CPUs can internally batch these together and do larger writes. + * + * - "rep stosb": CPUs that enumerate 'ERMS' have an improved STOS + * implementation that is less picky about alignment and where + * STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes + * at a time.) + * + * Does absolutely no exception handling. + */ +static inline void clear_page(void *addr) { + u64 len = PAGE_SIZE; /* * Clean up KMSAN metadata for the page being cleared. The assembly call - * below clobbers @page, so we perform unpoisoning before it. + * below clobbers @addr, so we perform unpoisoning before it. */ - kmsan_unpoison_memory(page, PAGE_SIZE); - alternative_call_2(clear_page_orig, - clear_page_rep, X86_FEATURE_REP_GOOD, - clear_page_erms, X86_FEATURE_ERMS, - "=D" (page), - "D" (page), - "cc", "memory", "rax", "rcx"); + kmsan_unpoison_memory(addr, len); + asm volatile(ALTERNATIVE_2("call memzero_page_aligned_unrolled", + "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD, + "rep stosb", X86_FEATURE_ERMS) + : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT + : "a" (0) + : "cc", "memory"); } void copy_page(void *to, void *from); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a508e4a8c66a..27debe0c018c 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -6,30 +6,15 @@ #include /* - * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is - * recommended to use this when possible and we do use them by default. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original. + * Zero page aligned region. + * %rdi - dest + * %rcx - length */ - -/* - * Zero a page. - * %rdi - page - */ -SYM_TYPED_FUNC_START(clear_page_rep) - movl $4096/8,%ecx - xorl %eax,%eax - rep stosq - RET -SYM_FUNC_END(clear_page_rep) -EXPORT_SYMBOL_GPL(clear_page_rep) - -SYM_TYPED_FUNC_START(clear_page_orig) - xorl %eax,%eax - movl $4096/64,%ecx +SYM_TYPED_FUNC_START(memzero_page_aligned_unrolled) + shrq $6, %rcx .p2align 4 .Lloop: - decl %ecx + decq %rcx #define PUT(x) movq %rax,x*8(%rdi) movq %rax,(%rdi) PUT(1) @@ -43,16 +28,8 @@ SYM_TYPED_FUNC_START(clear_page_orig) jnz .Lloop nop RET -SYM_FUNC_END(clear_page_orig) -EXPORT_SYMBOL_GPL(clear_page_orig) - -SYM_TYPED_FUNC_START(clear_page_erms) - movl $4096,%ecx - xorl %eax,%eax - rep stosb - RET -SYM_FUNC_END(clear_page_erms) -EXPORT_SYMBOL_GPL(clear_page_erms) +SYM_FUNC_END(memzero_page_aligned_unrolled) +EXPORT_SYMBOL_GPL(memzero_page_aligned_unrolled) /* * Default clear user-space. -- 2.43.5 Performance when clearing with string instructions (x86-64-stosq and similar) can vary significantly based on the chunk-size used. $ perf bench mem memset -k 4KB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 
13.748208 GB/sec $ perf bench mem memset -k 2MB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in # arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 15.067900 GB/sec $ perf bench mem memset -k 1GB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 38.104311 GB/sec (Both on AMD Milan.) With a change in chunk-size from 4KB to 1GB, we see the performance go from 13.7 GB/sec to 38.1 GB/sec. For a chunk-size of 2MB the change isn't quite as drastic but it is worth adding a clear_page() variant that can handle contiguous page-extents. Also define ARCH_PAGE_CONTIG_NR to specify the maximum contiguous page range that can be zeroed when running under cooperative preemption models. This limits the worst case preemption latency. Signed-off-by: Ankur Arora Tested-by: Raghavendra K T --- arch/x86/include/asm/page_64.h | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index df528cff90ef..efab5dc26e3e 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -43,8 +43,9 @@ extern unsigned long __phys_addr_symbol(unsigned long); void memzero_page_aligned_unrolled(void *addr, u64 len); /** - * clear_page() - clear a page using a kernel virtual address. - * @addr: address of kernel page + * clear_pages() - clear a page range using a kernel virtual address. + * @addr: start address of kernel page range + * @npages: number of pages * * Switch between three implementations of page clearing based on CPU * capabilities: @@ -65,11 +66,11 @@ void memzero_page_aligned_unrolled(void *addr, u64 len); * * Does absolutely no exception handling. */ -static inline void clear_page(void *addr) +static inline void clear_pages(void *addr, unsigned int npages) { - u64 len = PAGE_SIZE; + u64 len = npages * PAGE_SIZE; /* - * Clean up KMSAN metadata for the page being cleared. The assembly call + * Clean up KMSAN metadata for the pages being cleared. The assembly call * below clobbers @addr, so we perform unpoisoning before it. */ kmsan_unpoison_memory(addr, len); @@ -80,6 +81,21 @@ static inline void clear_page(void *addr) : "a" (0) : "cc", "memory"); } +#define __HAVE_ARCH_CLEAR_PAGES + +/* + * When running under cooperatively scheduled preemption models limit the + * maximum contiguous extent that can be cleared to pages worth 8MB. + * + * With a clearing BW of ~10GBps, this should result in worst case scheduling + * latency of ~1ms. + */ +#define ARCH_PAGE_CONTIG_NR (8 << (20 - PAGE_SHIFT)) + +static inline void clear_page(void *addr) +{ + clear_pages(addr, 1); +} void copy_page(void *to, void *from); KCFI_REFERENCE(copy_page); -- 2.43.5 Clear contiguous page ranges in folio_zero_user() instead of clearing a page-at-a-time. This enables CPU specific optimizations based on the length of the region. Operating on arbitrarily large regions can lead to high preemption latency under cooperative preemption models. So, limit the worst case preemption latency via architecture-specified PAGE_CONTIG_NR units (see the chunking sketch below). The resultant performance depends on the kinds of optimizations available to the CPU for the region being cleared. Two classes of optimizations: - clearing iteration costs can be amortized over a range larger than a single page. - cacheline allocation elision (seen on AMD Zen models). 
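To make the latency bound concrete: with the 8MB ARCH_PAGE_CONTIG_NR cap and the ~10 GB/s clearing bandwidth quoted above, each contiguous unit takes roughly 0.8 ms. A simplified userspace sketch of the chunking policy (illustration only -- memset() stands in for clear_pages(), and the kernel's rescheduling point is marked by a comment):

  #include <string.h>

  #define PAGE_SIZE       4096UL
  #define PAGE_CONTIG_NR  (8UL << (20 - 12))   /* 8MB worth of 4K pages = 2048 */

  static void clear_pages_chunked(void *addr, unsigned long npages)
  {
          while (npages) {
                  unsigned long count = npages < PAGE_CONTIG_NR ? npages : PAGE_CONTIG_NR;

                  memset(addr, 0, count * PAGE_SIZE);   /* stands in for clear_pages() */
                  addr = (char *)addr + count * PAGE_SIZE;
                  npages -= count;
                  /* cond_resched() would go here in the kernel */
          }
  }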
Testing a demand fault workload shows an improved baseline from the first optimization and a larger improvement when the region being cleared is large enough for the second optimization. AMD Milan (EPYC 7J13, boost=0, region=64GB on the local NUMA node): $ perf bench mem map -p $pg-sz -f demand -s 64GB -l 5 page-at-a-time contiguous clearing change (GB/s +- %stdev) (GB/s +- %stdev) pg-sz=2MB 12.92 +- 2.55% 17.03 +- 0.70% + 31.8% preempt=* pg-sz=1GB 17.14 +- 2.27% 18.04 +- 1.05% [#] + 5.2% preempt=none|voluntary pg-sz=1GB 17.26 +- 1.24% 42.17 +- 4.21% +144.3% preempt=full|lazy [#] AMD Milan uses a threshold of LLC-size (~32MB) for eliding cacheline allocation, which is larger than ARCH_PAGE_CONTIG_NR, so preempt=none|voluntary see no improvement on the pg-sz=1GB. Also as mentioned earlier, the baseline improvement is not specific to AMD Zen platforms. Intel Icelakex (pg-sz=2MB|1GB) sees a similar improvement as the Milan pg-sz=2MB workload above (~30%). Signed-off-by: Ankur Arora Reviewed-by: Raghavendra K T Tested-by: Raghavendra K T --- include/linux/mm.h | 6 ++++++ mm/memory.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecbcb76df9de..02db84667f97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3872,6 +3872,12 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef ARCH_PAGE_CONTIG_NR +#define PAGE_CONTIG_NR 1 +#else +#define PAGE_CONTIG_NR ARCH_PAGE_CONTIG_NR +#endif + #ifndef __HAVE_ARCH_CLEAR_PAGES /** * clear_pages() - clear a page range for kernel-internal use. diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323..7781b2aa18a8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7144,40 +7144,40 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, - unsigned int nr_pages) +/* + * Clear contiguous pages chunking them up when running under + * non-preemptible models. + */ +static void clear_contig_highpages(struct page *page, unsigned long addr, + unsigned int npages) { - unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); - int i; + unsigned int i, count, unit; - might_sleep(); - for (i = 0; i < nr_pages; i++) { + unit = preempt_model_preemptible() ? npages : PAGE_CONTIG_NR; + + for (i = 0; i < npages; ) { + count = min(unit, npages - i); + clear_user_highpages(page + i, + addr + i * PAGE_SIZE, count); + i += count; cond_resched(); - clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } -static int clear_subpage(unsigned long addr, int idx, void *arg) -{ - struct folio *folio = arg; - - clear_user_highpage(folio_page(folio, idx), addr); - return 0; -} - /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. - * @addr_hint: The address will be accessed or the base address if uncelar. + * @addr_hint: The address accessed by the user or the base address. + * + * Uses architectural support for clear_pages() to zero page extents + * instead of clearing page-at-a-time. 
*/ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned int nr_pages = folio_nr_pages(folio); + unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) - clear_gigantic_page(folio, addr_hint, nr_pages); - else - process_huge_page(addr_hint, nr_pages, clear_subpage, folio); + clear_contig_highpages(folio_page(folio, 0), + base_addr, folio_nr_pages(folio)); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, -- 2.43.5 folio_zero_user() does straight zeroing without caring about temporal locality for caches. This replaced commit c6ddfb6c5890 ("mm, clear_huge_page: move order algorithm into a separate function") where we cleared a page at a time converging to the faulting page from the left and the right. To retain limited temporal locality, split the clearing in three parts: the faulting page and its immediate neighbourhood, and, the remaining regions on the left and the right. The local neighbourhood will be cleared last. Do this only when zeroing small folios (< MAX_ORDER_NR_PAGES) since there isn't much expectation of cache locality for large folios. Performance === AMD Genoa (EPYC 9J14, cpus=2 sockets * 96 cores * 2 threads, memory=2.2 TB, L1d= 16K/thread, L2=512K/thread, L3=2MB/thread) anon-w-seq (vm-scalability): stime utime page-at-a-time 1654.63 ( +- 3.84% ) 811.00 ( +- 3.84% ) contiguous clearing 1602.86 ( +- 3.00% ) 970.75 ( +- 4.68% ) neighbourhood-last 1630.32 ( +- 2.73% ) 886.37 ( +- 5.19% ) Both stime and utime respond in expected ways. stime drops for both contiguous clearing (-3.14%) and neighbourhood-last (-1.46%) approaches. However, utime increases for both contiguous clearing (+19.7%) and neighbourhood-last (+9.28%). In part this is because anon-w-seq runs with 384 processes zeroing anonymously mapped memory which they then access sequentially. As such this is a likely uncommon pattern where the memory bandwidth is saturated while also being cache limited because we access the entire region. Kernel make workload (make -j 12 bzImage): stime utime page-at-a-time 138.16 ( +- 0.31% ) 1015.11 ( +- 0.05% ) contiguous clearing 133.42 ( +- 0.90% ) 1013.49 ( +- 0.05% ) neighbourhood-last 131.20 ( +- 0.76% ) 1011.36 ( +- 0.07% ) For make the utime stays relatively flat with an up to 4.9% improvement in the stime. Signed-off-by: Ankur Arora Reviewed-by: Raghavendra K T Tested-by: Raghavendra K T --- mm/memory.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7781b2aa18a8..53a10c06a26d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7171,13 +7171,53 @@ static void clear_contig_highpages(struct page *page, unsigned long addr, * * Uses architectural support for clear_pages() to zero page extents * instead of clearing page-at-a-time. + * + * Clearing of small folios (< MAX_ORDER_NR_PAGES) is split in three parts: + * pages in the immediate locality of the faulting page, and its left, right + * regions; the local neighbourhood cleared last in order to keep cache + * lines of the target region hot. + * + * For larger folios we assume that there is no expectation of cache locality + * and just do a straight zero. 
*/ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); + const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE; + const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1); + const int width = 2; /* number of pages cleared last on either side */ + struct range r[3]; + int i; - clear_contig_highpages(folio_page(folio, 0), - base_addr, folio_nr_pages(folio)); + if (folio_nr_pages(folio) > MAX_ORDER_NR_PAGES) { + clear_contig_highpages(folio_page(folio, 0), + base_addr, folio_nr_pages(folio)); + return; + } + + /* + * Faulting page and its immediate neighbourhood. Cleared at the end to + * ensure it sticks around in the cache. + */ + r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - width, pg.start, pg.end), + clamp_t(s64, fault_idx + width, pg.start, pg.end)); + + /* Region to the left of the fault */ + r[1] = DEFINE_RANGE(pg.start, + clamp_t(s64, r[2].start-1, pg.start-1, r[2].start)); + + /* Region to the right of the fault: always valid for the common fault_idx=0 case. */ + r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end+1, r[2].end, pg.end+1), + pg.end); + + for (i = 0; i <= 2; i++) { + unsigned int npages = range_len(&r[i]); + struct page *page = folio_page(folio, r[i].start); + unsigned long addr = base_addr + folio_page_idx(folio, page) * PAGE_SIZE; + + if (npages > 0) + clear_contig_highpages(page, addr, npages); + } } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, -- 2.43.5
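For reference, a standalone userspace sketch of the range arithmetic in the final hunk (illustration only; clamp() below mimics the kernel's clamp_t(), and struct range is inclusive on both ends as in the kernel):

  #include <stdio.h>

  struct range { long start, end; };      /* inclusive bounds */

  static long clamp(long v, long lo, long hi)
  {
          return v < lo ? lo : (v > hi ? hi : v);
  }

  int main(void)
  {
          const long nr_pages = 512, fault_idx = 13, width = 2;
          const struct range pg = { 0, nr_pages - 1 };
          struct range r[3];

          /* Faulting page and its immediate neighbourhood, cleared last. */
          r[2].start = clamp(fault_idx - width, pg.start, pg.end);
          r[2].end   = clamp(fault_idx + width, pg.start, pg.end);
          /* Pages to the left of the neighbourhood. */
          r[1].start = pg.start;
          r[1].end   = clamp(r[2].start - 1, pg.start - 1, r[2].start);
          /* Pages to the right of the neighbourhood. */
          r[0].start = clamp(r[2].end + 1, r[2].end, pg.end + 1);
          r[0].end   = pg.end;

          /* Clearing order: right region, left region, then the fault neighbourhood. */
          for (int i = 0; i <= 2; i++) {
                  long npages = r[i].end - r[i].start + 1;

                  if (npages > 0)
                          printf("clear pages [%ld, %ld] (%ld pages)\n",
                                 r[i].start, r[i].end, npages);
          }
          return 0;
  }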