We have two copies of each mem benchmark: one using cycles to measure time, the second for gettimeofday(). Unify. Reviewed-by: Namhyung Kim Signed-off-by: Ankur Arora --- tools/perf/bench/mem-functions.c | 110 +++++++++++++------------------ 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 19d45c377ac1..8599ed96ee1f 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -51,6 +51,11 @@ static const struct option options[] = { OPT_END() }; +union bench_clock { + u64 cycles; + struct timeval tv; +}; + typedef void *(*memcpy_t)(void *, const void *, size_t); typedef void *(*memset_t)(void *, int, size_t); @@ -91,6 +96,26 @@ static u64 get_cycles(void) return clk; } +static void clock_get(union bench_clock *t) +{ + if (use_cycles) + t->cycles = get_cycles(); + else + BUG_ON(gettimeofday(&t->tv, NULL)); +} + +static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e) +{ + union bench_clock t; + + if (use_cycles) + t.cycles = e->cycles - s->cycles; + else + timersub(&e->tv, &s->tv, &t.tv); + + return t; +} + static double timeval2double(struct timeval *ts) { return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC; @@ -109,8 +134,7 @@ static double timeval2double(struct timeval *ts) struct bench_mem_info { const struct function *functions; - u64 (*do_cycles)(const struct function *r, size_t size, void *src, void *dst); - double (*do_gettimeofday)(const struct function *r, size_t size, void *src, void *dst); + union bench_clock (*do_op)(const struct function *r, size_t size, void *src, void *dst); const char *const *usage; bool alloc_src; }; @@ -119,7 +143,7 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t { const struct function *r = &info->functions[r_idx]; double result_bps = 0.0; - u64 result_cycles = 0; + union bench_clock rt = { 0 }; void *src = NULL, *dst = zalloc(size); printf("# function '%s' (%s)\n", r->name, r->desc); @@ -136,25 +160,23 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s bytes ...\n\n", size_str); - if (use_cycles) { - result_cycles = info->do_cycles(r, size, src, dst); - } else { - result_bps = info->do_gettimeofday(r, size, src, dst); - } + rt = info->do_op(r, size, src, dst); switch (bench_format) { case BENCH_FORMAT_DEFAULT: if (use_cycles) { - printf(" %14lf cycles/byte\n", (double)result_cycles/size_total); + printf(" %14lf cycles/byte\n", (double)rt.cycles/size_total); } else { + result_bps = size_total/timeval2double(&rt.tv); print_bps(result_bps); } break; case BENCH_FORMAT_SIMPLE: if (use_cycles) { - printf("%lf\n", (double)result_cycles/size_total); + printf("%lf\n", (double)rt.cycles/size_total); } else { + result_bps = size_total/timeval2double(&rt.tv); printf("%lf\n", result_bps); } break; @@ -235,38 +257,21 @@ static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) fn(dst, src, size); } -static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, void *dst) +static union bench_clock do_memcpy(const struct function *r, size_t size, + void *src, void *dst) { - u64 cycle_start = 0ULL, cycle_end = 0ULL; + union bench_clock start, end; memcpy_t fn = r->fn.memcpy; int i; memcpy_prefault(fn, size, src, dst); - cycle_start = get_cycles(); + clock_get(&start); for (i = 0; i < nr_loops; ++i) fn(dst, src, size); - cycle_end = get_cycles(); + 
clock_get(&end); - return cycle_end - cycle_start; -} - -static double do_memcpy_gettimeofday(const struct function *r, size_t size, void *src, void *dst) -{ - struct timeval tv_start, tv_end, tv_diff; - memcpy_t fn = r->fn.memcpy; - int i; - - memcpy_prefault(fn, size, src, dst); - - BUG_ON(gettimeofday(&tv_start, NULL)); - for (i = 0; i < nr_loops; ++i) - fn(dst, src, size); - BUG_ON(gettimeofday(&tv_end, NULL)); - - timersub(&tv_end, &tv_start, &tv_diff); - - return (double)(((double)size * nr_loops) / timeval2double(&tv_diff)); + return clock_diff(&start, &end); } struct function memcpy_functions[] = { @@ -292,8 +297,7 @@ int bench_mem_memcpy(int argc, const char **argv) { struct bench_mem_info info = { .functions = memcpy_functions, - .do_cycles = do_memcpy_cycles, - .do_gettimeofday = do_memcpy_gettimeofday, + .do_op = do_memcpy, .usage = bench_mem_memcpy_usage, .alloc_src = true, }; @@ -301,9 +305,10 @@ int bench_mem_memcpy(int argc, const char **argv) return bench_mem_common(argc, argv, &info); } -static u64 do_memset_cycles(const struct function *r, size_t size, void *src __maybe_unused, void *dst) +static union bench_clock do_memset(const struct function *r, size_t size, + void *src __maybe_unused, void *dst) { - u64 cycle_start = 0ULL, cycle_end = 0ULL; + union bench_clock start, end; memset_t fn = r->fn.memset; int i; @@ -313,34 +318,12 @@ static u64 do_memset_cycles(const struct function *r, size_t size, void *src __m */ fn(dst, -1, size); - cycle_start = get_cycles(); + clock_get(&start); for (i = 0; i < nr_loops; ++i) fn(dst, i, size); - cycle_end = get_cycles(); + clock_get(&end); - return cycle_end - cycle_start; -} - -static double do_memset_gettimeofday(const struct function *r, size_t size, void *src __maybe_unused, void *dst) -{ - struct timeval tv_start, tv_end, tv_diff; - memset_t fn = r->fn.memset; - int i; - - /* - * We prefault the freshly allocated memory range here, - * to not measure page fault overhead: - */ - fn(dst, -1, size); - - BUG_ON(gettimeofday(&tv_start, NULL)); - for (i = 0; i < nr_loops; ++i) - fn(dst, i, size); - BUG_ON(gettimeofday(&tv_end, NULL)); - - timersub(&tv_end, &tv_start, &tv_diff); - - return (double)(((double)size * nr_loops) / timeval2double(&tv_diff)); + return clock_diff(&start, &end); } static const char * const bench_mem_memset_usage[] = { @@ -366,8 +349,7 @@ int bench_mem_memset(int argc, const char **argv) { struct bench_mem_info info = { .functions = memset_functions, - .do_cycles = do_memset_cycles, - .do_gettimeofday = do_memset_gettimeofday, + .do_op = do_memset, .usage = bench_mem_memset_usage, }; -- 2.31.1 Do type conversion to double at the point of use. 
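For example, with the names used in this file (a sketch of the intended pattern, not part of the diff below):

	/* Keep byte counts integral; widen only where the result is fractional. */
	size_t size_total = size * nr_loops;
	double result_bps = (double)size_total / timeval2double(&rt.tv);

This keeps size_total exact and confines floating-point rounding to the final division.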
Reviewed-by: Namhyung Kim Signed-off-by: Ankur Arora --- tools/perf/bench/mem-functions.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 8599ed96ee1f..fddb2acd2d3a 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -139,7 +139,7 @@ struct bench_mem_info { bool alloc_src; }; -static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t size, double size_total) +static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t size, size_t size_total) { const struct function *r = &info->functions[r_idx]; double result_bps = 0.0; @@ -165,18 +165,18 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t switch (bench_format) { case BENCH_FORMAT_DEFAULT: if (use_cycles) { - printf(" %14lf cycles/byte\n", (double)rt.cycles/size_total); + printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)size_total); } else { - result_bps = size_total/timeval2double(&rt.tv); + result_bps = (double)size_total/timeval2double(&rt.tv); print_bps(result_bps); } break; case BENCH_FORMAT_SIMPLE: if (use_cycles) { - printf("%lf\n", (double)rt.cycles/size_total); + printf("%lf\n", (double)rt.cycles/(double)size_total); } else { - result_bps = size_total/timeval2double(&rt.tv); + result_bps = (double)size_total/timeval2double(&rt.tv); printf("%lf\n", result_bps); } break; @@ -199,7 +199,7 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * { int i; size_t size; - double size_total; + size_t size_total; argc = parse_options(argc, argv, options, info->usage, 0); @@ -212,7 +212,7 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } size = (size_t)perf_atoll((char *)size_str); - size_total = (double)size * nr_loops; + size_total = size * nr_loops; if ((s64)size <= 0) { fprintf(stderr, "Invalid size:%s\n", size_str); -- 2.31.1 Move benchmark function parameters in struct bench_params. Reviewed-by: Namhyung Kim Signed-off-by: Ankur Arora --- tools/perf/bench/mem-functions.c | 62 +++++++++++++++++--------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index fddb2acd2d3a..4d723774c1b3 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -30,7 +30,7 @@ static const char *size_str = "1MB"; static const char *function_str = "all"; -static int nr_loops = 1; +static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; @@ -42,7 +42,7 @@ static const struct option options[] = { OPT_STRING('f', "function", &function_str, "all", "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), - OPT_INTEGER('l', "nr_loops", &nr_loops, + OPT_UINTEGER('l', "nr_loops", &nr_loops, "Specify the number of loops to run. 
(default: 1)"), OPT_BOOLEAN('c', "cycles", &use_cycles, @@ -56,6 +56,12 @@ union bench_clock { struct timeval tv; }; +struct bench_params { + size_t size; + size_t size_total; + unsigned int nr_loops; +}; + typedef void *(*memcpy_t)(void *, const void *, size_t); typedef void *(*memset_t)(void *, int, size_t); @@ -134,17 +140,19 @@ static double timeval2double(struct timeval *ts) struct bench_mem_info { const struct function *functions; - union bench_clock (*do_op)(const struct function *r, size_t size, void *src, void *dst); + union bench_clock (*do_op)(const struct function *r, struct bench_params *p, + void *src, void *dst); const char *const *usage; bool alloc_src; }; -static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t size, size_t size_total) +static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p, + int r_idx) { const struct function *r = &info->functions[r_idx]; double result_bps = 0.0; union bench_clock rt = { 0 }; - void *src = NULL, *dst = zalloc(size); + void *src = NULL, *dst = zalloc(p->size); printf("# function '%s' (%s)\n", r->name, r->desc); @@ -152,7 +160,7 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t goto out_alloc_failed; if (info->alloc_src) { - src = zalloc(size); + src = zalloc(p->size); if (src == NULL) goto out_alloc_failed; } @@ -160,23 +168,23 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s bytes ...\n\n", size_str); - rt = info->do_op(r, size, src, dst); + rt = info->do_op(r, p, src, dst); switch (bench_format) { case BENCH_FORMAT_DEFAULT: if (use_cycles) { - printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)size_total); + printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total); } else { - result_bps = (double)size_total/timeval2double(&rt.tv); + result_bps = (double)p->size_total/timeval2double(&rt.tv); print_bps(result_bps); } break; case BENCH_FORMAT_SIMPLE: if (use_cycles) { - printf("%lf\n", (double)rt.cycles/(double)size_total); + printf("%lf\n", (double)rt.cycles/(double)p->size_total); } else { - result_bps = (double)size_total/timeval2double(&rt.tv); + result_bps = (double)p->size_total/timeval2double(&rt.tv); printf("%lf\n", result_bps); } break; @@ -198,8 +206,7 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info) { int i; - size_t size; - size_t size_total; + struct bench_params p = { 0 }; argc = parse_options(argc, argv, options, info->usage, 0); @@ -211,17 +218,18 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } } - size = (size_t)perf_atoll((char *)size_str); - size_total = size * nr_loops; + p.nr_loops = nr_loops; + p.size = (size_t)perf_atoll((char *)size_str); - if ((s64)size <= 0) { + if ((s64)p.size <= 0) { fprintf(stderr, "Invalid size:%s\n", size_str); return 1; } + p.size_total = p.size * p.nr_loops; if (!strncmp(function_str, "all", 3)) { for (i = 0; info->functions[i].name; i++) - __bench_mem_function(info, i, size, size_total); + __bench_mem_function(info, &p, i); return 0; } @@ -240,7 +248,7 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * return 1; } - __bench_mem_function(info, i, size, size_total); + __bench_mem_function(info, &p, i); return 0; } @@ -257,18 +265,17 @@ static void memcpy_prefault(memcpy_t fn, size_t size, void *src, 
void *dst) fn(dst, src, size); } -static union bench_clock do_memcpy(const struct function *r, size_t size, +static union bench_clock do_memcpy(const struct function *r, struct bench_params *p, void *src, void *dst) { union bench_clock start, end; memcpy_t fn = r->fn.memcpy; - int i; - memcpy_prefault(fn, size, src, dst); + memcpy_prefault(fn, p->size, src, dst); clock_get(&start); - for (i = 0; i < nr_loops; ++i) - fn(dst, src, size); + for (unsigned int i = 0; i < p->nr_loops; ++i) + fn(dst, src, p->size); clock_get(&end); return clock_diff(&start, &end); @@ -305,22 +312,21 @@ int bench_mem_memcpy(int argc, const char **argv) return bench_mem_common(argc, argv, &info); } -static union bench_clock do_memset(const struct function *r, size_t size, +static union bench_clock do_memset(const struct function *r, struct bench_params *p, void *src __maybe_unused, void *dst) { union bench_clock start, end; memset_t fn = r->fn.memset; - int i; /* * We prefault the freshly allocated memory range here, * to not measure page fault overhead: */ - fn(dst, -1, size); + fn(dst, -1, p->size); clock_get(&start); - for (i = 0; i < nr_loops; ++i) - fn(dst, i, size); + for (unsigned int i = 0; i < p->nr_loops; ++i) + fn(dst, i, p->size); clock_get(&end); return clock_diff(&start, &end); -- 2.31.1 No functional change. Signed-off-by: Ankur Arora --- tools/perf/bench/mem-functions.c | 103 +++++++++++++------ tools/perf/bench/mem-memcpy-arch.h | 2 +- tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 4 + tools/perf/bench/mem-memset-arch.h | 2 +- tools/perf/bench/mem-memset-x86-64-asm-def.h | 4 + 5 files changed, 81 insertions(+), 34 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 4d723774c1b3..60ea20277507 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -62,15 +62,31 @@ struct bench_params { unsigned int nr_loops; }; +struct bench_mem_info { + const struct function *functions; + int (*do_op)(const struct function *r, struct bench_params *p, + void *src, void *dst, union bench_clock *rt); + const char *const *usage; + bool alloc_src; +}; + +typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *, + void **, void **); +typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *, + void **, void **); typedef void *(*memcpy_t)(void *, const void *, size_t); typedef void *(*memset_t)(void *, int, size_t); struct function { const char *name; const char *desc; - union { - memcpy_t memcpy; - memset_t memset; + struct { + mem_init_t init; + mem_fini_t fini; + union { + memcpy_t memcpy; + memset_t memset; + }; } fn; }; @@ -138,37 +154,24 @@ static double timeval2double(struct timeval *ts) printf(" %14lf GB/sec\n", x / K / K / K); \ } while (0) -struct bench_mem_info { - const struct function *functions; - union bench_clock (*do_op)(const struct function *r, struct bench_params *p, - void *src, void *dst); - const char *const *usage; - bool alloc_src; -}; - static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p, int r_idx) { const struct function *r = &info->functions[r_idx]; double result_bps = 0.0; union bench_clock rt = { 0 }; - void *src = NULL, *dst = zalloc(p->size); + void *src = NULL, *dst = NULL; printf("# function '%s' (%s)\n", r->name, r->desc); - if (dst == NULL) - goto out_alloc_failed; - - if (info->alloc_src) { - src = zalloc(p->size); - if (src == NULL) - goto out_alloc_failed; - } + if (r->fn.init && r->fn.init(info, p, &src, &dst)) + goto out_init_failed; if 
(bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s bytes ...\n\n", size_str); - rt = info->do_op(r, p, src, dst); + if (info->do_op(r, p, src, dst, &rt)) + goto out_test_failed; switch (bench_format) { case BENCH_FORMAT_DEFAULT: @@ -194,11 +197,11 @@ static void __bench_mem_function(struct bench_mem_info *info, struct bench_param break; } +out_test_failed: out_free: - free(src); - free(dst); + if (r->fn.fini) r->fn.fini(info, p, &src, &dst); return; -out_alloc_failed: +out_init_failed: printf("# Memory allocation failed - maybe size (%s) is too large?\n", size_str); goto out_free; } @@ -265,8 +268,8 @@ static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) fn(dst, src, size); } -static union bench_clock do_memcpy(const struct function *r, struct bench_params *p, - void *src, void *dst) +static int do_memcpy(const struct function *r, struct bench_params *p, + void *src, void *dst, union bench_clock *rt) { union bench_clock start, end; memcpy_t fn = r->fn.memcpy; @@ -278,16 +281,47 @@ static union bench_clock do_memcpy(const struct function *r, struct bench_params fn(dst, src, p->size); clock_get(&end); - return clock_diff(&start, &end); + *rt = clock_diff(&start, &end); + + return 0; +} + +static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p, + void **src, void **dst) +{ + bool failed; + + *dst = zalloc(p->size); + failed = *dst == NULL; + + if (info->alloc_src) { + *src = zalloc(p->size); + failed = failed || *src == NULL; + } + + return failed; +} + +static void mem_free(struct bench_mem_info *info __maybe_unused, + struct bench_params *p __maybe_unused, + void **src, void **dst) +{ + free(*dst); + free(*src); + + *dst = *src = NULL; } struct function memcpy_functions[] = { { .name = "default", .desc = "Default memcpy() provided by glibc", + .fn.init = mem_alloc, + .fn.fini = mem_free, .fn.memcpy = memcpy }, #ifdef HAVE_ARCH_X86_64_SUPPORT -# define MEMCPY_FN(_fn, _name, _desc) {.name = _name, .desc = _desc, .fn.memcpy = _fn}, +# define MEMCPY_FN(_fn, _init, _fini, _name, _desc) \ + {.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini }, # include "mem-memcpy-x86-64-asm-def.h" # undef MEMCPY_FN #endif @@ -312,8 +346,8 @@ int bench_mem_memcpy(int argc, const char **argv) return bench_mem_common(argc, argv, &info); } -static union bench_clock do_memset(const struct function *r, struct bench_params *p, - void *src __maybe_unused, void *dst) +static int do_memset(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst, union bench_clock *rt) { union bench_clock start, end; memset_t fn = r->fn.memset; @@ -329,7 +363,9 @@ static union bench_clock do_memset(const struct function *r, struct bench_params fn(dst, i, p->size); clock_get(&end); - return clock_diff(&start, &end); + *rt = clock_diff(&start, &end); + + return 0; } static const char * const bench_mem_memset_usage[] = { @@ -340,10 +376,13 @@ static const char * const bench_mem_memset_usage[] = { static const struct function memset_functions[] = { { .name = "default", .desc = "Default memset() provided by glibc", + .fn.init = mem_alloc, + .fn.fini = mem_free, .fn.memset = memset }, #ifdef HAVE_ARCH_X86_64_SUPPORT -# define MEMSET_FN(_fn, _name, _desc) { .name = _name, .desc = _desc, .fn.memset = _fn }, +# define MEMSET_FN(_fn, _init, _fini, _name, _desc) \ + {.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini }, # include "mem-memset-x86-64-asm-def.h" # undef MEMSET_FN #endif diff --git 
a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h index 5bcaec5601a8..852e48cfd8fe 100644 --- a/tools/perf/bench/mem-memcpy-arch.h +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -2,7 +2,7 @@ #ifdef HAVE_ARCH_X86_64_SUPPORT -#define MEMCPY_FN(fn, name, desc) \ +#define MEMCPY_FN(fn, init, fini, name, desc) \ void *fn(void *, const void *, size_t); #include "mem-memcpy-x86-64-asm-def.h" diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index 6188e19d3129..f43038f4448b 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ MEMCPY_FN(memcpy_orig, + mem_alloc, + mem_free, "x86-64-unrolled", "unrolled memcpy() in arch/x86/lib/memcpy_64.S") MEMCPY_FN(__memcpy, + mem_alloc, + mem_free, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h index 53f45482663f..278c5da12d63 100644 --- a/tools/perf/bench/mem-memset-arch.h +++ b/tools/perf/bench/mem-memset-arch.h @@ -2,7 +2,7 @@ #ifdef HAVE_ARCH_X86_64_SUPPORT -#define MEMSET_FN(fn, name, desc) \ +#define MEMSET_FN(fn, init, fini, name, desc) \ void *fn(void *, int, size_t); #include "mem-memset-x86-64-asm-def.h" diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index 247c72fdfb9d..80ad1b7ea770 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ MEMSET_FN(memset_orig, + mem_alloc, + mem_free, "x86-64-unrolled", "unrolled memset() in arch/x86/lib/memset_64.S") MEMSET_FN(__memset, + mem_alloc, + mem_free, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") -- 2.31.1 Using mmap() ensures that the buffer is always aligned at a fixed boundary. Switch to that to remove one source of variability. Since we always want to read/write from the allocated buffers map with pagetables pre-populated. Reviewed-by: Namhyung Kim Signed-off-by: Ankur Arora --- tools/perf/bench/mem-functions.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 60ea20277507..e97962dd8f81 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -22,9 +22,9 @@ #include #include #include +#include #include #include -#include #define K 1024 @@ -286,16 +286,33 @@ static int do_memcpy(const struct function *r, struct bench_params *p, return 0; } +static void *bench_mmap(size_t size, bool populate) +{ + void *p; + int extra = populate ? MAP_POPULATE : 0; + + p = mmap(NULL, size, PROT_READ|PROT_WRITE, + extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + return p == MAP_FAILED ? 
NULL : p; +} + +static void bench_munmap(void *p, size_t size) +{ + if (p) + munmap(p, size); +} + static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p, void **src, void **dst) { bool failed; - *dst = zalloc(p->size); + *dst = bench_mmap(p->size, true); failed = *dst == NULL; if (info->alloc_src) { - *src = zalloc(p->size); + *src = bench_mmap(p->size, true); failed = failed || *src == NULL; } @@ -306,8 +323,8 @@ static void mem_free(struct bench_mem_info *info __maybe_unused, struct bench_params *p __maybe_unused, void **src, void **dst) { - free(*dst); - free(*src); + bench_munmap(*dst, p->size); + bench_munmap(*src, p->size); *dst = *src = NULL; } -- 2.31.1 Page sizes that can be selected: 4KB, 2MB, 1GB. Both the reservation of hugepages and the node they are allocated from are expected to be handled by the user. An example of page-size selection: $ perf bench mem memset -s 4gb -p 2mb # Running 'mem/memset' benchmark: # function 'default' (Default memset() provided by glibc) # Copying 4gb bytes ... 14.919194 GB/sec # function 'x86-64-unrolled' (unrolled memset() in arch/x86/lib/memset_64.S) # Copying 4gb bytes ... 11.514503 GB/sec # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4gb bytes ... 12.600568 GB/sec Signed-off-by: Ankur Arora --- tools/perf/Documentation/perf-bench.txt | 14 +++++++++-- tools/perf/bench/mem-functions.c | 33 ++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 8331bd28b10e..04cdc31a0b0b 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -177,11 +177,16 @@ Suite for evaluating performance of simple memory copy in various ways. Options of *memcpy* ^^^^^^^^^^^^^^^^^^^ --l:: +-s:: --size:: Specify size of memory to copy (default: 1MB). Available units are B, KB, MB, GB and TB (case insensitive). +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + -f:: --function:: Specify function to copy (default: default). @@ -201,11 +206,16 @@ Suite for evaluating performance of simple memory set in various ways. Options of *memset* ^^^^^^^^^^^^^^^^^ --l:: +-s:: --size:: Specify size of memory to set (default: 1MB). Available units are B, KB, MB, GB and TB (case insensitive). +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + -f:: --function:: Specify function to set (default: default). diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index e97962dd8f81..6aa1f02553ba 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -25,11 +25,17 @@ #include #include #include +#include #define K 1024 +#define PAGE_SHIFT_4KB 12 +#define PAGE_SHIFT_2MB 21 +#define PAGE_SHIFT_1GB 30 + static const char *size_str = "1MB"; static const char *function_str = "all"; +static const char *page_size_str = "4KB"; static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; @@ -39,6 +45,10 @@ static const struct option options[] = { "Specify the size of the memory buffers. " "Available units: B, KB, MB, GB and TB (case insensitive)"), + OPT_STRING('p', "page", &page_size_str, "4KB", + "Specify page-size for mapping memory buffers. 
" + "Available sizes: 4KB, 2MB, 1GB (case insensitive)"), + OPT_STRING('f', "function", &function_str, "all", "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), @@ -60,6 +70,7 @@ struct bench_params { size_t size; size_t size_total; unsigned int nr_loops; + unsigned int page_shift; }; struct bench_mem_info { @@ -202,7 +213,8 @@ static void __bench_mem_function(struct bench_mem_info *info, struct bench_param if (r->fn.fini) r->fn.fini(info, p, &src, &dst); return; out_init_failed: - printf("# Memory allocation failed - maybe size (%s) is too large?\n", size_str); + printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); goto out_free; } @@ -210,6 +222,7 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * { int i; struct bench_params p = { 0 }; + unsigned int page_size; argc = parse_options(argc, argv, options, info->usage, 0); @@ -230,6 +243,15 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } p.size_total = p.size * p.nr_loops; + page_size = (unsigned int)perf_atoll((char *)page_size_str); + if (page_size != (1 << PAGE_SHIFT_4KB) && + page_size != (1 << PAGE_SHIFT_2MB) && + page_size != (1 << PAGE_SHIFT_1GB)) { + fprintf(stderr, "Invalid page-size:%s\n", page_size_str); + return 1; + } + p.page_shift = ilog2(page_size); + if (!strncmp(function_str, "all", 3)) { for (i = 0; info->functions[i].name; i++) __bench_mem_function(info, &p, i); @@ -286,11 +308,14 @@ static int do_memcpy(const struct function *r, struct bench_params *p, return 0; } -static void *bench_mmap(size_t size, bool populate) +static void *bench_mmap(size_t size, bool populate, unsigned int page_shift) { void *p; int extra = populate ? MAP_POPULATE : 0; + if (page_shift != PAGE_SHIFT_4KB) + extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT); + p = mmap(NULL, size, PROT_READ|PROT_WRITE, extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); @@ -308,11 +333,11 @@ static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p, { bool failed; - *dst = bench_mmap(p->size, true); + *dst = bench_mmap(p->size, true, p->page_shift); failed = *dst == NULL; if (info->alloc_src) { - *src = bench_mmap(p->size, true); + *src = bench_mmap(p->size, true, p->page_shift); failed = failed || *src == NULL; } -- 2.31.1 There can be a significant gap in memset/memcpy performance depending on the size of the region being operated on. With chunk-size=4kb: $ echo madvise > /sys/kernel/mm/transparent_hugepage/enabled $ perf bench mem memset -p 4kb -k 4kb -s 4gb -l 10 -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4gb bytes ... 13.011655 GB/sec With chunk-size=1gb: $ echo madvise > /sys/kernel/mm/transparent_hugepage/enabled $ perf bench mem memset -p 4kb -k 1gb -s 4gb -l 10 -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4gb bytes ... 21.936355 GB/sec So, allow the user to specify the chunk-size. The default value is identical to the total size of the region, which preserves current behaviour. 
Reviewed-by: Namhyung Kim Signed-off-by: Ankur Arora --- tools/perf/Documentation/perf-bench.txt | 10 ++++++++++ tools/perf/bench/mem-functions.c | 20 ++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 04cdc31a0b0b..3d1455d880c3 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -187,6 +187,11 @@ Available units are B, KB, MB, GB and TB (case insensitive). Specify page-size for mapping memory buffers (default: 4KB). Available values are 4KB, 2MB, 1GB (case insensitive). +-k:: +--chunk:: +Specify the chunk-size for each invocation. (default: 0, or full-extent) +Available units are B, KB, MB, GB and TB (case insensitive). + -f:: --function:: Specify function to copy (default: default). @@ -216,6 +221,11 @@ Available units are B, KB, MB, GB and TB (case insensitive). Specify page-size for mapping memory buffers (default: 4KB). Available values are 4KB, 2MB, 1GB (case insensitive). +-k:: +--chunk:: +Specify the chunk-size for each invocation. (default: 0, or full-extent) +Available units are B, KB, MB, GB and TB (case insensitive). + -f:: --function:: Specify function to set (default: default). diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 6aa1f02553ba..69968ba63d81 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -36,6 +36,7 @@ static const char *size_str = "1MB"; static const char *function_str = "all"; static const char *page_size_str = "4KB"; +static const char *chunk_size_str = "0"; static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; @@ -49,6 +50,10 @@ static const struct option options[] = { "Specify page-size for mapping memory buffers. " "Available sizes: 4KB, 2MB, 1GB (case insensitive)"), + OPT_STRING('k', "chunk", &chunk_size_str, "0", + "Specify the chunk-size for each invocation. 
" + "Available units: B, KB, MB, GB and TB (case insensitive)"), + OPT_STRING('f', "function", &function_str, "all", "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), @@ -69,6 +74,7 @@ union bench_clock { struct bench_params { size_t size; size_t size_total; + size_t chunk_size; unsigned int nr_loops; unsigned int page_shift; }; @@ -243,6 +249,14 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } p.size_total = p.size * p.nr_loops; + p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str); + if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) { + fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str); + return 1; + } + if (!p.chunk_size) + p.chunk_size = p.size; + page_size = (unsigned int)perf_atoll((char *)page_size_str); if (page_size != (1 << PAGE_SHIFT_4KB) && page_size != (1 << PAGE_SHIFT_2MB) && @@ -300,7 +314,8 @@ static int do_memcpy(const struct function *r, struct bench_params *p, clock_get(&start); for (unsigned int i = 0; i < p->nr_loops; ++i) - fn(dst, src, p->size); + for (size_t off = 0; off < p->size; off += p->chunk_size) + fn(dst + off, src + off, min(p->chunk_size, p->size - off)); clock_get(&end); *rt = clock_diff(&start, &end); @@ -402,7 +417,8 @@ static int do_memset(const struct function *r, struct bench_params *p, clock_get(&start); for (unsigned int i = 0; i < p->nr_loops; ++i) - fn(dst, i, p->size); + for (size_t off = 0; off < p->size; off += p->chunk_size) + fn(dst + off, i, min(p->chunk_size, p->size - off)); clock_get(&end); *rt = clock_diff(&start, &end); -- 2.31.1 Split mem benchmark options into common and memset/memcpy specific. Reviewed-by: Namhyung Kim Signed-off-by: Ankur Arora --- tools/perf/bench/mem-functions.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 69968ba63d81..2a23bed8c2d3 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -41,7 +41,7 @@ static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; -static const struct option options[] = { +static const struct option bench_common_options[] = { OPT_STRING('s', "size", &size_str, "1MB", "Specify the size of the memory buffers. " "Available units: B, KB, MB, GB and TB (case insensitive)"), @@ -50,10 +50,6 @@ static const struct option options[] = { "Specify page-size for mapping memory buffers. " "Available sizes: 4KB, 2MB, 1GB (case insensitive)"), - OPT_STRING('k', "chunk", &chunk_size_str, "0", - "Specify the chunk-size for each invocation. " - "Available units: B, KB, MB, GB and TB (case insensitive)"), - OPT_STRING('f', "function", &function_str, "all", "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), @@ -66,6 +62,14 @@ static const struct option options[] = { OPT_END() }; +static const struct option bench_mem_options[] = { + OPT_STRING('k', "chunk", &chunk_size_str, "0", + "Specify the chunk-size for each invocation. 
" + "Available units: B, KB, MB, GB and TB (case insensitive)"), + OPT_PARENT(bench_common_options), + OPT_END() +}; + union bench_clock { u64 cycles; struct timeval tv; @@ -84,6 +88,7 @@ struct bench_mem_info { int (*do_op)(const struct function *r, struct bench_params *p, void *src, void *dst, union bench_clock *rt); const char *const *usage; + const struct option *options; bool alloc_src; }; @@ -230,7 +235,7 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * struct bench_params p = { 0 }; unsigned int page_size; - argc = parse_options(argc, argv, options, info->usage, 0); + argc = parse_options(argc, argv, info->options, info->usage, 0); if (use_cycles) { i = init_cycles(); @@ -397,6 +402,7 @@ int bench_mem_memcpy(int argc, const char **argv) .functions = memcpy_functions, .do_op = do_memcpy, .usage = bench_mem_memcpy_usage, + .options = bench_mem_options, .alloc_src = true, }; @@ -454,6 +460,7 @@ int bench_mem_memset(int argc, const char **argv) .functions = memset_functions, .do_op = do_memset, .usage = bench_mem_memset_usage, + .options = bench_mem_options, }; return bench_mem_common(argc, argv, &info); -- 2.31.1 Add two mmap() workloads: one that eagerly populates a region and another that demand faults it in. The intent is to probe the memory subsytem performance incurred by mmap(). $ perf bench mem mmap -s 4gb -p 4kb -l 10 -f populate # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated map()) # Copying 4gb bytes ... 1.811691 GB/sec $ perf bench mem mmap -s 4gb -p 2mb -l 10 -f populate # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated mmap()) # Copying 4gb bytes ... 12.272017 GB/sec $ perf bench mem mmap -s 4gb -p 1gb -l 10 -f populate # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated mmap()) # Copying 4gb bytes ... 17.085927 GB/sec Signed-off-by: Ankur Arora --- tools/perf/Documentation/perf-bench.txt | 34 +++++++++ tools/perf/bench/bench.h | 1 + tools/perf/bench/mem-functions.c | 96 +++++++++++++++++++++++++ tools/perf/builtin-bench.c | 1 + 4 files changed, 132 insertions(+) diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 3d1455d880c3..1160224cb718 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -240,6 +240,40 @@ Repeat memset invocation this number of times. --cycles:: Use perf's cpu-cycles event instead of gettimeofday syscall. +*mmap*:: +Suite for evaluating memory subsystem performance for mmap()'d memory. + +Options of *mmap* +^^^^^^^^^^^^^^^^^ +-s:: +--size:: +Specify size of memory to set (default: 1MB). +Available units are B, KB, MB, GB and TB (case insensitive). + +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + +-r:: +--randomize:: +Specify seed to randomize page access offset (default: 0, or not randomized). + +-f:: +--function:: +Specify function to set (default: all). +Available functions are 'demand' and 'populate', with the first +demand faulting pages in the region and the second using an eager +mapping. + +-l:: +--nr_loops:: +Repeat mmap() invocation this number of times. + +-c:: +--cycles:: +Use perf's cpu-cycles event instead of gettimeofday syscall. 
+ SUITES FOR 'numa' ~~~~~~~~~~~~~~~~~ *mem*:: diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 9f736423af53..8519eb5a42fa 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -28,6 +28,7 @@ int bench_syscall_fork(int argc, const char **argv); int bench_syscall_execve(int argc, const char **argv); int bench_mem_memcpy(int argc, const char **argv); int bench_mem_memset(int argc, const char **argv); +int bench_mem_mmap(int argc, const char **argv); int bench_mem_find_bit(int argc, const char **argv); int bench_futex_hash(int argc, const char **argv); int bench_futex_wake(int argc, const char **argv); diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 2a23bed8c2d3..2908a3a796c9 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -40,6 +40,7 @@ static const char *chunk_size_str = "0"; static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; +static unsigned int seed; static const struct option bench_common_options[] = { OPT_STRING('s', "size", &size_str, "1MB", @@ -81,6 +82,7 @@ struct bench_params { size_t chunk_size; unsigned int nr_loops; unsigned int page_shift; + unsigned int seed; }; struct bench_mem_info { @@ -98,6 +100,7 @@ typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *, void **, void **); typedef void *(*memcpy_t)(void *, const void *, size_t); typedef void *(*memset_t)(void *, int, size_t); +typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool); struct function { const char *name; @@ -108,6 +111,7 @@ struct function { union { memcpy_t memcpy; memset_t memset; + mmap_op_t mmap_op; }; } fn; }; @@ -160,6 +164,14 @@ static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e) return t; } +static void clock_accum(union bench_clock *a, union bench_clock *b) +{ + if (use_cycles) + a->cycles += b->cycles; + else + timeradd(&a->tv, &b->tv, &a->tv); +} + static double timeval2double(struct timeval *ts) { return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC; @@ -271,6 +283,8 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } p.page_shift = ilog2(page_size); + p.seed = seed; + if (!strncmp(function_str, "all", 3)) { for (i = 0; info->functions[i].name; i++) __bench_mem_function(info, &p, i); @@ -465,3 +479,85 @@ int bench_mem_memset(int argc, const char **argv) return bench_mem_common(argc, argv, &info); } + +static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random) +{ + unsigned long npages = size / (1 << page_shift); + unsigned long offset = 0, r = 0; + + for (unsigned long i = 0; i < npages; i++) { + if (random) + r = rand() % (1 << page_shift); + + *((char *)dst + offset + r) = *(char *)(dst + offset + r) + i; + offset += 1 << page_shift; + } +} + +static int do_mmap(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst __maybe_unused, + union bench_clock *accum) +{ + union bench_clock start, end, diff; + mmap_op_t fn = r->fn.mmap_op; + bool populate = strcmp(r->name, "populate") == 0; + + if (p->seed) + srand(p->seed); + + for (unsigned int i = 0; i < p->nr_loops; i++) { + clock_get(&start); + dst = bench_mmap(p->size, populate, p->page_shift); + if (!dst) + goto out; + + fn(dst, p->size, p->page_shift, p->seed); + clock_get(&end); + diff = clock_diff(&start, &end); + clock_accum(accum, &diff); + + bench_munmap(dst, p->size); + } + + return 0; +out: + printf("# Memory allocation failed - 
maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); + return -1; +} + +static const char * const bench_mem_mmap_usage[] = { + "perf bench mem mmap ", + NULL +}; + +static const struct function mmap_functions[] = { + { .name = "demand", + .desc = "Demand loaded mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = "populate", + .desc = "Eagerly populated mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = NULL, } +}; + +int bench_mem_mmap(int argc, const char **argv) +{ + static const struct option bench_mmap_options[] = { + OPT_UINTEGER('r', "randomize", &seed, + "Seed to randomize page access offset."), + OPT_PARENT(bench_common_options), + OPT_END() + }; + + struct bench_mem_info info = { + .functions = mmap_functions, + .do_op = do_mmap, + .usage = bench_mem_mmap_usage, + .options = bench_mmap_options, + }; + + return bench_mem_common(argc, argv, &info); +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 2c1a9f3d847a..02dea1b88228 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -65,6 +65,7 @@ static struct bench mem_benchmarks[] = { { "memcpy", "Benchmark for memcpy() functions", bench_mem_memcpy }, { "memset", "Benchmark for memset() functions", bench_mem_memset }, { "find_bit", "Benchmark for find_bit() functions", bench_mem_find_bit }, + { "mmap", "Benchmark for mmap() mappings", bench_mem_mmap }, { "all", "Run all memory access benchmarks", NULL }, { NULL, NULL, NULL } }; -- 2.31.1 clear_page_rep() and clear_page_erms() are wrappers around "REP; STOS" variations. Inlining gets rid of an unnecessary CALL/RET (which isn't free when using RETHUNK speculative execution mitigations.) Fixup and rename clear_page_orig() to adapt to the changed calling convention. Also add a comment from Dave Hansen detailing various clearing mechanisms used in clear_page(). Signed-off-by: Ankur Arora --- arch/x86/include/asm/page_32.h | 6 +++++ arch/x86/include/asm/page_64.h | 42 ++++++++++++++++++++++++++-------- arch/x86/lib/clear_page_64.S | 39 +++++++------------------------ 3 files changed, 46 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 0c623706cb7e..19fddb002cc9 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -17,6 +17,12 @@ extern unsigned long __phys_addr(unsigned long); #include +/** + * clear_page() - clear a page using a kernel virtual address. + * @page: address of kernel page + * + * Does absolutely no exception handling. + */ static inline void clear_page(void *page) { memset(page, 0, PAGE_SIZE); diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 015d23f3e01f..17b6ae89e211 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -40,23 +40,45 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -void clear_page_orig(void *page); -void clear_page_rep(void *page); -void clear_page_erms(void *page); +void memzero_page_aligned_unrolled(void *addr, u64 len); +/** + * clear_page() - clear a page using a kernel virtual address. + * @page: address of kernel page + * + * Switch between three implementations of page clearing based on CPU + * capabilities: + * + * - memzero_page_aligned_unrolled(): the oldest, slowest and universally + * supported method. Zeroes via 8-byte MOV instructions unrolled 8x + * to write a 64-byte cacheline in each loop iteration.. 
+ * + * - "rep stosq": really old CPUs had crummy REP implementations. + * Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be + * trusted. The instruction writes 8-byte per REP iteration but + * CPUs can internally batch these together and do larger writes. + * + * - "rep stosb": CPUs that enumerate 'ERMS' have an improved STOS + * implementation that is less picky about alignment and where + * STOSB (1-byte at a time) is actually faster than STOSQ (8-bytes + * at a time.) + * + * Does absolutely no exception handling. + */ static inline void clear_page(void *page) { + u64 len = PAGE_SIZE; /* * Clean up KMSAN metadata for the page being cleared. The assembly call * below clobbers @page, so we perform unpoisoning before it. */ - kmsan_unpoison_memory(page, PAGE_SIZE); - alternative_call_2(clear_page_orig, - clear_page_rep, X86_FEATURE_REP_GOOD, - clear_page_erms, X86_FEATURE_ERMS, - "=D" (page), - "D" (page), - "cc", "memory", "rax", "rcx"); + kmsan_unpoison_memory(page, len); + asm volatile(ALTERNATIVE_2("call memzero_page_aligned_unrolled", + "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD, + "rep stosb", X86_FEATURE_ERMS) + : "+c" (len), "+D" (page), ASM_CALL_CONSTRAINT + : "a" (0) + : "cc", "memory"); } void copy_page(void *to, void *from); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a508e4a8c66a..27debe0c018c 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -6,30 +6,15 @@ #include /* - * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is - * recommended to use this when possible and we do use them by default. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original. + * Zero page aligned region. + * %rdi - dest + * %rcx - length */ - -/* - * Zero a page. - * %rdi - page - */ -SYM_TYPED_FUNC_START(clear_page_rep) - movl $4096/8,%ecx - xorl %eax,%eax - rep stosq - RET -SYM_FUNC_END(clear_page_rep) -EXPORT_SYMBOL_GPL(clear_page_rep) - -SYM_TYPED_FUNC_START(clear_page_orig) - xorl %eax,%eax - movl $4096/64,%ecx +SYM_TYPED_FUNC_START(memzero_page_aligned_unrolled) + shrq $6, %rcx .p2align 4 .Lloop: - decl %ecx + decq %rcx #define PUT(x) movq %rax,x*8(%rdi) movq %rax,(%rdi) PUT(1) @@ -43,16 +28,8 @@ SYM_TYPED_FUNC_START(clear_page_orig) jnz .Lloop nop RET -SYM_FUNC_END(clear_page_orig) -EXPORT_SYMBOL_GPL(clear_page_orig) - -SYM_TYPED_FUNC_START(clear_page_erms) - movl $4096,%ecx - xorl %eax,%eax - rep stosb - RET -SYM_FUNC_END(clear_page_erms) -EXPORT_SYMBOL_GPL(clear_page_erms) +SYM_FUNC_END(memzero_page_aligned_unrolled) +EXPORT_SYMBOL_GPL(memzero_page_aligned_unrolled) /* * Default clear user-space. -- 2.31.1 Define fallback versions of clear_pages(), clear_user_pages(). In absence of architectural primitives, these just do straight clearing sequentially. Signed-off-by: Ankur Arora --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..b8c3f265b497 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3768,6 +3768,38 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef ARCH_PAGE_CONTIG_NR +#define PAGE_CONTIG_NR 1 +#else +#define PAGE_CONTIG_NR ARCH_PAGE_CONTIG_NR +#endif + +#ifndef clear_pages +/* + * clear_pages() - clear kernel page range. 
+ * @addr: start address of page range + * @npages: number of pages + * + * Assumes that (@addr, +@npages) references a kernel region. + * Like clear_page(), this does absolutely no exception handling. + */ +static inline void clear_pages(void *addr, unsigned int npages) +{ + for (int i = 0; i < npages; i++) + clear_page(addr + i * PAGE_SIZE); +} +#endif + +#ifndef clear_user_pages +static inline void clear_user_pages(void *addr, unsigned long vaddr, + struct page *pg, unsigned int npages) +{ + for (int i = 0; i < npages; i++) + clear_user_page(addr + i * PAGE_SIZE, + vaddr + i * PAGE_SIZE, pg + i); +} +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); -- 2.31.1 Define clear_user_highpages() which clears sequentially using the single page variant. With !CONFIG_HIGHMEM, pages are contiguous so use the range clearing primitive clear_user_pages(). Signed-off-by: Ankur Arora --- include/linux/highmem.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6234f316468c..eeb0b7bc0a22 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -207,6 +207,18 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif +#ifndef clear_user_highpages +static inline void clear_user_highpages(struct page *page, unsigned long vaddr, + unsigned int npages) +{ + if (!IS_ENABLED(CONFIG_HIGHMEM)) + clear_user_pages(page_address(page), vaddr, page, npages); + else + for (int i = 0; i < npages; i++) + clear_user_highpage(page+i, vaddr + i * PAGE_SIZE); +} +#endif + #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. -- 2.31.1 Change folio_zero_user() to clear contiguous page ranges instead of in the current page-at-a-time fashion. This, when exposed to the processor, allows it to optimize clearing based on the knowledge of the extent. However, clearing in large chunks can have two problems: - cache locality when clearing small folios (< MAX_ORDER_NR_PAGES) (larger folios don't have any expectation of cache locality). - preemption latency when clearing large folios. Handle the first by splitting the clearing in three parts: the faulting page and its immediate locality, its left and right regions; with the local neighbourhood cleared last. The second problem is relevant when running under cooperative preemption models. Limit the worst case preemption latency by clearing in architecture specified PAGE_CONTIG_NR units. Signed-off-by: Ankur Arora --- mm/memory.c | 82 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0ba4f6b71847..0f5b1900b480 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7021,40 +7021,80 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, - unsigned int nr_pages) +/* + * Clear contiguous pages chunking them up when running under + * non-preemptible models. + */ +static void clear_contig_highpages(struct page *page, unsigned long addr, + unsigned int npages) { - unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); - int i; + unsigned int i, count, unit; - might_sleep(); - for (i = 0; i < nr_pages; i++) { + unit = preempt_model_preemptible() ? 
npages : PAGE_CONTIG_NR; + + for (i = 0; i < npages; ) { + count = min(unit, npages - i); + clear_user_highpages(nth_page(page, i), + addr + i * PAGE_SIZE, count); + i += count; cond_resched(); - clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } -static int clear_subpage(unsigned long addr, int idx, void *arg) -{ - struct folio *folio = arg; - - clear_user_highpage(folio_page(folio, idx), addr); - return 0; -} - /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. - * @addr_hint: The address will be accessed or the base address if uncelar. + * @addr_hint: The address accessed by the user or the base address. + * + * Uses architectural support for clear_pages() to zero page extents + * instead of clearing page-at-a-time. + * + * Clearing of small folios (< MAX_ORDER_NR_PAGES) is split in three parts: + * pages in the immediate locality of the faulting page, and its left, right + * regions; the local neighbourhood cleared last in order to keep cache + * lines of the target region hot. + * + * For larger folios we assume that there is no expectation of cache locality + * and just do a straight zero. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned int nr_pages = folio_nr_pages(folio); + unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); + const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE; + const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1); + const int width = 2; /* number of pages cleared last on either side */ + struct range r[3]; + int i; - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) - clear_gigantic_page(folio, addr_hint, nr_pages); - else - process_huge_page(addr_hint, nr_pages, clear_subpage, folio); + if (folio_nr_pages(folio) > MAX_ORDER_NR_PAGES) { + clear_contig_highpages(folio_page(folio, 0), + base_addr, folio_nr_pages(folio)); + return; + } + + /* + * Faulting page and its immediate neighbourhood. Cleared at the end to + * ensure it sticks around in the cache. + */ + r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - width, pg.start, pg.end), + clamp_t(s64, fault_idx + width, pg.start, pg.end)); + + /* Region to the left of the fault */ + r[1] = DEFINE_RANGE(pg.start, + clamp_t(s64, r[2].start-1, pg.start-1, r[2].start)); + + /* Region to the right of the fault: always valid for the common fault_idx=0 case. */ + r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end+1, r[2].end, pg.end+1), + pg.end); + + for (i = 0; i <= 2; i++) { + unsigned int npages = range_len(&r[i]); + struct page *page = folio_page(folio, r[i].start); + unsigned long addr = base_addr + folio_page_idx(folio, page) * PAGE_SIZE; + + if (npages > 0) + clear_contig_highpages(page, addr, npages); + } } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, -- 2.31.1 Performance when clearing with string instructions (x86-64-stosq and similar) can vary significantly based on the chunk-size used. $ perf bench mem memset -k 4KB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 13.748208 GB/sec $ perf bench mem memset -k 2MB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in # arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 
15.067900 GB/sec $ perf bench mem memset -k 1GB -s 4GB -f x86-64-stosq # Running 'mem/memset' benchmark: # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S) # Copying 4GB bytes ... 38.104311 GB/sec (Both on AMD Milan.) With a change in chunk-size from 4KB to 1GB, we see the performance go from 13.7 GB/sec to 38.1 GB/sec. For a chunk-size of 2MB the change isn't quite as drastic but it is worth adding a clear_page() variant that can handle contiguous page-extents. Define clear_user_pages() while at it. Signed-off-by: Ankur Arora --- arch/x86/include/asm/page_64.h | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 17b6ae89e211..289b31a4c910 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -43,8 +43,11 @@ extern unsigned long __phys_addr_symbol(unsigned long); void memzero_page_aligned_unrolled(void *addr, u64 len); /** - * clear_page() - clear a page using a kernel virtual address. - * @page: address of kernel page + * clear_pages() - clear a page range using a kernel virtual address. + * @addr: start address + * @npages: number of pages + * + * Assumes that (@addr, +@npages) references a kernel region. * * Switch between three implementations of page clearing based on CPU * capabilities: @@ -65,21 +68,35 @@ void memzero_page_aligned_unrolled(void *addr, u64 len); * * Does absolutely no exception handling. */ -static inline void clear_page(void *page) +static inline void clear_pages(void *addr, unsigned int npages) { - u64 len = PAGE_SIZE; + u64 len = npages * PAGE_SIZE; /* - * Clean up KMSAN metadata for the page being cleared. The assembly call - * below clobbers @page, so we perform unpoisoning before it. + * Clean up KMSAN metadata for the pages being cleared. The assembly call - * below clobbers @addr, so we perform unpoisoning before it. */ - kmsan_unpoison_memory(page, len); + kmsan_unpoison_memory(addr, len); asm volatile(ALTERNATIVE_2("call memzero_page_aligned_unrolled", "shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD, "rep stosb", X86_FEATURE_ERMS) - : "+c" (len), "+D" (page), ASM_CALL_CONSTRAINT + : "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT : "a" (0) : "cc", "memory"); } +#define clear_pages clear_pages + +struct page; +static inline void clear_user_pages(void *page, unsigned long vaddr, + struct page *pg, unsigned int npages) +{ + clear_pages(page, npages); +} +#define clear_user_pages clear_user_pages + +static inline void clear_page(void *addr) +{ + clear_pages(addr, 1); +} void copy_page(void *to, void *from); KCFI_REFERENCE(copy_page); -- 2.31.1 Define ARCH_PAGE_CONTIG_NR which is used by folio_zero_user() to decide the maximum contiguous page range to be zeroed when running under cooperative preemption models. This allows the processor -- when using string instructions (REP; STOS) -- to optimize based on the size of the region. The resultant performance depends on the kinds of optimizations available to the microarch for the region being cleared. Two classes of optimizations: - clearing iteration costs can be amortized over a range larger than a single page. - cacheline allocation elision (seen on AMD Zen models). Testing a demand fault workload shows an improved baseline from the first optimization and a larger improvement when the region being cleared is large enough for the second optimization. 
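(The demand fault workload here is the 'perf bench mem mmap -f demand' benchmark added earlier in this series. Per iteration it reduces to roughly the following sketch, where size and pg_sz stand for the -s and -p parameters and MAP_HUGETLB applies only to the hugepage cases:

	char *p = mmap(NULL, size, PROT_READ|PROT_WRITE,
		       MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0);
	for (size_t off = 0; off < size; off += pg_sz)
		p[off]++;	/* each touch faults in, and zeroes, one page */
	munmap(p, size);

so the measured bandwidth is dominated by folio_zero_user() zeroing each freshly faulted page.)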
AMD Milan (EPYC 7J13, boost=0, region=64GB on the local NUMA node):

$ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5

                  mm/folio_zero_user     x86/folio_zero_user       change
                  (GB/s  +- %stdev)      (GB/s  +- %stdev)

 pg-sz=2MB        11.82  +- 0.67%        16.48  +-  0.30%         + 39.4%   preempt=*
 pg-sz=1GB        17.14  +- 1.39%        17.42  +-  0.98% [#]     +  1.6%   preempt=none|voluntary
 pg-sz=1GB        17.51  +- 1.19%        43.23  +-  5.22%         +146.8%   preempt=full|lazy

[#] Milan uses a threshold of LLC-size (~32MB) for eliding cacheline
allocation, which is larger than ARCH_PAGE_CONTIG_NR, so
preempt=none|voluntary see no improvement for pg-sz=1GB.

The improvement due to the CPU eliding cacheline allocation for
pg-sz=1GB can be seen in the reduced L1-dcache-loads:

-  44,513,459,667   cycles                 #   2.420 GHz       ( +- 0.44% )  (35.71%)
-   1,378,032,592   instructions           #   0.03  insn per cycle
-  11,224,288,082   L1-dcache-loads        # 610.187 M/sec     ( +- 0.08% )  (35.72%)
-   5,373,473,118   L1-dcache-load-misses  #  47.87% of all L1-dcache accesses  ( +- 0.00% )  (35.71%)

+  20,093,219,076   cycles                 #   2.421 GHz       ( +- 3.64% )  (35.69%)
+   1,378,032,592   instructions           #   0.03  insn per cycle
+     186,525,095   L1-dcache-loads        #  22.479 M/sec     ( +- 2.11% )  (35.74%)
+      73,479,687   L1-dcache-load-misses  #  39.39% of all L1-dcache accesses  ( +- 3.03% )  (35.74%)

Also as mentioned earlier, the baseline improvement is not specific to
AMD Zen*. Intel Icelakex (pg-sz=2MB|1GB) sees a similar improvement as
the Milan pg-sz=2MB workload above (~35%).

Signed-off-by: Ankur Arora
---
 arch/x86/include/asm/page_64.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 289b31a4c910..2361066d175e 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -40,6 +40,13 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 
 #define __phys_reloc_hide(x)	(x)
 
+/*
+ * When running under voluntary preemption models, limit the max extent
+ * being cleared to pages worth 8MB. With a clearing BW of ~10GBps, this
+ * should result in worst case scheduling latency of ~1ms.
+ */
+#define ARCH_PAGE_CONTIG_NR	(8 << (20 - PAGE_SHIFT))
+
 void memzero_page_aligned_unrolled(void *addr, u64 len);
 /**
-- 
2.31.1
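Spelling out the ARCH_PAGE_CONTIG_NR arithmetic for x86-64's 4KB base pages (PAGE_SHIFT = 12):

	8 << (20 - 12) = 8 << 8 = 2048 pages
	2048 pages * 4KB = 8MB per clear_pages() extent
	8MB at ~10GB/s ~= 0.8ms between cond_resched() points

consistent with the ~1ms worst-case scheduling latency targeted in the comment above.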