Add two mmap() workloads: one that eagerly populates a region and another that demand faults it in. The intent is to probe the memory subsystem performance incurred by mmap(). $ perf bench mem mmap -s 4gb -p 4kb -l 10 -f populate # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated mmap()) # Copying 4gb bytes ... 1.811691 GB/sec $ perf bench mem mmap -s 4gb -p 2mb -l 10 -f populate # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated mmap()) # Copying 4gb bytes ... 12.272017 GB/sec $ perf bench mem mmap -s 4gb -p 1gb -l 10 -f populate # Running 'mem/mmap' benchmark: # function 'populate' (Eagerly populated mmap()) # Copying 4gb bytes ... 17.085927 GB/sec Signed-off-by: Ankur Arora --- tools/perf/Documentation/perf-bench.txt | 34 +++++++++ tools/perf/bench/bench.h | 1 + tools/perf/bench/mem-functions.c | 96 +++++++++++++++++++++++++ tools/perf/builtin-bench.c | 1 + 4 files changed, 132 insertions(+) diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 3d1455d880c3..1160224cb718 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -240,6 +240,40 @@ Repeat memset invocation this number of times. --cycles:: Use perf's cpu-cycles event instead of gettimeofday syscall. +*mmap*:: +Suite for evaluating memory subsystem performance for mmap()'d memory. + +Options of *mmap* +^^^^^^^^^^^^^^^^^ +-s:: +--size:: +Specify size of memory to map (default: 1MB). +Available units are B, KB, MB, GB and TB (case insensitive). + +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + +-r:: +--randomize:: +Specify seed to randomize page access offset (default: 0, or not randomized). + +-f:: +--function:: +Specify function to run (default: all). 
+Available functions are 'demand' and 'populate', with the first +demand faulting pages in the region and the second using an eager +mapping. + +-l:: +--nr_loops:: +Repeat mmap() invocation this number of times. + +-c:: +--cycles:: +Use perf's cpu-cycles event instead of gettimeofday syscall. + SUITES FOR 'numa' ~~~~~~~~~~~~~~~~~ *mem*:: diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 9f736423af53..8519eb5a42fa 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -28,6 +28,7 @@ int bench_syscall_fork(int argc, const char **argv); int bench_syscall_execve(int argc, const char **argv); int bench_mem_memcpy(int argc, const char **argv); int bench_mem_memset(int argc, const char **argv); +int bench_mem_mmap(int argc, const char **argv); int bench_mem_find_bit(int argc, const char **argv); int bench_futex_hash(int argc, const char **argv); int bench_futex_wake(int argc, const char **argv); diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 2a23bed8c2d3..2908a3a796c9 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -40,6 +40,7 @@ static const char *chunk_size_str = "0"; static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; +static unsigned int seed; static const struct option bench_common_options[] = { OPT_STRING('s', "size", &size_str, "1MB", @@ -81,6 +82,7 @@ struct bench_params { size_t chunk_size; unsigned int nr_loops; unsigned int page_shift; + unsigned int seed; }; struct bench_mem_info { @@ -98,6 +100,7 @@ typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *, void **, void **); typedef void *(*memcpy_t)(void *, const void *, size_t); typedef void *(*memset_t)(void *, int, size_t); +typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool); struct function { const char *name; @@ -108,6 +111,7 @@ struct function { union { memcpy_t memcpy; memset_t memset; + mmap_op_t mmap_op; }; } fn; }; @@ -160,6 
+164,14 @@ static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e) return t; } +static void clock_accum(union bench_clock *a, union bench_clock *b) /* Accumulate b into a: adds the cycle counts when use_cycles is set, otherwise adds the two timevals with timeradd(). */ +{ + if (use_cycles) + a->cycles += b->cycles; + else + timeradd(&a->tv, &b->tv, &a->tv); +} + static double timeval2double(struct timeval *ts) { return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC; @@ -271,6 +283,8 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } p.page_shift = ilog2(page_size); + p.seed = seed; /* copy the file-scope seed into the per-run parameters */ + if (!strncmp(function_str, "all", 3)) { for (i = 0; info->functions[i].name; i++) __bench_mem_function(info, &p, i); @@ -465,3 +479,85 @@ int bench_mem_memset(int argc, const char **argv) return bench_mem_common(argc, argv, &info); } + +static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random) /* Read-modify-write one byte in every (1 << page_shift)-byte page of dst; with random set the offset of that byte inside the page comes from rand(), otherwise the first byte of the page is used. */ +{ + unsigned long npages = size / (1 << page_shift); /* integer division: a trailing partial page is not touched */ + unsigned long offset = 0, r = 0; + + for (unsigned long i = 0; i < npages; i++) { + if (random) + r = rand() % (1 << page_shift); /* new in-page offset for each page */ + + *((char *)dst + offset + r) = *(char *)(dst + offset + r) + i; /* NOTE(review): the right-hand side does arithmetic on a void pointer before the cast, which is a GNU C extension rather than ISO C */ + offset += 1 << page_shift; + } +} + +static int do_mmap(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst __maybe_unused, + union bench_clock *accum) /* Runs p->nr_loops iterations of map + page touch, adding the elapsed time of each iteration to accum. Returns 0 on success, or -1 after printing a message when bench_mmap() returns NULL. The incoming dst value is ignored and overwritten. */ +{ + union bench_clock start, end, diff; + mmap_op_t fn = r->fn.mmap_op; + bool populate = strcmp(r->name, "populate") == 0; /* both mmap_functions entries share the same op; only the name selects eager population */ + + if (p->seed) + srand(p->seed); /* seed rand() only when a non-zero seed was given */ + + for (unsigned int i = 0; i < p->nr_loops; i++) { + clock_get(&start); /* the measured interval covers the mapping call and the page touches */ + dst = bench_mmap(p->size, populate, p->page_shift); + if (!dst) + goto out; + + fn(dst, p->size, p->page_shift, p->seed); /* a non-zero seed converts to random == true */ + clock_get(&end); + diff = clock_diff(&start, &end); + clock_accum(accum, &diff); + + bench_munmap(dst, p->size); /* runs after clock_get(&end), so unmapping is excluded from the measured interval */ + } + + return 0; +out: + printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? 
"has insufficient hugepages" : "is too large"); + return -1; +} + +static const char * const bench_mem_mmap_usage[] = { + "perf bench mem mmap ", /* NOTE(review): the string ends in a space; an options placeholder may have been lost here - confirm against the other mem usage strings */ + NULL +}; + +static const struct function mmap_functions[] = { /* NULL-name terminated table of the selectable mmap workloads */ + { .name = "demand", + .desc = "Demand loaded mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = "populate", + .desc = "Eagerly populated mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = NULL, } +}; + +int bench_mem_mmap(int argc, const char **argv) /* Entry point of the mmap benchmark: adds the -r/--randomize option on top of bench_common_options and hands off to bench_mem_common(), whose return value is passed through. */ +{ + static const struct option bench_mmap_options[] = { + OPT_UINTEGER('r', "randomize", &seed, + "Seed to randomize page access offset."), + OPT_PARENT(bench_common_options), + OPT_END() + }; + + struct bench_mem_info info = { + .functions = mmap_functions, + .do_op = do_mmap, + .usage = bench_mem_mmap_usage, + .options = bench_mmap_options, + }; + + return bench_mem_common(argc, argv, &info); +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 2c1a9f3d847a..02dea1b88228 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -65,6 +65,7 @@ static struct bench mem_benchmarks[] = { { "memcpy", "Benchmark for memcpy() functions", bench_mem_memcpy }, { "memset", "Benchmark for memset() functions", bench_mem_memset }, { "find_bit", "Benchmark for find_bit() functions", bench_mem_find_bit }, + { "mmap", "Benchmark for mmap() mappings", bench_mem_mmap }, { "all", "Run all memory access benchmarks", NULL }, { NULL, NULL, NULL } }; -- 2.31.1