The VMA count limit check in do_mmap() and do_brk_flags() uses a strict inequality (>), which allows a process's VMA count to exceed the configured sysctl_max_map_count limit by one. A process with mm->map_count == sysctl_max_map_count will incorrectly pass this check and then exceed the limit upon allocation of a new VMA when its map_count is incremented. Other VMA allocation paths, such as split_vma(), already use the correct, inclusive (>=) comparison. Fix this bug by changing the comparison to be inclusive in do_mmap() and do_brk_flags(), bringing them in line with the correct behavior of other allocation paths. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Minchan Kim Cc: Pedro Falcato Reviewed-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Acked-by: SeongJae Park Signed-off-by: Kalesh Singh --- Changes in v3: - Collect Reviewed-by and Acked-by tags. Changes in v2: - Fix mmap check, per Pedro mm/mmap.c | 2 +- mm/vma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 644f02071a41..da2cbdc0f87b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? 
*/ - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; /* diff --git a/mm/vma.c b/mm/vma.c index a2e1ae954662..fba68f13e628 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2797,7 +2797,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) -- 2.51.0.760.g7b8bcc2412-goog Add a new selftest to verify that the max VMA count limit is correctly enforced. This test suite checks that various VMA operations (mmap, mprotect, munmap, mremap) succeed or fail as expected when the number of VMAs is close to the sysctl_max_map_count limit. The test works by first creating a large number of VMAs to bring the process close to the limit, and then performing various operations that may or may not create new VMAs. The test then verifies that the operations that would exceed the limit fail, and that the operations that do not exceed the limit succeed. NOTE: munmap is special as it's allowed to temporarily exceed the limit by one for splits as this will decrease back to the limit once the unmap succeeds. Cc: Andrew Morton Cc: David Hildenbrand Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Minchan Kim Cc: Pedro Falcato Signed-off-by: Kalesh Singh --- Changes in v3: - Rewrite test using kselftest harness, per Lorenzo - Update test diagram to be vertical so as to not exceed 80 chars, per Lorenzo - Use vm_util.h helpers, per Lorenzo - Update .gitignore, per Lorenzo - Add max_vma_count_tests to MEMORY MAPPING section in MAINTAINERS, per Lorenzo - Remove /proc/*/maps debugging prints and globals, per Lorenzo - rename guard regions to holes to avoid confusion with VMA guard regions, per David Changes in v2: - Add tests, per Liam (note that the do_brk_flags() path is not easily tested from userspace, so it's not included here). Exceeding the limit there should be uncommon. MAINTAINERS | 1 + tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/max_vma_count_tests.c | 672 ++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 5 + 5 files changed, 680 insertions(+) create mode 100644 tools/testing/selftests/mm/max_vma_count_tests.c diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968..aa83e5893e16 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16580,6 +16580,7 @@ F: mm/vma.h F: mm/vma_exec.c F: mm/vma_init.c F: mm/vma_internal.h +F: tools/testing/selftests/mm/max_vma_count_tests.c F: tools/testing/selftests/mm/merge.c F: tools/testing/vma/ diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c2a8586e51a1..010f1bced5b9 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -10,6 +10,7 @@ hugetlb-soft-offline khugepaged map_hugetlb map_populate +max_vma_count_tests thuge-gen compaction_test migration diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eaf9312097f7..4f0b03cdece5 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -93,6 +93,7 @@ TEST_GEN_FILES += transhuge-stress 
TEST_GEN_FILES += uffd-stress TEST_GEN_FILES += uffd-unit-tests TEST_GEN_FILES += uffd-wp-mremap +TEST_GEN_FILES += max_vma_count_tests TEST_GEN_FILES += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests diff --git a/tools/testing/selftests/mm/max_vma_count_tests.c b/tools/testing/selftests/mm/max_vma_count_tests.c new file mode 100644 index 000000000000..2c69c6986633 --- /dev/null +++ b/tools/testing/selftests/mm/max_vma_count_tests.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google LLC + */ +#define _GNU_SOURCE + +#include +#include /* Definition of PR_* constants */ +#include +#include +#include +#include +#include +#include +#include + +#define TH_LOG_ENABLED 0 +#include "../kselftest_harness.h" +#include "vm_util.h" + +#define DEFAULT_MAX_MAP_COUNT 65530 +#define TEST_AREA_NR_PAGES 3 +#define TEST_AREA_PROT (PROT_NONE) +#define EXTRA_MAP_PROT (PROT_NONE) + +/* mremap accounts for the worst case to fail early */ +#define MREMAP_REQUIRED_VMA_SLOTS 6 + +FIXTURE(max_vma_count) { + int max_vma_count; + int original_max_vma_count; + int test_area_size; + int nr_extra_maps; + char *test_area; + char *extra_maps; +}; + +/* To keep checkpatch happy */ +#define max_vma_count_data_t FIXTURE_DATA(max_vma_count) + +static int get_max_vma_count(void); +static bool set_max_vma_count(int val); +static int get_current_vma_count(void); +static bool is_test_area_mapped(char *test_area, int test_area_size); +static bool lower_max_map_count_if_needed(max_vma_count_data_t *self, + struct __test_metadata *_metadata); +static void restore_max_map_count_if_needed(max_vma_count_data_t *self, + struct __test_metadata *_metadata); +static bool free_vma_slots(max_vma_count_data_t *self, int slots_to_free); +static void create_reservation(max_vma_count_data_t *self, + struct __test_metadata *_metadata); +static void create_extra_maps(max_vma_count_data_t *self, + struct __test_metadata *_metadata); + +/** + * 
FIXTURE_SETUP - Sets up the VMA layout for max VMA count testing. + * + * Sets up a specific VMA layout to test behavior near the max_vma_count limit. + * A large memory area is reserved and then unmapped to create a contiguous + * address space. Mappings are then created within this space. + * + * The layout is as follows (addresses increase downwards): + * + * base_addr --> +----------------------+ + * | Hole (1 page) | + * +----------------------+ + * TEST_AREA --> | TEST_AREA | + * | (unmapped, 3 pages) | + * +----------------------+ + * | Hole (1 page) | + * +----------------------+ + * EXTRA_MAPS --> | Extra Map 1 (1 page) | + * +----------------------+ + * | Hole (1 page) | + * +----------------------+ + * | Extra Map 2 (1 page) | + * +----------------------+ + * | ... | + * +----------------------+ + * | Extra Map N (1 page) | + * +----------------------+ + * | Hole (1 page) | + * +----------------------+ + * + * "Holes" are unmapped, 1-page gaps used to isolate mappings. + * The number of "Extra Maps" is calculated to bring the total VMA count + * to MAX_VMA_COUNT - 1. + * + * Populates TEST_AREA and other globals required for the tests. + * + * Return: true on success, false on failure. + */ +FIXTURE_SETUP(max_vma_count) +{ + int initial_vma_count; + + TH_LOG("Setting up vma_max_count test ..."); + + self->test_area_size = TEST_AREA_NR_PAGES * psize(); + + if (!lower_max_map_count_if_needed(self, _metadata)) { + SKIP(return, + "max_map_count too high and cannot be lowered. 
Please rerun as root."); + } + + initial_vma_count = get_current_vma_count(); + ASSERT_GT(initial_vma_count, 0); + + self->nr_extra_maps = self->max_vma_count - 1 - initial_vma_count; + if (self->nr_extra_maps < 1) { + SKIP(return, + "Not enough available maps to run test (max: %d, current: %d)", + self->max_vma_count, initial_vma_count); + } + + create_reservation(self, _metadata); + create_extra_maps(self, _metadata); + + ASSERT_EQ(get_current_vma_count(), self->max_vma_count - 1); + TH_LOG("vma_max_count test setup done."); +} + +FIXTURE_TEARDOWN(max_vma_count) +{ + /* + * NOTE: Each test is run in a separate process; we leave + * mapping cleanup to process teardown for simplicity. + */ + + restore_max_map_count_if_needed(self, _metadata); +} + +static bool mmap_anon(max_vma_count_data_t *self) +{ + void *addr = mmap(NULL, psize(), PROT_READ, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + return addr != MAP_FAILED; +} + +static inline bool __mprotect(char *addr, int size) +{ + int new_prot = ~TEST_AREA_PROT & (PROT_READ | PROT_WRITE | PROT_EXEC); + + return mprotect(addr, size, new_prot) == 0; +} + +static bool mprotect_nosplit(max_vma_count_data_t *self) +{ + return __mprotect(self->test_area, self->test_area_size); +} + +static bool mprotect_2way_split(max_vma_count_data_t *self) +{ + return __mprotect(self->test_area, self->test_area_size - psize()); +} + +static bool mprotect_3way_split(max_vma_count_data_t *self) +{ + return __mprotect(self->test_area + psize(), psize()); +} + +static inline bool __munmap(char *addr, int size) +{ + return munmap(addr, size) == 0; +} + +static bool munmap_nosplit(max_vma_count_data_t *self) +{ + return __munmap(self->test_area, self->test_area_size); +} + +static bool munmap_2way_split(max_vma_count_data_t *self) +{ + return __munmap(self->test_area, self->test_area_size - psize()); +} + +static bool munmap_3way_split(max_vma_count_data_t *self) +{ + return __munmap(self->test_area + psize(), psize()); +} + +static bool 
mremap_dontunmap(max_vma_count_data_t *self) +{ + /* + * Using MREMAP_DONTUNMAP will create a new mapping without + * removing the old one, consuming one VMA slot. + */ + return mremap(self->test_area, self->test_area_size, + self->test_area_size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + NULL) != MAP_FAILED; +} + +TEST_F(max_vma_count, mmap_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(mmap_anon(self)); +} + +TEST_F(max_vma_count, mmap_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_FALSE(mmap_anon(self)); +} + +TEST_F(max_vma_count, mprotect_nosplit_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(mprotect_nosplit(self)); +} + +TEST_F(max_vma_count, mprotect_nosplit_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + 
ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(mprotect_nosplit(self)); +} + +TEST_F(max_vma_count, mprotect_2way_split_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(mprotect_2way_split(self)); +} + +TEST_F(max_vma_count, mprotect_2way_split_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_FALSE(mprotect_2way_split(self)); +} + +TEST_F(max_vma_count, mprotect_3way_split_at_2_below_vma_count_limit) +{ + int vma_slots_needed = 2; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(mprotect_3way_split(self)); +} + +TEST_F(max_vma_count, mprotect_3way_split_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + 
MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_FALSE(mprotect_3way_split(self)); +} + +TEST_F(max_vma_count, mprotect_3way_split_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_FALSE(mprotect_3way_split(self)); +} + +TEST_F(max_vma_count, munmap_nosplit_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(munmap_nosplit(self)); +} + +TEST_F(max_vma_count, munmap_nosplit_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(munmap_nosplit(self)); +} + +TEST_F(max_vma_count, munmap_2way_split_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + 
+ ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(munmap_2way_split(self)); +} + +TEST_F(max_vma_count, munmap_2way_split_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(munmap_2way_split(self)); +} + +TEST_F(max_vma_count, munmap_3way_split_at_2_below_vma_count_limit) +{ + int vma_slots_needed = 2; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(munmap_3way_split(self)); +} + +TEST_F(max_vma_count, munmap_3way_split_at_1_below_vma_count_limit) +{ + int vma_slots_needed = 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(munmap_3way_split(self)); +} + +TEST_F(max_vma_count, munmap_3way_split_at_vma_count_limit) +{ + int vma_slots_needed = 0; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + 
ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_FALSE(munmap_3way_split(self)); +} + +TEST_F(max_vma_count, mremap_dontunmap_at_required_vma_count_capacity) +{ + int vma_slots_needed = MREMAP_REQUIRED_VMA_SLOTS; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_TRUE(mremap_dontunmap(self)); +} + +TEST_F(max_vma_count, mremap_dontunmap_at_1_below_required_vma_count_capacity) +{ + int vma_slots_needed = MREMAP_REQUIRED_VMA_SLOTS - 1; + + ASSERT_NE(mmap(self->test_area, self->test_area_size, TEST_AREA_PROT, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0), + MAP_FAILED); + + ASSERT_TRUE(free_vma_slots(self, vma_slots_needed)); + + ASSERT_EQ(get_current_vma_count(), + self->max_vma_count - vma_slots_needed); + ASSERT_TRUE(is_test_area_mapped(self->test_area, self->test_area_size)); + + ASSERT_FALSE(mremap_dontunmap(self)); +} + +TEST_HARNESS_MAIN + +/* --- Utilities --- */ + +static bool lower_max_map_count_if_needed(max_vma_count_data_t *self, + struct __test_metadata *_metadata) +{ + self->max_vma_count = get_max_vma_count(); + + ASSERT_GT(self->max_vma_count, 0); + + self->original_max_vma_count = 0; + if (self->max_vma_count > DEFAULT_MAX_MAP_COUNT) { + self->original_max_vma_count = self->max_vma_count; + TH_LOG("Max VMA count: %d; lowering to default %d for test...", + self->max_vma_count, DEFAULT_MAX_MAP_COUNT); + + if (!set_max_vma_count(DEFAULT_MAX_MAP_COUNT)) + return false; + self->max_vma_count = DEFAULT_MAX_MAP_COUNT; + } + return true; +} + +static void
restore_max_map_count_if_needed(max_vma_count_data_t *self, + struct __test_metadata *_metadata) +{ + if (!self->original_max_vma_count) + return; + + if (self->max_vma_count == self->original_max_vma_count) + return; + + if (!set_max_vma_count(self->original_max_vma_count)) + TH_LOG("Failed to restore max_map_count to %d", + self->original_max_vma_count); +} + +static int get_max_vma_count(void) +{ + unsigned long val; + int ret; + + ret = read_sysfs("/proc/sys/vm/max_map_count", &val); + if (ret) + return -1; + return val; +} + +static bool set_max_vma_count(int val) +{ + return write_sysfs("/proc/sys/vm/max_map_count", val) == 0; +} + +static int get_current_vma_count(void) +{ + struct procmap_fd pmap; + int count = 0; + int ret; + char vma_name[PATH_MAX]; + + ret = open_self_procmap(&pmap); + if (ret) + return -1; + + pmap.query.query_addr = 0; + pmap.query.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA; + + while (true) { + pmap.query.vma_name_addr = (uint64_t)(uintptr_t)vma_name; + pmap.query.vma_name_size = sizeof(vma_name); + vma_name[0] = '\0'; + + ret = query_procmap(&pmap); + if (ret != 0) + break; + + /* + * The [vsyscall] mapping is a special mapping that + * doesn't count against the max_map_count limit. + * Ignore it here to match the kernel's accounting. + */ + if (strcmp(vma_name, "[vsyscall]") != 0) + count++; + + pmap.query.query_addr = pmap.query.vma_end; + } + + close_procmap(&pmap); + return count; +} + +static void create_reservation(max_vma_count_data_t *self, + struct __test_metadata *_metadata) +{ + size_t reservation_size; + void *base_addr = NULL; + + /* + * To break the dependency on knowing the exact number of extra maps + * before creating the reservation, we allocate a reservation size + * large enough for the maximum possible number of extra maps. + * The maximum number of extra maps is bounded by max_vma_count. 
+ */ + reservation_size = ((self->max_vma_count * 2) + + TEST_AREA_NR_PAGES + + 2 /* Holes around TEST_AREA */) * psize(); + + base_addr = mmap(NULL, reservation_size, PROT_NONE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + ASSERT_NE(base_addr, MAP_FAILED); + + ASSERT_EQ(munmap(base_addr, reservation_size), 0); + + /* The test area is offset by one hole page from the base address. */ + self->test_area = (char *)base_addr + psize(); + + /* The extra maps start after the test area and another hole page. */ + self->extra_maps = self->test_area + self->test_area_size + psize(); +} + +static void create_extra_maps(max_vma_count_data_t *self, + struct __test_metadata *_metadata) +{ + char *ptr = self->extra_maps; + + for (int i = 0; i < self->nr_extra_maps; ++i) { + ASSERT_NE(mmap(ptr, psize(), EXTRA_MAP_PROT, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, + -1, 0), MAP_FAILED) { + TH_LOG("Failed on mapping #%d of %d", i + 1, + self->nr_extra_maps); + } + + /* + * Advance pointer by two pages to leave a 1-page hole, + * after each 1-page map. 
+ */ + ptr += (2 * psize()); + } +} + +static bool free_vma_slots(max_vma_count_data_t *self, int slots_to_free) +{ + for (int i = 0; i < slots_to_free; i++) { + if (munmap(self->extra_maps + (i * 2 * psize()), psize()) != 0) + return false; + } + + return true; +} + +static bool is_test_area_mapped(char *test_area, int test_area_size) +{ + struct procmap_fd pmap; + bool found = false; + int ret; + + ret = open_self_procmap(&pmap); + if (ret) + return false; + + pmap.query.query_addr = (uint64_t)(uintptr_t)test_area; + pmap.query.query_flags = 0; /* Find VMA covering address */ + + if (query_procmap(&pmap) == 0 && + pmap.query.vma_start == (unsigned long)test_area && + pmap.query.vma_end == (unsigned long)test_area + test_area_size) + found = true; + + close_procmap(&pmap); + return found; +} + diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d9173f2312b7..a85db61e6a92 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -49,6 +49,8 @@ separated by spaces: test madvise(2) MADV_GUARD_INSTALL and MADV_GUARD_REMOVE options - madv_populate test memadvise(2) MADV_POPULATE_{READ,WRITE} options +- max_vma_count + tests for max vma_count - memfd_secret test memfd_secret(2) - process_mrelease @@ -426,6 +428,9 @@ fi # VADDR64 # vmalloc stability smoke test CATEGORY="vmalloc" run_test bash ./test_vmalloc.sh smoke +# test operations against max vma count limit +CATEGORY="max_vma_count" run_test ./max_vma_count_tests + CATEGORY="mremap" run_test ./mremap_dontunmap CATEGORY="hmm" run_test bash ./test_hmm.sh smoke -- 2.51.0.760.g7b8bcc2412-goog The checks against sysctl_max_map_count are open-coded in multiple places. While simple checks are manageable, the logic in places like mremap.c involves arithmetic with magic numbers that can be difficult to reason about. e.g. ... 
>= sysctl_max_map_count - 3 To improve readability and centralize the logic, introduce a new helper, vma_count_remaining(). This function returns the VMA count headroom available for a given mm. The most common case of checking for a single new VMA can be done with the convenience helper has_vma_count_remaining(): if (!vma_count_remaining(mm)) And the complex checks in mremap.c become clearer by expressing the required capacity directly: if (vma_count_remaining(mm) < 4) While a capacity-based function could be misused (e.g., with an incorrect '<' vs '<=' comparison), the improved readability at the call sites makes such errors less likely than with the previous open-coded arithmetic. As part of this change, sysctl_max_map_count is made static to mm/mmap.c to improve encapsulation. Cc: Andrew Morton Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Minchan Kim Cc: Pedro Falcato Signed-off-by: Kalesh Singh --- Changes in v3: - Move vma_count_remaining() out of #if CONFIG_SYSCTL to fix build failure - Use READ_ONCE() for sysclt_max_map_count, per David, Lorenzo - Remove use of ternary op in vma_count_remaining, per Lorenzo - Rebase on mm-new to fix conflicts in vma_internal.h and mm/internal.h include/linux/mm.h | 2 -- mm/internal.h | 3 +++ mm/mmap.c | 24 +++++++++++++++++++++++- mm/mremap.c | 7 ++++--- mm/nommu.c | 2 +- mm/util.c | 1 - mm/vma.c | 10 +++++----- tools/testing/vma/vma_internal.h | 9 +++++++++ 8 files changed, 45 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c01c4b59ca6..72ff386ef772 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -203,8 +203,6 @@ static inline void __mm_zero_struct_page(struct page *page) #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -extern int sysctl_max_map_count; - extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git 
a/mm/internal.h b/mm/internal.h index a2555be247e5..289aca3bdb6c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1682,4 +1682,7 @@ static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, return remap_pfn_range_complete(vma, addr, pfn, size, prot); } +/* mmap.c */ +int vma_count_remaining(const struct mm_struct *mm); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index da2cbdc0f87b..d9ea029cd018 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ - if (mm->map_count >= sysctl_max_map_count) + if (!vma_count_remaining(mm)) return -ENOMEM; /* @@ -1495,6 +1495,28 @@ struct vm_area_struct *_install_special_mapping( &special_mapping_vmops); } +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; + +/** + * vma_count_remaining - Determine available VMA slots + * @mm: The memory descriptor for the process. + * + * Check how many more VMAs can be created for the given @mm + * before hitting the sysctl_max_map_count limit. + * + * Return: The number of new VMAs the process can accommodate. + */ +int vma_count_remaining(const struct mm_struct *mm) +{ + const int map_count = mm->map_count; + const int max_count = READ_ONCE(sysctl_max_map_count); + + if (map_count >= max_count) + return 0; + + return max_count - map_count; +} + #ifdef CONFIG_SYSCTL #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) diff --git a/mm/mremap.c b/mm/mremap.c index 35de0a7b910e..14d35d87e89b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. 
*/ - if (current->mm->map_count >= sysctl_max_map_count - 3) + if (vma_count_remaining(current->mm) < 4) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1814,9 +1814,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) * split in 3 before unmapping it. * That means 2 more maps (1 for each) to the ones we already hold. * Check whether current map count plus 2 still leads us to 4 maps below - * the threshold, otherwise return -ENOMEM here to be more safe. + * the threshold. In other words, is the current map count + 6 at or + * below the threshold? Otherwise return -ENOMEM here to be more safe. */ - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) + if (vma_count_remaining(current->mm) < 6) return -ENOMEM; return 0; diff --git a/mm/nommu.c b/mm/nommu.c index c3a23b082adb..22e55e7c69c4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1317,7 +1317,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; - if (mm->map_count >= sysctl_max_map_count) + if (!vma_count_remaining(mm)) return -ENOMEM; region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); diff --git a/mm/util.c b/mm/util.c index 088e1f8edcf5..3315e1136c69 100644 --- a/mm/util.c +++ b/mm/util.c @@ -752,7 +752,6 @@ EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; static int sysctl_overcommit_ratio __read_mostly = 50; static unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ diff --git a/mm/vma.c b/mm/vma.c index fba68f13e628..96ba37721002 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -491,8 +491,8 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, } /* - * __split_vma() bypasses sysctl_max_map_count checking. 
We use this where it - * has already been checked or doesn't make sense to fail. + * __split_vma() bypasses vma_count_remaining() checks. We use this where + * it has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int @@ -592,7 +592,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (vma->vm_mm->map_count >= sysctl_max_map_count) + if (!vma_count_remaining(vma->vm_mm)) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); @@ -1345,7 +1345,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + !vma_count_remaining(vms->vma->vm_mm)) { error = -ENOMEM; goto map_count_exceeded; } @@ -2797,7 +2797,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (mm->map_count >= sysctl_max_map_count) + if (!vma_count_remaining(mm)) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 3525f5c15e1b..70f11163ab72 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1484,4 +1484,13 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, return 0; } +/* Helper to get VMA count capacity */ +static int vma_count_remaining(const struct mm_struct *mm) +{ + const int map_count = mm->map_count; + const int max_count = sysctl_max_map_count; + + return (max_count > map_count) ? (max_count - map_count) : 0; +} + #endif /* __MM_VMA_INTERNAL_H */ -- 2.51.0.760.g7b8bcc2412-goog A mechanical rename of the mm_struct->map_count. 
While at it update the vma_count BUG_ON() in exit_mmap() to a WARN_ON_ONCE; no other functional change is intended. The name "map_count" is ambiguous within the memory management subsystem, as it can be confused with the folio/page->_mapcount field, which tracks PTE references. The new name, vma_count, is more precise as this field has always counted the number of vm_area_structs associated with an mm_struct. Cc: Andrew Morton Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Minchan Kim Cc: Pedro Falcato Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Signed-off-by: Kalesh Singh --- Changes in v3: - Change vma_count BUG_ON() in exit_mmap() to WARN_ON_ONCE, per David and Lorenzo - Collect Reviewed-by tags fs/binfmt_elf.c | 2 +- fs/coredump.c | 2 +- include/linux/mm_types.h | 2 +- kernel/fork.c | 2 +- mm/debug.c | 2 +- mm/mmap.c | 10 +++++----- mm/nommu.c | 6 +++--- mm/vma.c | 24 ++++++++++++------------ tools/testing/vma/vma.c | 32 ++++++++++++++++---------------- tools/testing/vma/vma_internal.h | 6 +++--- 10 files changed, 44 insertions(+), 44 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e4653bb99946..a5acfe97612d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1660,7 +1660,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm data[0] = count; data[1] = PAGE_SIZE; /* - * Count usually is less than mm->map_count, + * Count usually is less than mm->vma_count, * we need to move filenames down. */ n = cprm->vma_count - count; diff --git a/fs/coredump.c b/fs/coredump.c index b5fc06a092a4..5e0859813141 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1733,7 +1733,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) cprm->vma_data_size = 0; gate_vma = get_gate_vma(mm); - cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); + cprm->vma_count = mm->vma_count + (gate_vma ? 
1 : 0); cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); if (!cprm->vma_meta) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4e5d59997e4a..97e0541cd415 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1078,7 +1078,7 @@ struct mm_struct { #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif - int map_count; /* number of VMAs */ + int vma_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some * counters diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..c8d59042b34f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1038,7 +1038,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm_pgtables_bytes_init(mm); - mm->map_count = 0; + mm->vma_count = 0; mm->locked_vm = 0; atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); diff --git a/mm/debug.c b/mm/debug.c index 64ddb0c4b4be..a35e2912ae53 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -204,7 +204,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), - mm->map_count, + mm->vma_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, (u64)atomic64_read(&mm->pinned_vm), mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/mmap.c b/mm/mmap.c index d9ea029cd018..b4eda47b88d8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1305,7 +1305,7 @@ void exit_mmap(struct mm_struct *mm) vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); - BUG_ON(count != mm->map_count); + WARN_ON_ONCE(count != mm->vma_count); trace_exit_mmap(mm); destroy: @@ -1508,13 +1508,13 @@ static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; */ int vma_count_remaining(const struct mm_struct *mm) { - const int map_count = mm->map_count; + const int vma_count = mm->vma_count; 
const int max_count = READ_ONCE(sysctl_max_map_count); - if (map_count >= max_count) + if (vma_count >= max_count) return 0; - return max_count - map_count; + return max_count - vma_count; } #ifdef CONFIG_SYSCTL @@ -1828,7 +1828,7 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ vma_iter_bulk_store(&vmi, tmp); - mm->map_count++; + mm->vma_count++; if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/nommu.c b/mm/nommu.c index 22e55e7c69c4..b375d3e00d0c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -577,7 +577,7 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) static void cleanup_vma_from_mm(struct vm_area_struct *vma) { - vma->vm_mm->map_count--; + vma->vm_mm->vma_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { struct address_space *mapping; @@ -1199,7 +1199,7 @@ unsigned long do_mmap(struct file *file, goto error_just_free; setup_vma_to_mm(vma, current->mm); - current->mm->map_count++; + current->mm->vma_count++; /* add the VMA to the tree */ vma_iter_store_new(&vmi, vma); @@ -1367,7 +1367,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, setup_vma_to_mm(vma, mm); setup_vma_to_mm(new, mm); vma_iter_store_new(vmi, new); - mm->map_count++; + mm->vma_count++; return 0; err_vmi_preallocate: diff --git a/mm/vma.c b/mm/vma.c index 96ba37721002..b35a4607cde4 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -352,7 +352,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, * (it may either follow vma or precede it). 
*/ vma_iter_store_new(vmi, vp->insert); - mm->map_count++; + mm->vma_count++; } if (vp->anon_vma) { @@ -383,7 +383,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, } if (vp->remove->anon_vma) anon_vma_merge(vp->vma, vp->remove); - mm->map_count--; + mm->vma_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); @@ -683,13 +683,13 @@ void validate_mm(struct mm_struct *mm) } #endif /* Check for a infinite loop */ - if (++i > mm->map_count + 10) { + if (++i > mm->vma_count + 10) { i = -1; break; } } - if (i != mm->map_count) { - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); + if (i != mm->vma_count) { + pr_emerg("vma_count %d vma iterator %d\n", mm->vma_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); @@ -1266,7 +1266,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct mm_struct *mm; mm = current->mm; - mm->map_count -= vms->vma_count; + mm->vma_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); @@ -1340,14 +1340,14 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, if (vms->start > vms->vma->vm_start) { /* - * Make sure that map_count on return from munmap() will + * Make sure that vma_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. 
*/ if (vms->end < vms->vma->vm_end && !vma_count_remaining(vms->vma->vm_mm)) { error = -ENOMEM; - goto map_count_exceeded; + goto vma_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ @@ -1461,7 +1461,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: -map_count_exceeded: +vma_count_exceeded: return error; } @@ -1795,7 +1795,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) vma_start_write(vma); vma_iter_store_new(&vmi, vma); vma_link_file(vma); - mm->map_count++; + mm->vma_count++; validate_mm(mm); return 0; } @@ -2512,7 +2512,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store_new(vmi, vma); - map->mm->map_count++; + map->mm->vma_count++; vma_link_file(vma); /* @@ -2835,7 +2835,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; - mm->map_count++; + mm->vma_count++; validate_mm(mm); out: perf_event_mmap(vma); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 656e1c75b711..69fa7d14a6c2 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -261,7 +261,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) } mtree_destroy(&mm->mm_mt); - mm->map_count = 0; + mm->vma_count = 0; return count; } @@ -500,7 +500,7 @@ static bool test_merge_new(void) INIT_LIST_HEAD(&vma_d->anon_vma_chain); list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain); ASSERT_FALSE(merged); - ASSERT_EQ(mm.map_count, 4); + ASSERT_EQ(mm.vma_count, 4); /* * Merge BOTH sides. 
@@ -519,7 +519,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 3); + ASSERT_EQ(mm.vma_count, 3); /* * Merge to PREVIOUS VMA. @@ -536,7 +536,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 3); + ASSERT_EQ(mm.vma_count, 3); /* * Merge to NEXT VMA. @@ -555,7 +555,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 6); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 3); + ASSERT_EQ(mm.vma_count, 3); /* * Merge BOTH sides. @@ -573,7 +573,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 2); + ASSERT_EQ(mm.vma_count, 2); /* * Merge to NEXT VMA. @@ -591,7 +591,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0xa); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 2); + ASSERT_EQ(mm.vma_count, 2); /* * Merge BOTH sides. @@ -608,7 +608,7 @@ static bool test_merge_new(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 1); + ASSERT_EQ(mm.vma_count, 1); /* * Final state. @@ -967,7 +967,7 @@ static bool test_vma_merge_new_with_close(void) ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_EQ(vma->vm_ops, &vm_ops); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 2); + ASSERT_EQ(mm.vma_count, 2); cleanup_mm(&mm, &vmi); return true; @@ -1017,7 +1017,7 @@ static bool test_merge_existing(void) ASSERT_EQ(vma->vm_pgoff, 2); ASSERT_TRUE(vma_write_started(vma)); ASSERT_TRUE(vma_write_started(vma_next)); - ASSERT_EQ(mm.map_count, 2); + ASSERT_EQ(mm.vma_count, 2); /* Clear down and reset. 
*/ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1045,7 +1045,7 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_next->vm_pgoff, 2); ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_next)); - ASSERT_EQ(mm.map_count, 1); + ASSERT_EQ(mm.vma_count, 1); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1079,7 +1079,7 @@ static bool test_merge_existing(void) ASSERT_EQ(vma->vm_pgoff, 6); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 2); + ASSERT_EQ(mm.vma_count, 2); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1108,7 +1108,7 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->vm_pgoff, 0); ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); - ASSERT_EQ(mm.map_count, 1); + ASSERT_EQ(mm.vma_count, 1); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1138,7 +1138,7 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->vm_pgoff, 0); ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); - ASSERT_EQ(mm.map_count, 1); + ASSERT_EQ(mm.vma_count, 1); /* Clear down and reset. We should have deleted prev and next. 
*/ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1540,7 +1540,7 @@ static bool test_merge_extend(void) ASSERT_EQ(vma->vm_end, 0x4000); ASSERT_EQ(vma->vm_pgoff, 0); ASSERT_TRUE(vma_write_started(vma)); - ASSERT_EQ(mm.map_count, 1); + ASSERT_EQ(mm.vma_count, 1); cleanup_mm(&mm, &vmi); return true; @@ -1652,7 +1652,7 @@ static bool test_mmap_region_basic(void) 0x24d, NULL); ASSERT_EQ(addr, 0x24d000); - ASSERT_EQ(mm.map_count, 2); + ASSERT_EQ(mm.vma_count, 2); for_each_vma(vmi, vma) { if (vma->vm_start == 0x300000) { diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 70f11163ab72..84760d901656 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -261,7 +261,7 @@ typedef struct { struct mm_struct { struct maple_tree mm_mt; - int map_count; /* number of VMAs */ + int vma_count; /* number of VMAs */ unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ @@ -1487,10 +1487,10 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, /* Helper to get VMA count capacity */ static int vma_count_remaining(const struct mm_struct *mm) { - const int map_count = mm->map_count; + const int vma_count = mm->vma_count; const int max_count = sysctl_max_map_count; - return (max_count > map_count) ? (max_count - map_count) : 0; + return (max_count > vma_count) ? (max_count - vma_count) : 0; } #endif /* __MM_VMA_INTERNAL_H */ -- 2.51.0.760.g7b8bcc2412-goog Needed observability on in field devices can be collected with minimal overhead and can be toggled on and off. Event driven telemetry can be done with tracepoint BPF programs. The process comm is provided for aggregation across devices and tgid is to enable per-process aggregation per device. 
This allows for observing the distribution of such problems in the field, to deduce if there are legitimate bugs or if a bump to the limit is warranted. Cc: Andrew Morton Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Minchan Kim Cc: Pedro Falcato Signed-off-by: Kalesh Singh --- Changes in v3: - capture the mm pointer as the unique identifier and capture the vma_count as well, instead of current task tgid, per Steve - Add include/trace/events/vma.h to MEMORY MAPPING section in MAINTAINERS, per Lorenzo - rename trace_max_vma_count_exceeded() to trace_mm_insufficient_vma_slots(), since this is a preemptive check, per Lorenzo - Fix tools/testing/vma build errors, per Lorenzo MAINTAINERS | 1 + include/trace/events/vma.h | 32 ++++++++++++++++++++++++++++++++ mm/mmap.c | 5 ++++- mm/mremap.c | 10 ++++++++-- mm/vma.c | 9 +++++++-- mm/vma_internal.h | 2 ++ tools/testing/vma/vma_internal.h | 5 +++++ 7 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 include/trace/events/vma.h diff --git a/MAINTAINERS b/MAINTAINERS index aa83e5893e16..d37215a8a829 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16566,6 +16566,7 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/trace/events/mmap.h +F: include/trace/events/vma.h F: mm/interval_tree.c F: mm/mincore.c F: mm/mlock.c diff --git a/include/trace/events/vma.h b/include/trace/events/vma.h new file mode 100644 index 000000000000..4540fa607f66 --- /dev/null +++ b/include/trace/events/vma.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vma + +#if !defined(_TRACE_VMA_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VMA_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(mm_insufficient_vma_slots, + + TP_PROTO(struct mm_struct *mm), + + TP_ARGS(mm), + + TP_STRUCT__entry( + __field(void *, mm) + __field(int, vma_count) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->vma_count = 
mm->vma_count; + ), + + TP_printk("mm=%p vma_count=%d", __entry->mm, __entry->vma_count) +); + +#endif /* _TRACE_VMA_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/mmap.c b/mm/mmap.c index b4eda47b88d8..4035f49ac963 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -56,6 +56,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> +#include <trace/events/vma.h> #include "internal.h" @@ -374,8 +375,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ - if (!vma_count_remaining(mm)) + if (!vma_count_remaining(mm)) { + trace_mm_insufficient_vma_slots(mm); return -ENOMEM; + } /* * addr is returned from get_unmapped_area, diff --git a/mm/mremap.c b/mm/mremap.c index 14d35d87e89b..a7f440a3737f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -30,6 +30,8 @@ #include <asm/cacheflush.h> #include <asm/tlb.h> +#include <trace/events/vma.h> + #include "internal.h" /* Classify the kind of remap operation being performed. */ @@ -1040,8 +1042,10 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. */ - if (vma_count_remaining(current->mm) < 4) + if (vma_count_remaining(current->mm) < 4) { + trace_mm_insufficient_vma_slots(current->mm); return -ENOMEM; + } if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) @@ -1817,8 +1821,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) * the threshold. In other words, is the current map count + 6 at or * below the threshold? Otherwise return -ENOMEM here to be more safe. 
*/ - if (vma_count_remaining(current->mm) < 6) + if (vma_count_remaining(current->mm) < 6) { + trace_mm_insufficient_vma_slots(current->mm); return -ENOMEM; + } return 0; } diff --git a/mm/vma.c b/mm/vma.c index b35a4607cde4..6d8cef7f4d5f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -592,8 +592,10 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (!vma_count_remaining(vma->vm_mm)) + if (!vma_count_remaining(vma->vm_mm)) { + trace_mm_insufficient_vma_slots(vma->vm_mm); return -ENOMEM; + } return __split_vma(vmi, vma, addr, new_below); } @@ -1346,6 +1348,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, */ if (vms->end < vms->vma->vm_end && !vma_count_remaining(vms->vma->vm_mm)) { + trace_mm_insufficient_vma_slots(vms->vma->vm_mm); error = -ENOMEM; goto vma_count_exceeded; } @@ -2797,8 +2800,10 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (!vma_count_remaining(mm)) + if (!vma_count_remaining(mm)) { + trace_mm_insufficient_vma_slots(mm); return -ENOMEM; + } if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; diff --git a/mm/vma_internal.h b/mm/vma_internal.h index 2f05735ff190..86823ca6857b 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -52,4 +52,6 @@ #include "internal.h" +#include <trace/events/vma.h> + #endif /* __MM_VMA_INTERNAL_H */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 84760d901656..57e36d82b4c8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1493,4 +1493,9 @@ static int vma_count_remaining(const struct mm_struct *mm) return (max_count > vma_count) ? 
(max_count - vma_count) : 0; } +/* Stub for trace_mm_insufficient_vma_slots */ +static inline void trace_mm_insufficient_vma_slots(struct mm_struct *mm) +{ +} + #endif /* __MM_VMA_INTERNAL_H */ -- 2.51.0.760.g7b8bcc2412-goog