We are currently in the bizarre situation where we are constrained in the number of flags we can set in an mm_struct based on whether this is a 32-bit or 64-bit kernel. This is because mm->flags is an unsigned long field, which is 32 bits on a 32-bit system and 64 bits on a 64-bit system. In order to keep things functional across both architectures, we do not permit mm flag bits to be set above flag 31 (i.e. the 32nd bit). This is a silly situation, especially given how profligate we are in storing metadata in mm_struct, so let's convert mm->flags into a bitmap and allow ourselves as many bits as we like. To keep things manageable, firstly we introduce the bitmap at the system word size as a new field mm->_flags, placed in a union with the existing field. This means the new bitmap mm->_flags is bitwise exactly identical to the existing mm->flags field. We have an opportunity to also introduce some type safety here, so let's wrap the mm flags field as a struct and declare it as an mm_flags_t typedef to keep it consistent with vm_flags_t for VMAs. We mark the internal field private, in order to force the use of helper functions so we can enforce that accesses are bitwise as required. We therefore introduce accessors prefixed with mm_flags_*() for callers to use. We place the bit parameter first so as to match the parameter ordering of the *_bit() functions. Having this temporary union arrangement allows us to incrementally swap over users of mm->flags patch-by-patch rather than having to do everything in one fell swoop. Signed-off-by: Lorenzo Stoakes --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3868ca1a25f9..4ed4a0b9dad6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,6 +34,8 @@ #include #include #include +#include +#include struct mempolicy; struct anon_vma; @@ -720,6 +722,36 @@ static inline void assert_fault_locked(struct vm_fault *vmf) } #endif /* CONFIG_PER_VMA_LOCK */ +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) +{ + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) +{ + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_set(int flag, struct mm_struct *mm) +{ + set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear(int flag, struct mm_struct *mm) +{ + clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear_all(struct mm_struct *mm) +{ + bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); +} + extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf94df4955c7..46d3fb8935c7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -927,6 +928,15 @@ struct mm_cid { }; #endif +/* + * Opaque type representing current mm_struct flag state. Must be accessed via + * mm_flags_xxx() helper functions.
+ */ +#define NUM_MM_FLAG_BITS BITS_PER_LONG +typedef struct { + __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} mm_flags_t; + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1109,7 +1119,11 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - unsigned long flags; /* Must use atomic bitops to access */ + /* Temporary union while we convert users to mm_flags_t. */ + union { + unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ + }; #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1219,6 +1233,29 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +/* Read the first system word of mm flags, non-atomically. */ +static inline unsigned long __mm_flags_get_word(struct mm_struct *mm) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + return bitmap_read(bitmap, 0, BITS_PER_LONG); +} + +/* Set the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_set_word(struct mm_struct *mm, + unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + bitmap_copy(bitmap, &value, BITS_PER_LONG); +} + +/* Obtain a read-only view of the bitmap. */ +static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) +{ + return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; -- 2.50.1 As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. This will result in the debug output being that of a bitmap output, which will result in a minor change here, but since this is for debug only, this should have no bearing. Otherwise, no functional changes intended. Signed-off-by: Lorenzo Stoakes --- include/linux/huge_mm.h | 2 +- include/linux/khugepaged.h | 6 ++++-- include/linux/ksm.h | 6 +++--- include/linux/mm.h | 2 +- include/linux/mman.h | 2 +- include/linux/oom.h | 2 +- mm/debug.c | 4 ++-- mm/gup.c | 10 +++++----- mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 10 +++++----- mm/ksm.c | 32 ++++++++++++++++---------------- mm/mmap.c | 8 ++++---- mm/oom_kill.c | 26 +++++++++++++------------- mm/util.c | 6 +++--- 14 files changed, 63 insertions(+), 61 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 14d424830fa8..84b7eebe0d68 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -327,7 +327,7 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, * example, s390 kvm. 
*/ return (vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); + mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index ff6120463745..eb1946a70cff 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,6 +2,8 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H +#include + extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, mm)) __khugepaged_exit(mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c17b955e7b0b..22e67ca7cba3 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,13 +56,13 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return __ksm_enter(mm); return 0; @@ -70,7 +70,7 @@ static inline int ksm_execve(struct mm_struct *mm) static inline void ksm_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, mm)) __ksm_exit(mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ed4a0b9dad6..34311ebe62cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) return false; return folio_maybe_dma_pinned(folio); diff --git a/include/linux/mman.h b/include/linux/mman.h index de9e8e6229a4..0ba8a7e8b90a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void) static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. 
*/ diff --git a/include/linux/oom.h b/include/linux/oom.h index 1e0fc6931ce9..7b02bc1d0a7e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) */ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { - if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + if (unlikely(mm_flags_test(MMF_UNSTABLE, mm))) return VM_FAULT_SIGBUS; return 0; } diff --git a/mm/debug.c b/mm/debug.c index b4388f4dcd4d..64ddb0c4b4be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm) "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %px flags %lx\n" + "binfmt %px flags %*pb\n" #ifdef CONFIG_AIO "ioctx_table %px\n" #endif @@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm) mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, - mm->binfmt, mm->flags, + mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm), #ifdef CONFIG_AIO mm->ioctx_table, #endif diff --git a/mm/gup.c b/mm/gup.c index adffe663594d..331d22bf7b2d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios); * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU @@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. 
Traditional behavior @@ -3210,7 +3210,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EINVAL; if (gup_flags & FOLL_PIN) - mm_set_has_pinned_flag(¤t->mm->flags); + mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b8bb078a1a34..a2f476e7419a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -251,13 +251,13 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return huge_zero_folio; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -268,7 +268,7 @@ void mm_put_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); } @@ -1145,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (test_bit(MMF_TOPDOWN, ¤t->mm->flags) && !off_sub) + if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c..6470e7e26c8d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP, mm); } static bool hugepage_pmd_enabled(void) @@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; mm_slot = mm_slot_alloc(mm_slot_cache); @@ -472,7 +472,7 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) @@ -497,7 +497,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { @@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already. 
* - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_clear(mm, MMF_VM_HUGEPAGE); */ /* khugepaged_mm_lock actually not necessary for the below */ diff --git a/mm/ksm.c b/mm/ksm.c index 160787bb121c..2ef29802a49b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1217,8 +1217,8 @@ static int unmerge_and_remove_all_rmap_items(void) spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2620,8 +2620,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; @@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm) { int err; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } - set_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; @@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) { int err; - if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); @@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) return err; } - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } @@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } @@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!vma_ksm_compatible(vma)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; @@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm) list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) @@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); + mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); diff --git a/mm/mmap.c b/mm/mmap.c index 7306253cc3b5..7a057e0e8da9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi 
unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - if (test_bit(MMF_TOPDOWN, &mm->flags)) + if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm) * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); @@ -1859,14 +1859,14 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..17650f0b516e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/oom_kill.c - * + * * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... @@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || + mm_flags_test(MMF_OOM_SKIP, p->mm) || in_vfork(p)) { task_unlock(p); return LONG_MIN; @@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm)) goto next; goto abort; } @@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * should imply barriers already and the reader would hit a page fault * if it stumbled over a reaped memory. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) @@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * under mmap_lock for reading because it serializes against the * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { trace_skip_task_reaping(tsk->pid); goto out_unlock; } @@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) + mm_flags_test(MMF_OOM_SKIP, mm)) goto done; pr_info("oom_reaper: unable to reap pid:%d (%s)\n", @@ -634,7 +634,7 @@ static void oom_reap_task(struct task_struct *tsk) * Hide this mm from OOM killer because it has been either reaped or * somebody can't call mmap_write_unlock(mm). 
*/ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); @@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer) unsigned long flags; /* The victim managed to terminate on its own - see exit_mmap */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { put_task_struct(tsk); return; } @@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm)) return; get_task_struct(tsk); @@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, mm)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); @@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) reap = true; else { /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + if (!mm_flags_test(MMF_OOM_SKIP, mm)) ret = -EINVAL; } task_unlock(p); @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); diff --git a/mm/util.c b/mm/util.c index f814e6a59ab1..d235b74f7aff 100644 --- a/mm/util.c +++ b/mm/util.c @@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU -- 2.50.1 As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended.
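To illustrate the shape of the change (a sketch of the pattern applied in the hunks below, not an additional modification), the MDWE handling moves from raw bitops on the flags word to the type-safe helpers, with the bit number first to mirror the *_bit() calling convention:

	/* Before: raw bitop on the old unsigned long field. */
	if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
		ret |= PR_MDWE_REFUSE_EXEC_GAIN;

	/* After: mm_flags_*() accessor taking the mm_struct directly. */
	if (mm_flags_test(MMF_HAS_MDWE, current->mm))
		ret |= PR_MDWE_REFUSE_EXEC_GAIN;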
Signed-off-by: Lorenzo Stoakes --- kernel/sys.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 1e28b40053ce..605f7fe9a143 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2392,9 +2392,9 @@ static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; - if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE, current->mm)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; - if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm)) ret |= PR_MDWE_NO_INHERIT; return ret; @@ -2427,9 +2427,9 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) - set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags); + mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) - set_bit(MMF_HAS_MDWE, &current->mm->flags); + mm_flags_set(MMF_HAS_MDWE, current->mm); return 0; } @@ -2627,7 +2627,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); + error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2635,9 +2635,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) - set_bit(MMF_DISABLE_THP, &me->mm->flags); + mm_flags_set(MMF_DISABLE_THP, me->mm); else - clear_bit(MMF_DISABLE_THP, &me->mm->flags); + mm_flags_clear(MMF_DISABLE_THP, me->mm); mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: @@ -2770,7 +2770,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm); break; #endif case PR_RISCV_V_SET_CONTROL: -- 2.50.1 As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended.
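The arch conversions are purely mechanical. As a sketch of the pattern in the hunks below, each arch_pick_mmap_layout() implementation now selects the search direction with:

	if (mmap_is_legacy())
		mm_flags_clear(MMF_TOPDOWN, mm);	/* was clear_bit(MMF_TOPDOWN, &mm->flags) */
	else
		mm_flags_set(MMF_TOPDOWN, mm);		/* was set_bit(MMF_TOPDOWN, &mm->flags) */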
Signed-off-by: Lorenzo Stoakes --- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 4 ++-- arch/x86/mm/mmap.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 40a526d28184..c884b580eb5e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -182,10 +182,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index c5a284df7b41..785e9909340f 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -309,7 +309,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap == RLIM_INFINITY || sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -320,7 +320,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap = (task_size / 6 * 5); mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 5ed2109211da..708f85dc9380 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -122,9 +122,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); else - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, arch_rnd(mmap64_rnd_bits), task_size_64bit(0), -- 2.50.1 As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended. Signed-off-by: Lorenzo Stoakes --- kernel/events/uprobes.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..31a12b60055f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1153,15 +1153,15 @@ static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page().
*/ - first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) - set_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_set(MMF_HAS_UPROBES, mm); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) - clear_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } @@ -1171,7 +1171,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; - set_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, mm); return set_orig_insn(&uprobe->arch, vma, vaddr); } @@ -1303,7 +1303,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); - } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } @@ -1595,7 +1595,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && - test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) @@ -1655,12 +1655,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || - test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || + mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) - set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, @@ -1823,10 +1823,10 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { - set_bit(MMF_HAS_UPROBES, &newmm->flags); + if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { + mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ - set_bit(MMF_RECALC_UPROBES, &newmm->flags); + mm_flags_set(MMF_RECALC_UPROBES, newmm); } } @@ -2370,7 +2370,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) return; } - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) @@ -2468,7 +2468,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb *is_swbp = -EFAULT; } - if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); @@ -2818,7 +2818,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) if (!current->mm) return 0; - if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) && (!current->utask || !current->utask->return_instances)) return 0; -- 2.50.1 The coredump logic is slightly different from other users in that it both stores mm flags and additionally sets and gets using masks. 
Since the MMF_DUMPABLE_* flags must remain as they are for uABI reasons, and of course these are within the first 32-bits of the flags, it is reasonable to provide access to these in the same fashion so this logic can all still keep working as it has been. Therefore, introduce coredump-specific helpers __mm_flags_get_dumpable() and __mm_flags_set_mask_dumpable() for this purpose, and update all core dump users of mm flags to use these. Signed-off-by: Lorenzo Stoakes --- fs/coredump.c | 4 +++- fs/exec.c | 2 +- fs/pidfs.c | 7 +++++-- fs/proc/base.c | 8 +++++--- include/linux/sched/coredump.h | 21 ++++++++++++++++++++- 5 files changed, 34 insertions(+), 8 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index fedbead956ed..e5d9d6276990 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1103,8 +1103,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) * We must use the same mm->flags while dumping core to avoid * inconsistency of bit flags, since this flag is not protected * by any locks. + * + * Note that we only care about MMF_DUMP* flags. */ - .mm_flags = mm->flags, + .mm_flags = __mm_flags_get_dumpable(mm), .vma_meta = NULL, .cpu = raw_smp_processor_id(), }; diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..dbac0e84cc3e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1999,7 +1999,7 @@ void set_dumpable(struct mm_struct *mm, int value) if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; - set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); + __mm_flags_set_mask_dumpable(mm, value); } SYSCALL_DEFINE3(execve, diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..5148b7646b7f 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -357,8 +357,11 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { task_lock(task); - if (task->mm) - kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); + if (task->mm) { + unsigned long flags = __mm_flags_get_dumpable(task->mm); + + kinfo.coredump_mask = pidfs_coredump_mask(flags); + } task_unlock(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 62d35631ba8c..f0c093c58aaf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2962,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { + unsigned long flags = __mm_flags_get_dumpable(mm); + len = snprintf(buffer, sizeof(buffer), "%08lx\n", - ((mm->flags & MMF_DUMP_FILTER_MASK) >> + ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -3002,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) - set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else - clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 6eb65ceed213..19ecfcceb27a 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -2,12 +2,29 @@ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H +#include #include #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ +static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +{ + /* + * By 
convention, dumpable bits are contained in the first 32 bits of the + * bitmap, so we can simply access this first unsigned long directly. + */ + return __mm_flags_get_word(mm); +} + +static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + set_mask_bits(bitmap, MMF_DUMPABLE_MASK, value); +} + extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -22,7 +39,9 @@ static inline int __get_dumpable(unsigned long mm_flags) static inline int get_dumpable(struct mm_struct *mm) { - return __get_dumpable(mm->flags); + unsigned long flags = __mm_flags_get_dumpable(mm); + + return __get_dumpable(flags); } #endif /* _LINUX_SCHED_COREDUMP_H */ -- 2.50.1 There is an issue with the mask declarations in linux/mm_types.h, which naively do (1 << bit) operations. Unfortunately this results in the 1 being treated as a signed (32-bit) integer. When the compiler expands the MMF_INIT_MASK bitmask it comes up with: (((1 << 2) - 1) | (((1 << 9) - 1) << 2) | (1 << 24) | (1 << 28) | (1 << 30) | (1 << 31)) This overflows the signed integer, giving -788,527,105. Implicitly converting this to an unsigned long results in sign-extension, and thus this value becomes 0xffffffffd10007ff, rather than the intended 0xd10007ff. While we're limited to a maximum of 32 bits in mm->flags, this isn't an issue as the remaining bits being masked will always be zero. However, now that we are moving towards having more bits in this field, this becomes an issue. Simply resolve this by using the _BITUL() helper to cast the shifted value to an unsigned long. Signed-off-by: Lorenzo Stoakes --- include/linux/mm_types.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 46d3fb8935c7..38b3fa927997 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1756,7 +1756,7 @@ enum { * the modes are SUID_DUMP_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +#define MMF_DUMPABLE_MASK (_BITUL(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1771,13 +1771,13 @@ enum { #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) + ((_BITUL(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + (_BITUL(MMF_DUMP_ANON_PRIVATE) | _BITUL(MMF_DUMP_ANON_SHARED) | \ + _BITUL(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +# define MMF_DUMP_MASK_DEFAULT_ELF _BITUL(MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif @@ -1797,7 +1797,7 @@ enum { #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_DISABLE_THP_MASK _BITUL(MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm
is shared between processes */ /* @@ -1810,16 +1810,15 @@ enum { #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - +#define MMF_HAS_MDWE_MASK _BITUL(MMF_HAS_MDWE) #define MMF_HAS_MDWE_NO_INHERIT 29 #define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_VM_MERGE_ANY_MASK _BITUL(MMF_VM_MERGE_ANY) #define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) +#define MMF_TOPDOWN_MASK _BITUL(MMF_TOPDOWN) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ -- 2.50.1 We now need to account for flag initialisation on fork. We retain the existing logic as much as we can, but dub the existing flag mask legacy. These flags are therefore required to fit in the first 32-bits of the flags field. However, further flag propagation upon fork can be implemented in mm_init() on a per-flag basis. We ensure we clear the entire bitmap prior to setting it, and use __mm_flags_get_word() and __mm_flags_set_word() to manipulate these legacy fields efficiently. Signed-off-by: Lorenzo Stoakes --- include/linux/mm_types.h | 13 ++++++++++--- kernel/fork.c | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38b3fa927997..25577ab39094 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1820,16 +1820,23 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK _BITUL(MMF_TOPDOWN) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) -static inline unsigned long mmf_init_flags(unsigned long flags) +/* Legacy flags must fit within 32 bits. */ +static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX); + +/* + * Initialise legacy flags according to masks, propagating selected flags on + * fork. Further flag manipulation can be performed by the caller. + */ +static inline unsigned long mmf_init_legacy_flags(unsigned long flags) { if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) flags &= ~((1UL << MMF_HAS_MDWE) | (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; + return flags & MMF_INIT_LEGACY_MASK; } #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd..b311caec6419 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1056,11 +1056,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_set_word(mm, default_dump_filter); mm->def_flags = 0; } -- 2.50.1 As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended. 
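As with the other call sites, this is a mechanical substitution of the accessor for the raw bitop. As a sketch of the pattern in the hunks below, the /proc THP status check becomes:

	/* Before */
	thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);

	/* After */
	thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm);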
Signed-off-by: Lorenzo Stoakes --- fs/proc/array.c | 2 +- fs/proc/base.c | 4 ++-- fs/proc/task_mmu.c | 2 +- kernel/fork.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index d6a0369caa93..c286dc12325e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/fs/proc/base.c b/fs/proc/base.c index f0c093c58aaf..b997ceef9135 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1163,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { + if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) { mm = p->mm; mmgrab(mm); } @@ -3276,7 +3276,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); seq_printf(m, "ksm_merge_any: %s\n", - test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? "yes" : "no"); + mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e64cf40ce9c4..e8e7bef34531 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1592,7 +1592,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return false; if (!is_cow_mapping(vma->vm_flags)) return false; - if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) + if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))) return false; folio = vm_normal_folio(vma, addr, pte); if (!folio) diff --git a/kernel/fork.c b/kernel/fork.c index b311caec6419..68c81539193d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1887,7 +1887,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); - set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; -- 2.50.1 Now that we have updated all users of mm->flags to use the bitmap accessors, replace it with the bitmap version entirely. We are then able to move to having 64 bits of mm->flags on both 32-bit and 64-bit architectures. We also update the VMA userland tests to ensure that everything remains functional there. No functional changes intended, other than there now being 64 bits of available mm_struct flags.
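With the conversion complete, every flag is accessed through the bitmap helpers, so a future flag can be declared above bit 31 and behaves the same on 32-bit and 64-bit kernels. As a purely hypothetical sketch (MMF_EXAMPLE is not added by this series and is for illustration only):

	#define MMF_EXAMPLE 32	/* hypothetical flag above the old 31-bit limit */

	mm_flags_set(MMF_EXAMPLE, mm);
	if (mm_flags_test(MMF_EXAMPLE, mm))
		mm_flags_clear(MMF_EXAMPLE, mm);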
Signed-off-by: Lorenzo Stoakes --- include/linux/mm.h | 12 ++++++------ include/linux/mm_types.h | 14 +++++--------- include/linux/sched/coredump.h | 2 +- tools/testing/vma/vma_internal.h | 19 +++++++++++++++++-- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 34311ebe62cc..b61e2d4858cf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,32 +724,32 @@ static inline void assert_fault_locked(struct vm_fault *vmf) static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { - return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) { - return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) { - return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_set(int flag, struct mm_struct *mm) { - set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear(int flag, struct mm_struct *mm) { - clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear_all(struct mm_struct *mm) { - bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); + bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); } extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 25577ab39094..47d2e4598acd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -932,7 +932,7 @@ struct mm_cid { * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. */ -#define NUM_MM_FLAG_BITS BITS_PER_LONG +#define NUM_MM_FLAG_BITS (64) typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; @@ -1119,11 +1119,7 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - /* Temporary union while we convert users to mm_flags_t. */ - union { - unsigned long flags; /* Must use atomic bitops to access */ - mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ - }; + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1236,7 +1232,7 @@ struct mm_struct { /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(struct mm_struct *mm) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); return bitmap_read(bitmap, 0, BITS_PER_LONG); } @@ -1245,7 +1241,7 @@ static inline unsigned long __mm_flags_get_word(struct mm_struct *mm) static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); bitmap_copy(bitmap, &value, BITS_PER_LONG); } @@ -1253,7 +1249,7 @@ static inline void __mm_flags_set_word(struct mm_struct *mm, /* Obtain a read-only view of the bitmap.
*/ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { - return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); + return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); } #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 19ecfcceb27a..079ae5a97480 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -20,7 +20,7 @@ static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); set_mask_bits(bitmap, MMF_DUMPABLE_MASK, value); } diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index cb1c2a8afe26..f13354bf0a1e 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -249,6 +249,14 @@ struct mutex {}; #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = {} +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define NUM_MM_FLAG_BITS (64) +typedef struct { + __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} mm_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -260,7 +268,7 @@ struct mm_struct { unsigned long def_flags; - unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ }; struct vm_area_struct; @@ -1333,6 +1341,13 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +# define ACCESS_PRIVATE(p, member) ((p)->member) + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1363,7 +1378,7 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ -- 2.50.1