Rename the "kvm_gmem" structure to "gmem_file" in anticipation of using dedicated guest_memfd inodes instead of anonyomous inodes, at which point the "kvm_gmem" nomenclature becomes quite misleading. In guest_memfd, inodes are effectively the raw underlying physical storage, and will be used to track properties of the physical memory, while each gmem file is effectively a single VM's view of that storage, and is used to track assets specific to its associated VM, e.g. memslots=>gmem bindings. Using "kvm_gmem" suggests that the per-VM/per-file structures are _the_ guest_memfd instance, which almost the exact opposite of reality. Opportunistically rename local variables from "gmem" to "f", again to avoid confusion once guest_memfd specific inodes come along. No functional change intended. Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 100 ++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fbca8c0972da..3c57fb42f12c 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -7,7 +7,16 @@ #include "kvm_mm.h" -struct kvm_gmem { +/* + * A guest_memfd instance can be associated multiple VMs, each with its own + * "view" of the underlying physical memory. + * + * The gmem's inode is effectively the raw underlying physical storage, and is + * used to track properties of the physical memory, while each gmem file is + * effectively a single VM's view of that storage, and is used to track assets + * specific to its associated VM, e.g. memslots=>gmem bindings. + */ +struct gmem_file { struct kvm *kvm; struct xarray bindings; struct list_head entry; @@ -110,16 +119,16 @@ static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *in return KVM_FILTER_PRIVATE; } -static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, pgoff_t end, enum kvm_gfn_range_filter attr_filter) { bool flush = false, found_memslot = false; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; - xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { pgoff_t pgoff = slot->gmem.pgoff; struct kvm_gfn_range gfn_range = { @@ -152,20 +161,20 @@ static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, { struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; - struct kvm_gmem *gmem; + struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_begin(gmem, start, end, attr_filter); + list_for_each_entry(f, gmem_list, entry) + __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } -static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, pgoff_t end) { - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; - if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_end(kvm); KVM_MMU_UNLOCK(kvm); @@ -176,10 +185,10 @@ static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { struct list_head *gmem_list = &inode->i_mapping->i_private_list; - struct kvm_gmem *gmem; + struct gmem_file *f; - 
list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_end(gmem, start, end); + list_for_each_entry(f, gmem_list, entry) + __kvm_gmem_invalidate_end(f, start, end); } static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -277,9 +286,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, static int kvm_gmem_release(struct inode *inode, struct file *file) { - struct kvm_gmem *gmem = file->private_data; + struct gmem_file *f = file->private_data; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; /* @@ -299,7 +308,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) filemap_invalidate_lock(inode->i_mapping); - xa_for_each(&gmem->bindings, index, slot) + xa_for_each(&f->bindings, index, slot) WRITE_ONCE(slot->gmem.file, NULL); /* @@ -307,18 +316,18 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) * Zap all SPTEs pointed at by this file. Do not free the backing * memory, as its lifetime is associated with the inode, not the file. */ - __kvm_gmem_invalidate_begin(gmem, 0, -1ul, + __kvm_gmem_invalidate_begin(f, 0, -1ul, kvm_gmem_get_invalidate_filter(inode)); - __kvm_gmem_invalidate_end(gmem, 0, -1ul); + __kvm_gmem_invalidate_end(f, 0, -1ul); - list_del(&gmem->entry); + list_del(&f->entry); filemap_invalidate_unlock(inode->i_mapping); mutex_unlock(&kvm->slots_lock); - xa_destroy(&gmem->bindings); - kfree(gmem); + xa_destroy(&f->bindings); + kfree(f); kvm_put_kvm(kvm); @@ -493,7 +502,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { const char *anon_name = "[kvm-gmem]"; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int fd, err; @@ -502,14 +511,13 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) if (fd < 0) return fd; - gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); - if (!gmem) { + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) { err = -ENOMEM; goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, - O_RDWR, NULL); + file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_gmem; @@ -531,15 +539,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); kvm_get_kvm(kvm); - gmem->kvm = kvm; - xa_init(&gmem->bindings); - list_add(&gmem->entry, &inode->i_mapping->i_private_list); + f->kvm = kvm; + xa_init(&f->bindings); + list_add(&f->entry, &inode->i_mapping->i_private_list); fd_install(fd, file); return fd; err_gmem: - kfree(gmem); + kfree(f); err_fd: put_unused_fd(fd); return err; @@ -564,7 +572,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, { loff_t size = slot->npages << PAGE_SHIFT; unsigned long start, end; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int r = -EINVAL; @@ -578,8 +586,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (file->f_op != &kvm_gmem_fops) goto err; - gmem = file->private_data; - if (gmem->kvm != kvm) + f = file->private_data; + if (f->kvm != kvm) goto err; inode = file_inode(file); @@ -593,8 +601,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, start = offset >> PAGE_SHIFT; end = start + slot->npages; - if (!xa_empty(&gmem->bindings) && - xa_find(&gmem->bindings, &start, end - 1, 
XA_PRESENT)) { + if (!xa_empty(&f->bindings) && + xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { filemap_invalidate_unlock(inode->i_mapping); goto err; } @@ -609,7 +617,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (kvm_gmem_supports_mmap(inode)) slot->flags |= KVM_MEMSLOT_GMEM_ONLY; - xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); /* @@ -627,7 +635,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - struct kvm_gmem *gmem; + struct gmem_file *f; struct file *file; /* @@ -638,10 +646,10 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) if (!file) return; - gmem = file->private_data; + f = file->private_data; filemap_invalidate_lock(file->f_mapping); - xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL); /* * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() @@ -659,18 +667,18 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, pgoff_t index, kvm_pfn_t *pfn, bool *is_prepared, int *max_order) { - struct file *gmem_file = READ_ONCE(slot->gmem.file); - struct kvm_gmem *gmem = file->private_data; + struct file *slot_file = READ_ONCE(slot->gmem.file); + struct gmem_file *f = file->private_data; struct folio *folio; - if (file != gmem_file) { - WARN_ON_ONCE(gmem_file); + if (file != slot_file) { + WARN_ON_ONCE(slot_file); return ERR_PTR(-EFAULT); } - gmem = file->private_data; - if (xa_load(&gmem->bindings, index) != slot) { - WARN_ON_ONCE(xa_load(&gmem->bindings, index)); + f = file->private_data; + if (xa_load(&f->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&f->bindings, index)); return ERR_PTR(-EIO); } -- 2.51.0.710.ga91ca5db03-goog Add a kvm_gmem_for_each_file() to make it more obvious that KVM is iterating over guest_memfd _files_, not guest_memfd instances, as could be assumed given the name "gmem_list". No functional change intended. Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 3c57fb42f12c..9b9e239b3073 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -22,6 +22,9 @@ struct gmem_file { struct list_head entry; }; +#define kvm_gmem_for_each_file(f, mapping) \ + list_for_each_entry(f, &(mapping)->i_private_list, entry) + /** * folio_file_pfn - like folio_file_page, but return a pfn. * @folio: The folio which contains this index. 
@@ -159,13 +162,12 @@ static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(f, gmem_list, entry) + kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } @@ -184,10 +186,9 @@ static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; struct gmem_file *f; - list_for_each_entry(f, gmem_list, entry) + kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_end(f, start, end); } -- 2.51.0.710.ga91ca5db03-goog From: Ackerley Tng guest_memfd's inode represents memory the guest_memfd is providing. guest_memfd's file represents a struct kvm's view of that memory. Using a custom inode allows customization of the inode teardown process via callbacks. For example, ->evict_inode() allows customization of the truncation process on file close, and ->destroy_inode() and ->free_inode() allow customization of the inode freeing process. Customizing the truncation process allows flexibility in management of guest_memfd memory, and customization of the inode freeing process allows proper cleanup of memory metadata stored on the inode. Memory metadata is more appropriately stored on the inode (as opposed to the file), since the metadata is for the memory and is not unique to a specific binding and struct kvm. Acked-by: David Hildenbrand Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Ackerley Tng Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: drop helpers, open code logic in __kvm_gmem_create()] Signed-off-by: Sean Christopherson --- include/uapi/linux/magic.h | 1 + virt/kvm/guest_memfd.c | 82 +++++++++++++++++++++++++++++++------- virt/kvm/kvm_main.c | 7 +++- virt/kvm/kvm_mm.h | 9 +++-- 4 files changed, 80 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index bb575f3ab45e..638ca21b7a90 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -103,5 +103,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ +#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 9b9e239b3073..2a580b2bdc9d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -1,12 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include +#include #include +#include #include -#include #include "kvm_mm.h" +static struct vfsmount *kvm_gmem_mnt; + /* * A guest_memfd instance can be associated with multiple VMs, each with its own * "view" of the underlying physical memory.
@@ -426,11 +430,6 @@ static struct file_operations kvm_gmem_fops = { .fallocate = kvm_gmem_fallocate, }; -void kvm_gmem_init(struct module *module) -{ - kvm_gmem_fops.owner = module; -} - static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -502,7 +501,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { - const char *anon_name = "[kvm-gmem]"; + static const char *name = "[kvm-gmem]"; struct gmem_file *f; struct inode *inode; struct file *file; @@ -518,16 +517,17 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL); - if (IS_ERR(file)) { - err = PTR_ERR(file); + /* __fput() will take care of fops_put(). */ + if (!fops_get(&kvm_gmem_fops)) { + err = -ENOENT; goto err_gmem; } - file->f_flags |= O_LARGEFILE; - - inode = file->f_inode; - WARN_ON(file->f_mapping != inode->i_mapping); + inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_fops; + } inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; @@ -539,6 +539,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_inode; + } + + file->f_flags |= O_LARGEFILE; + file->private_data = f; + kvm_get_kvm(kvm); f->kvm = kvm; xa_init(&f->bindings); @@ -547,6 +556,10 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) fd_install(fd, file); return fd; +err_inode: + iput(inode); +err_fops: + fops_put(&kvm_gmem_fops); err_gmem: kfree(f); err_fd: @@ -819,3 +832,44 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif + +static int kvm_gmem_init_fs_context(struct fs_context *fc) +{ + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + + return 0; +} + +static struct file_system_type kvm_gmem_fs = { + .name = "guest_memfd", + .init_fs_context = kvm_gmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int kvm_gmem_init_mount(void) +{ + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); + + if (IS_ERR(kvm_gmem_mnt)) + return PTR_ERR(kvm_gmem_mnt); + + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; + return 0; +} + +int kvm_gmem_init(struct module *module) +{ + kvm_gmem_fops.owner = module; + + return kvm_gmem_init_mount(); +} + +void kvm_gmem_exit(void) +{ + kern_unmount(kvm_gmem_mnt); + kvm_gmem_mnt = NULL; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4845e5739436 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6517,7 +6517,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (WARN_ON_ONCE(r)) goto err_vfio; - kvm_gmem_init(module); + r = kvm_gmem_init(module); + if (r) + goto err_gmem; r = kvm_init_virtualization(); if (r) @@ -6538,6 +6540,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) err_register: kvm_uninit_virtualization(); err_virt: + kvm_gmem_exit(); +err_gmem: kvm_vfio_ops_exit(); err_vfio: 
kvm_async_pf_deinit(); @@ -6569,6 +6573,7 @@ void kvm_exit(void) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_gmem_exit(); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); kvm_irqfd_exit(); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 31defb08ccba..9fcc5d5b7f8d 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -68,17 +68,18 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, #endif /* HAVE_KVM_PFNCACHE */ #ifdef CONFIG_KVM_GUEST_MEMFD -void kvm_gmem_init(struct module *module); +int kvm_gmem_init(struct module *module); +void kvm_gmem_exit(void); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset); void kvm_gmem_unbind(struct kvm_memory_slot *slot); #else -static inline void kvm_gmem_init(struct module *module) +static inline int kvm_gmem_init(struct module *module) { - + return 0; } - +static inline void kvm_gmem_exit(void) {}; static inline int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset) -- 2.51.0.710.ga91ca5db03-goog From: Shivank Garg Add a dedicated gmem_inode structure and a slab-allocated inode cache for guest memory backing, similar to how shmem handles inodes. This adds the necessary allocation/destruction functions and prepares for upcoming guest_memfd NUMA policy support changes. Using a dedicated structure will also allow for additional cleanups, e.g. to track flags in gmem_inode instead of i_private. Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: s/kvm_gmem_inode_info/gmem_inode, name init_once()] Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 77 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 2a580b2bdc9d..cc3b25155726 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -26,6 +26,15 @@ struct gmem_file { struct list_head entry; }; +struct gmem_inode { + struct inode vfs_inode; +}; + +static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) +{ + return container_of(inode, struct gmem_inode, vfs_inode); +} + #define kvm_gmem_for_each_file(f, mapping) \ list_for_each_entry(f, &(mapping)->i_private_list, entry) @@ -833,13 +842,61 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif +static struct kmem_cache *kvm_gmem_inode_cachep; + +static void kvm_gmem_init_inode_once(void *__gi) +{ + struct gmem_inode *gi = __gi; + + /* + * Note! Don't initialize the inode with anything specific to the + * guest_memfd instance, or that might be specific to how the inode is + * used (from the VFS-layer's perspective). This hook is called only + * during the initial slab allocation, i.e. only fields/state that are + * idempotent across _all_ use of the inode _object_ can be initialized + * at this time!
+ */ + inode_init_once(&gi->vfs_inode); } + +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) +{ + struct gmem_inode *gi; + + gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); + if (!gi) + return NULL; + + return &gi->vfs_inode; +} + +static void kvm_gmem_destroy_inode(struct inode *inode) +{ +} + +static void kvm_gmem_free_inode(struct inode *inode) +{ + kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode)); +} + +static const struct super_operations kvm_gmem_super_operations = { + .statfs = simple_statfs, + .alloc_inode = kvm_gmem_alloc_inode, + .destroy_inode = kvm_gmem_destroy_inode, + .free_inode = kvm_gmem_free_inode, +}; + static int kvm_gmem_init_fs_context(struct fs_context *fc) { + struct pseudo_fs_context *ctx; + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; + ctx = fc->fs_private; + ctx->ops = &kvm_gmem_super_operations; return 0; } @@ -863,13 +920,31 @@ static int kvm_gmem_init_mount(void) int kvm_gmem_init(struct module *module) { + struct kmem_cache_args args = { + .align = 0, + .ctor = kvm_gmem_init_inode_once, + }; + int ret; + kvm_gmem_fops.owner = module; + kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", + sizeof(struct gmem_inode), + &args, SLAB_ACCOUNT); + if (!kvm_gmem_inode_cachep) + return -ENOMEM; - return kvm_gmem_init_mount(); + ret = kvm_gmem_init_mount(); + if (ret) { + kmem_cache_destroy(kvm_gmem_inode_cachep); + return ret; + } + return 0; } void kvm_gmem_exit(void) { kern_unmount(kvm_gmem_mnt); kvm_gmem_mnt = NULL; + rcu_barrier(); + kmem_cache_destroy(kvm_gmem_inode_cachep); } -- 2.51.0.710.ga91ca5db03-goog From: Shivank Garg Previously, guest-memfd allocations followed the local NUMA node id in the absence of a process mempolicy, resulting in arbitrary memory allocation. Moreover, mbind() couldn't be used by the VMM as guest memory wasn't mapped into userspace when allocation occurred. Enable NUMA policy support by implementing vm_ops for the guest-memfd mmap operation. This allows the VMM to map the memory and use mbind() to set the desired NUMA policy. The policy is stored in the inode structure, via gmem_inode, as memory policy is a property of the memory (struct inode) itself. The policy is then retrieved via mpol_shared_policy_lookup() and passed to __filemap_get_folio_mpol() to ensure that allocations follow the specified memory policy. This enables the VMM to control guest memory NUMA placement by calling mbind() on the mapped memory regions, providing fine-grained control over guest memory allocation across NUMA nodes. The policy change only affects future allocations and does not migrate existing memory. This matches mbind(2)'s default behavior, which affects only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags, which are not supported for guest_memfd as it is unmovable.
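To illustrate the intended flow, a minimal userspace sketch (not part of the patch; guest_memfd creation via KVM_CREATE_GUEST_MEMFD, a mappable guest_memfd, and all error handling are assumed/elided, and gmem_fd/size are placeholder names):

	/* Bind the first half of a guest_memfd's memory to NUMA node 0. */
	unsigned long nodemask = 1;	/* bit 0 == node 0 */
	void *mem;

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);

	/* Per the above, this affects only pages allocated from here on. */
	mbind(mem, size / 2, MPOL_BIND, &nodemask, 8, 0);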
Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: document the ABI impact of the ->get_policy() hook] Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index cc3b25155726..95267c92983b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ struct gmem_file { }; struct gmem_inode { + struct shared_policy policy; struct inode vfs_inode; }; @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi, + pgoff_t index) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol; + + mpol = mpol_shared_policy_lookup(&gi->policy, index); + return mpol ? mpol : get_task_policy(current); +#else + return NULL; +#endif +} + /* * Returns a locked folio on success. The caller is responsible for * setting the up-to-date flag before the memory is mapped into the guest. @@ -124,7 +139,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ - return filemap_grab_folio(inode->i_mapping, index); + struct mempolicy *policy; + struct folio *folio; + + /* + * Fast-path: See if folio is already present in mapping to avoid + * policy_lookup. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (!IS_ERR(folio)) + return folio; + + policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index); + folio = __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; } static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) @@ -413,8 +446,38 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) return ret; } +#ifdef CONFIG_NUMA +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); +} + +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *pgoff) +{ + struct inode *inode = file_inode(vma->vm_file); + + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* + * Note! Directly return whatever the lookup returns, do NOT return + * the current task's policy as is done when looking up the policy for + * a specific folio. Kernel ABI for get_mempolicy() is to return + * MPOL_DEFAULT when there is no defined policy, not whatever the + * default policy resolves to. 
+ */ + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); +} +#endif /* CONFIG_NUMA */ + static const struct vm_operations_struct kvm_gmem_vm_ops = { - .fault = kvm_gmem_fault_user_mapping, + .fault = kvm_gmem_fault_user_mapping, +#ifdef CONFIG_NUMA + .get_policy = kvm_gmem_get_policy, + .set_policy = kvm_gmem_set_policy, +#endif }; static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) @@ -867,11 +930,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) if (!gi) return NULL; + mpol_shared_policy_init(&gi->policy, NULL); return &gi->vfs_inode; } static void kvm_gmem_destroy_inode(struct inode *inode) { + mpol_free_shared_policy(&GMEM_I(inode)->policy); } static void kvm_gmem_free_inode(struct inode *inode) -- 2.51.0.710.ga91ca5db03-goog Add kvm_ wrappers for munmap(), close(), fallocate(), and ftruncate() to cut down on boilerplate code when a syscall is expected to succeed, and to make it easier for developers to remember to assert success. Implement and use a macro framework similar to the kernel's SYSCALL_DEFINE infrastructure to further cut down on boilerplate code, and to drastically reduce the probability of typos as the kernel's syscall definitions can be copied and pasted almost verbatim. Provide macros to build the raw syscall wrappers as well, e.g. to replace hand-coded wrappers (NUMA) or pure open-coded calls. Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 2 +- .../selftests/kvm/include/kvm_syscalls.h | 81 +++++++++++++++++++ .../testing/selftests/kvm/include/kvm_util.h | 29 +------ .../selftests/kvm/kvm_binary_stats_test.c | 4 +- tools/testing/selftests/kvm/lib/kvm_util.c | 31 ++----- .../kvm/x86/private_mem_conversions_test.c | 9 +-- 6 files changed, 96 insertions(+), 60 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/kvm_syscalls.h diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 6338f5bbdb70..8d7758f12280 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -636,7 +636,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) - close(fd[f]); + kvm_close(fd[f]); } /* handles the valid case: intid=0xffffffff num=1 */ diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h new file mode 100644 index 000000000000..d4e613162bba --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_SYSCALLS_H +#define SELFTEST_KVM_SYSCALLS_H + +#include + +#define MAP_ARGS0(m,...) +#define MAP_ARGS1(m,t,a,...) m(t,a) +#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) +#define MAP_ARGS3(m,t,a,...) m(t,a), MAP_ARGS2(m,__VA_ARGS__) +#define MAP_ARGS4(m,t,a,...) m(t,a), MAP_ARGS3(m,__VA_ARGS__) +#define MAP_ARGS5(m,t,a,...) m(t,a), MAP_ARGS4(m,__VA_ARGS__) +#define MAP_ARGS6(m,t,a,...) m(t,a), MAP_ARGS5(m,__VA_ARGS__) +#define MAP_ARGS(n,...) MAP_ARGS##n(__VA_ARGS__) + +#define __DECLARE_ARGS(t, a) t a +#define __UNPACK_ARGS(t, a) a + +#define DECLARE_ARGS(nr_args, args...) MAP_ARGS(nr_args, __DECLARE_ARGS, args) +#define UNPACK_ARGS(nr_args, args...)
MAP_ARGS(nr_args, __UNPACK_ARGS, args) + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* Define a kvm_() API to assert success. */ +#define __KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline void kvm_##name(DECLARE_ARGS(nr_args, args)) \ +{ \ + int r; \ + \ + r = name(UNPACK_ARGS(nr_args, args)); \ + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR(#name, r)); \ +} + +/* + * Macro to define syscall APIs, either because KVM selftests doesn't link to + * the standard library, e.g. libnuma, or because there is no library that yet + * provides the syscall. These generate both the raw syscall wrapper and the + * asserting kvm_ variant. + */ +#define KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline long name(DECLARE_ARGS(nr_args, args)) \ +{ \ + return syscall(__NR_##name, UNPACK_ARGS(nr_args, args)); \ +} \ +__KVM_SYSCALL_DEFINE(name, nr_args, args) + +/* + * Special case mmap(), as KVM selftests rarely/never specify an address, + * rarely specify an offset, and because the unique return code requires + * special handling anyways. + */ +static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, + off_t offset) +{ + void *mem; + + mem = mmap(NULL, size, prot, flags, fd, offset); + TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", + (int)(unsigned long)MAP_FAILED)); + return mem; +} + +static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) +{ + return __kvm_mmap(size, prot, flags, fd, 0); +} + +static inline int kvm_dup(int fd) +{ + int new_fd = dup(fd); + + TEST_ASSERT(new_fd >= 0, __KVM_SYSCALL_ERROR("dup()", new_fd)); + return new_fd; +} + +__KVM_SYSCALL_DEFINE(munmap, 2, void *, mem, size_t, size); +__KVM_SYSCALL_DEFINE(close, 1, int, fd); +__KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, len); +__KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length); + +#endif /* SELFTEST_KVM_SYSCALLS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index ee60dbf5208a..c610169933ef 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -23,6 +23,7 @@ #include +#include "kvm_syscalls.h" #include "kvm_util_arch.h" #include "kvm_util_types.h" #include "sparsebit.h" @@ -283,34 +284,6 @@ static inline bool kvm_has_cap(long cap) return kvm_check_cap(cap); } -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, - off_t offset) -{ - void *mem; - - mem = mmap(NULL, size, prot, flags, fd, offset); - TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", - (int)(unsigned long)MAP_FAILED)); - - return mem; -} - -static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) -{ - return __kvm_mmap(size, prot, flags, fd, 0); -} - -static inline void kvm_munmap(void *mem, size_t size) -{ - int ret; - - ret = munmap(mem, size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); -} - /* * Use the "inner", double-underscore macro when reporting errors from within * other macros so that the name of ioctl() and not its literal numeric value diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index f02355c3c4c2..b7dbde9c0843 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -239,14 +239,14 @@ int main(int
argc, char *argv[]) * single stats file works and doesn't cause explosions. */ vm_stats_fds = vm_get_stats_fd(vms[i]); - stats_test(dup(vm_stats_fds)); + stats_test(kvm_dup(vm_stats_fds)); /* Verify userspace can instantiate multiple stats files. */ stats_test(vm_get_stats_fd(vms[i])); for (j = 0; j < max_vcpu; ++j) { vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); - stats_test(dup(vcpu_stats_fds[j])); + stats_test(kvm_dup(vcpu_stats_fds[j])); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 83a721be7ec5..8b60b767224b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -704,8 +704,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) static void kvm_stats_release(struct kvm_binary_stats *stats) { - int ret; - if (stats->fd < 0) return; @@ -714,8 +712,7 @@ static void kvm_stats_release(struct kvm_binary_stats *stats) stats->desc = NULL; } - ret = close(stats->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(stats->fd); stats->fd = -1; } @@ -738,8 +735,6 @@ __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) */ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { - int ret; - if (vcpu->dirty_gfns) { kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size); vcpu->dirty_gfns = NULL; @@ -747,9 +742,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_munmap(vcpu->run, vcpu_mmap_sz()); - ret = close(vcpu->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - + kvm_close(vcpu->fd); kvm_stats_release(&vcpu->stats); list_del(&vcpu->list); @@ -761,16 +754,12 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) void kvm_vm_release(struct kvm_vm *vmp) { struct kvm_vcpu *vcpu, *tmp; - int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) vm_vcpu_rm(vmp, vcpu); - ret = close(vmp->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - - ret = close(vmp->kvm_fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(vmp->fd); + kvm_close(vmp->kvm_fd); /* Free cached stats metadata and close FD */ kvm_stats_release(&vmp->stats); @@ -828,7 +817,7 @@ void kvm_vm_free(struct kvm_vm *vmp) int kvm_memfd_alloc(size_t size, bool hugepages) { int memfd_flags = MFD_CLOEXEC; - int fd, r; + int fd; if (hugepages) memfd_flags |= MFD_HUGETLB; @@ -836,11 +825,8 @@ int kvm_memfd_alloc(size_t size, bool hugepages) fd = memfd_create("kvm_selftest", memfd_flags); TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); - r = ftruncate(fd, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); - - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_ftruncate(fd, size); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); return fd; } @@ -1084,8 +1070,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, * needing to track if the fd is owned by the framework * or by the caller. 
*/ - guest_memfd = dup(guest_memfd); - TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + guest_memfd = kvm_dup(guest_memfd); } region->region.guest_memfd = guest_memfd; diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c index 82a8d88b5338..1969f4ab9b28 100644 --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -380,7 +380,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; pthread_t threads[KVM_MAX_VCPUS]; struct kvm_vm *vm; - int memfd, i, r; + int memfd, i; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, @@ -428,11 +428,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t * should prevent the VM from being fully destroyed until the last * reference to the guest_memfd is also put. */ - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); close(memfd); } -- 2.51.0.710.ga91ca5db03-goog For all selftests, register handlers for signals that are likely to happen due to test (or kernel) bugs, and explicitly fail tests on unexpected signals so that users get a stack trace, i.e. don't have to go spelunking to do basic triage. Register the handlers as early as possible, to catch as many unexpected signals as possible, and also so that the common code doesn't clobber a handler that's installed by test (or arch) code. Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8b60b767224b..0c3a6a40d1a9 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2290,11 +2290,35 @@ __weak void kvm_selftest_arch_init(void) { } +static void report_unexpected_signal(int signum) +{ +#define KVM_CASE_SIGNUM(sig) \ + case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum) + + switch (signum) { + KVM_CASE_SIGNUM(SIGBUS); + KVM_CASE_SIGNUM(SIGSEGV); + KVM_CASE_SIGNUM(SIGILL); + KVM_CASE_SIGNUM(SIGFPE); + default: + TEST_FAIL("Unexpected signal %d\n", signum); + } +} + void __attribute((constructor)) kvm_selftest_init(void) { + struct sigaction sig_sa = { + .sa_handler = report_unexpected_signal, + }; + /* Tell stdout not to buffer its content. */ setbuf(stdout, NULL); + sigaction(SIGBUS, &sig_sa, NULL); + sigaction(SIGSEGV, &sig_sa, NULL); + sigaction(SIGILL, &sig_sa, NULL); + sigaction(SIGFPE, &sig_sa, NULL); + guest_random_seed = last_guest_seed = random(); pr_info("Random seed: 0x%x\n", guest_random_seed); -- 2.51.0.710.ga91ca5db03-goog Add APIs for all syscalls defined in the kernel's mm/mempolicy.c to match those that would be provided by linking to libnuma. Opportunistically use the recently introduced KVM_SYSCALL_DEFINE() builders to take care of the boilerplate, and to fix a flaw where the two existing wrappers would generate multiple symbols if numaif.h were to be included multiple times.
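For reference, hand-expanding one of the new wrappers shows what the builders generate; e.g. KVM_SYSCALL_DEFINE(migrate_pages, 4, ...) becomes roughly the following (a sketch, whitespace massaged):

	static inline long migrate_pages(int pid, unsigned long maxnode,
					 const unsigned long *frommask,
					 const unsigned long *tomask)
	{
		return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask);
	}

	/* ...plus the kvm_ variant that asserts success. */
	static inline void kvm_migrate_pages(int pid, unsigned long maxnode,
					     const unsigned long *frommask,
					     const unsigned long *tomask)
	{
		int r;

		r = migrate_pages(pid, maxnode, frommask, tomask);
		TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("migrate_pages", r));
	}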
Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/numaif.h | 36 +++++++++++-------- .../selftests/kvm/x86/xapic_ipi_test.c | 5 ++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index b020547403fd..aaa4ac174890 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -13,23 +13,29 @@ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H -#define __NR_get_mempolicy 239 -#define __NR_migrate_pages 256 +#include -/* System calls */ -long get_mempolicy(int *policy, const unsigned long *nmask, - unsigned long maxnode, void *addr, int flags) -{ - return syscall(__NR_get_mempolicy, policy, nmask, - maxnode, addr, flags); -} +#include "kvm_syscalls.h" -long migrate_pages(int pid, unsigned long maxnode, - const unsigned long *frommask, - const unsigned long *tomask) -{ - return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); -} +KVM_SYSCALL_DEFINE(get_mempolicy, 5, int *, policy, const unsigned long *, nmask, + unsigned long, maxnode, void *, addr, int, flags); + +KVM_SYSCALL_DEFINE(set_mempolicy, 3, int, mode, const unsigned long *, nmask, + unsigned long, maxnode); + +KVM_SYSCALL_DEFINE(set_mempolicy_home_node, 4, unsigned long, start, + unsigned long, len, unsigned long, home_node, + unsigned long, flags); + +KVM_SYSCALL_DEFINE(migrate_pages, 4, int, pid, unsigned long, maxnode, + const unsigned long *, frommask, const unsigned long *, tomask); + +KVM_SYSCALL_DEFINE(move_pages, 6, int, pid, unsigned long, count, void *, pages, + const int *, nodes, int *, status, int, flags); + +KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, + const unsigned long *, nodemask, unsigned long, maxnode, + unsigned int, flags); /* Policies */ #define MPOL_DEFAULT 0 diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c index 35cb9de54a82..ae4a4b6c05ca 100644 --- a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -256,7 +256,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, int nodes = 0; time_t start_time, last_update, now; time_t interval_secs = 1; - int i, r; + int i; int from, to; unsigned long bit; uint64_t hlt_count; @@ -267,9 +267,8 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, delay_usecs); /* Get set of first 64 numa nodes available */ - r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + kvm_get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, 0, MPOL_F_MEMS_ALLOWED); - TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " "(each 1-bit indicates node is present): %#lx\n", -- 2.51.0.710.ga91ca5db03-goog Include mempolicy.h in KVM's numaif.h to pick up the kernel-provided NUMA definitions, and drop selftests' definitions, which are _mostly_ equivalent. The syscall numbers in particular are subtly x86_64-specific, i.e. will cause problems if/when numaif.h is used outside of x86. Opportunistically clean up the file comment and make the syscall wrappers static inline so that including the header multiple times won't lead to weirdness (currently numaif.h is included by exactly one header).
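To make the portability hazard concrete (syscall numbers quoted from my reading of the respective unistd tables; double-check before relying on them):

	/* x86_64 (what numaif.h hardcoded):  __NR_get_mempolicy == 239 */
	/* asm-generic (arm64, riscv, ...):   __NR_get_mempolicy == 236,
	 *                                    __NR_move_pages    == 239 */

i.e. invoking syscall(239, ...) on an asm-generic architecture would call move_pages(), not get_mempolicy().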
Fixes: 346b59f220a2 ("KVM: selftests: Add missing header file needed by xAPIC IPI tests") Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/numaif.h | 32 +------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index aaa4ac174890..1554003c40a1 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -1,14 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/numaif.h - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Header file that provides access to NUMA API functions not explicitly - * exported to user space. - */ +/* Copyright (C) 2020, Google LLC. */ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H @@ -37,25 +28,4 @@ KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, const unsigned long *, nodemask, unsigned long, maxnode, unsigned int, flags); -/* Policies */ -#define MPOL_DEFAULT 0 -#define MPOL_PREFERRED 1 -#define MPOL_BIND 2 -#define MPOL_INTERLEAVE 3 - -#define MPOL_MAX MPOL_INTERLEAVE - -/* Flags for get_mem_policy */ -#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ - /* Warning: MPOL_F_NODE is unsupported and - * subject to change. Don't use. - */ -#define MPOL_F_ADDR (1<<1) /* look up vma using address */ -#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ - -/* Flags for mbind */ -#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ - #endif /* SELFTEST_KVM_NUMAIF_H */ -- 2.51.0.710.ga91ca5db03-goog From: Shivank Garg Add NUMA helpers to probe for support/availability and to check if the test is running on a multi-node system. The APIs will be used to verify guest_memfd NUMA support. Signed-off-by: Shivank Garg [sean: land helpers in numaif.h, add comments, tweak names] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/numaif.h | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index 1554003c40a1..29572a6d789c 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -4,6 +4,8 @@ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H +#include + #include #include "kvm_syscalls.h" @@ -28,4 +30,54 @@ KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, const unsigned long *, nodemask, unsigned long, maxnode, unsigned int, flags); +static inline int get_max_numa_node(void) +{ + struct dirent *de; + int max_node = 0; + DIR *d; + + /* + * Assume there's a single node if the kernel doesn't support NUMA, + * or if no nodes are found. + */ + d = opendir("/sys/devices/system/node"); + if (!d) + return 0; + + while ((de = readdir(d)) != NULL) { + int node_id; + char *endptr; + + if (strncmp(de->d_name, "node", 4) != 0) + continue; + + node_id = strtol(de->d_name + 4, &endptr, 10); + if (*endptr != '\0') + continue; + + if (node_id > max_node) + max_node = node_id; + } + closedir(d); + + return max_node; +} + +static inline bool is_numa_available(void) +{ + /* + * Probe for NUMA by doing a dummy get_mempolicy().
If the syscall + * fails with ENOSYS, then the kernel was built without NUMA support. + * If the syscall fails with EPERM, then the process/user lacks the + * necessary capabilities (CAP_SYS_NICE). + */ + return !get_mempolicy(NULL, NULL, 0, NULL, 0) || + (errno != ENOSYS && errno != EPERM); +} + +static inline bool is_multi_numa_node_system(void) +{ + return is_numa_available() && get_max_numa_node() >= 1; +} + #endif /* SELFTEST_KVM_NUMAIF_H */ -- 2.51.0.710.ga91ca5db03-goog From: Shivank Garg Add tests for NUMA memory policy binding and NUMA-aware allocation in guest_memfd. This extends the existing selftests by adding proper validation for: - KVM GMEM set_policy and get_policy() vm_ops functionality using mbind() and get_mempolicy() - NUMA policy application before and after memory allocation Run the NUMA mbind() test with and without INIT_SHARED, as KVM should allow doing mbind(), madvise(), etc. on guest-private memory, e.g. so that userspace can set NUMA policy for CoCo VMs. Run the NUMA allocation test only with INIT_SHARED, i.e. skip it if the host can't fault in memory (via direct access, madvise(), etc.), as move_pages() returns -ENOENT if the page hasn't been faulted in (it walks the host page tables to find the associated folio). Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: don't skip entire test when running on non-NUMA system, test mbind() with private memory, provide more info in assert messages] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/guest_memfd_test.c | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index e7d9aeb418d3..618c937f3c90 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -19,6 +19,7 @@ #include #include "kvm_util.h" +#include "numaif.h" #include "test_util.h" #include "ucall_common.h" @@ -75,6 +76,101 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } +static void test_mbind(int fd, size_t total_size) +{ + const unsigned long nodemask_0 = 1; /* nid: 0 */ + unsigned long nodemask = 0; + unsigned long maxnode = 8; + int policy; + char *mem; + int ret; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + /* Test MPOL_INTERLEAVE policy */ + kvm_mbind(mem, page_size * 2, MPOL_INTERLEAVE, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_INTERLEAVE && nodemask == nodemask_0, + "Wanted MPOL_INTERLEAVE (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_INTERLEAVE, nodemask_0, policy, nodemask); + + /* Test basic MPOL_BIND policy */ + kvm_mbind(mem + page_size * 2, page_size * 2, MPOL_BIND, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem + page_size * 2, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_BIND && nodemask == nodemask_0, + "Wanted MPOL_BIND (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_BIND, nodemask_0, policy, nodemask); + + /* Test MPOL_DEFAULT policy */ + kvm_mbind(mem, total_size, MPOL_DEFAULT, NULL, 0, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_DEFAULT && !nodemask, + "Wanted MPOL_DEFAULT (%u) and nodemask 0x0, got %u and 0x%lx", + MPOL_DEFAULT, policy, nodemask); + + /* Test with invalid policy */ + ret = mbind(mem, page_size, 999, &nodemask_0, maxnode, 0); + TEST_ASSERT(ret ==
-1 && errno == EINVAL, + "mbind with invalid policy should fail with EINVAL"); + + kvm_munmap(mem, total_size); +} + +static void test_numa_allocation(int fd, size_t total_size) +{ + unsigned long node0_mask = 1; /* Node 0 */ + unsigned long node1_mask = 2; /* Node 1 */ + unsigned long maxnode = 8; + void *pages[4]; + int status[4]; + char *mem; + int i; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + for (i = 0; i < 4; i++) + pages[i] = (char *)mem + page_size * i; + + /* Set NUMA policy after allocation */ + memset(mem, 0xaa, page_size); + kvm_mbind(pages[0], page_size, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, page_size); + + /* Set NUMA policy before allocation */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + /* Validate if pages are allocated on specified NUMA nodes */ + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 1, "Expected page 0 on node 1, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 1, "Expected page 1 on node 1, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 0, "Expected page 2 on node 0, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 0, "Expected page 3 on node 0, got it on node %d", status[3]); + + /* Punch hole for all pages */ + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, total_size); + + /* Change NUMA policy nodes and reallocate */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 0, "Expected page 0 on node 0, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 0, "Expected page 1 on node 0, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 1, "Expected page 2 on node 1, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 1, "Expected page 3 on node 1, got it on node %d", status[3]); + + kvm_munmap(mem, total_size); +} + static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) { const char val = 0xaa; @@ -273,11 +369,13 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { gmem_test(mmap_supported, vm, flags); gmem_test(fault_overflow, vm, flags); + gmem_test(numa_allocation, vm, flags); } else { gmem_test(fault_private, vm, flags); } gmem_test(mmap_cow, vm, flags); + gmem_test(mbind, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); } -- 2.51.0.710.ga91ca5db03-goog Track a guest_memfd instance's flags in gmem_inode instead of burying them in i_private. Burning an extra 8 bytes per inode is well worth the added clarity provided by explicit tracking. 
Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 95267c92983b..10df35a9ffd4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -30,6 +30,8 @@ struct gmem_file { struct gmem_inode { struct shared_policy policy; struct inode vfs_inode; + + u64 flags; }; static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) @@ -162,7 +164,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) { - if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED) + if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED) return KVM_FILTER_SHARED; return KVM_FILTER_PRIVATE; @@ -398,9 +400,7 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) static bool kvm_gmem_supports_mmap(struct inode *inode) { - const u64 flags = (u64)inode->i_private; - - return flags & GUEST_MEMFD_FLAG_MMAP; + return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -412,7 +412,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; - if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; folio = kvm_gmem_get_folio(inode, vmf->pgoff); @@ -601,7 +601,6 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_fops; } - inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; inode->i_mode |= S_IFREG; @@ -611,6 +610,8 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + GMEM_I(inode)->flags = flags; + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); if (IS_ERR(file)) { err = PTR_ERR(file); @@ -931,6 +932,8 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) return NULL; mpol_shared_policy_init(&gi->policy, NULL); + + gi->flags = 0; return &gi->vfs_inode; } -- 2.51.0.710.ga91ca5db03-goog