From: Shivank Garg Previously, guest-memfd allocations followed local NUMA node id in absence of process mempolicy, resulting in arbitrary memory allocation. Moreover, mbind() couldn't be used by the VMM as guest memory wasn't mapped into userspace when allocation occurred. Enable NUMA policy support by implementing vm_ops for guest-memfd mmap operation. This allows the VMM to map the memory and use mbind() to set the desired NUMA policy. The policy is stored in the inode structure via kvm_gmem_inode_info, as memory policy is a property of the memory (struct inode) itself. The policy is then retrieved via mpol_shared_policy_lookup() and passed to filemap_grab_folio_mpol() to ensure that allocations follow the specified memory policy. This enables the VMM to control guest memory NUMA placement by calling mbind() on the mapped memory regions, providing fine-grained control over guest memory allocation across NUMA nodes. The policy change only affect future allocations and does not migrate existing memory. This matches mbind(2)'s default behavior which affects only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags, which are not supported for guest_memfd as it is unmovable. Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: document the ABI impact of the ->get_policy() hook] Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index cc3b25155726..95267c92983b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ struct gmem_file { }; struct gmem_inode { + struct shared_policy policy; struct inode vfs_inode; }; @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi, + pgoff_t index) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol; + + mpol = mpol_shared_policy_lookup(&gi->policy, index); + return mpol ? mpol : get_task_policy(current); +#else + return NULL; +#endif +} + /* * Returns a locked folio on success. The caller is responsible for * setting the up-to-date flag before the memory is mapped into the guest. @@ -124,7 +139,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ - return filemap_grab_folio(inode->i_mapping, index); + struct mempolicy *policy; + struct folio *folio; + + /* + * Fast-path: See if folio is already present in mapping to avoid + * policy_lookup. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (!IS_ERR(folio)) + return folio; + + policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index); + folio = __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; } static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) @@ -413,8 +446,38 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) return ret; } +#ifdef CONFIG_NUMA +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); +} + +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *pgoff) +{ + struct inode *inode = file_inode(vma->vm_file); + + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* + * Note! Directly return whatever the lookup returns, do NOT return + * the current task's policy as is done when looking up the policy for + * a specific folio. Kernel ABI for get_mempolicy() is to return + * MPOL_DEFAULT when there is no defined policy, not whatever the + * default policy resolves to. + */ + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); +} +#endif /* CONFIG_NUMA */ + static const struct vm_operations_struct kvm_gmem_vm_ops = { - .fault = kvm_gmem_fault_user_mapping, + .fault = kvm_gmem_fault_user_mapping, +#ifdef CONFIG_NUMA + .get_policy = kvm_gmem_get_policy, + .set_policy = kvm_gmem_set_policy, +#endif }; static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) @@ -867,11 +930,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) if (!gi) return NULL; + mpol_shared_policy_init(&gi->policy, NULL); return &gi->vfs_inode; } static void kvm_gmem_destroy_inode(struct inode *inode) { + mpol_free_shared_policy(&GMEM_I(inode)->policy); } static void kvm_gmem_free_inode(struct inode *inode) -- 2.51.0.710.ga91ca5db03-goog