Add "align" parameter to thp_get_unmapped_area_vmflags() so that it allows get unmapped area with any alignment. There're two existing callers, use PMD_SIZE explicitly for them. No functional change intended. Signed-off-by: Peter Xu --- include/linux/huge_mm.h | 5 +++-- mm/huge_memory.c | 7 ++++--- mm/mmap.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834f..1c221550362d7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -362,7 +362,7 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags); + unsigned long align, vm_flags_t vm_flags); bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, @@ -559,7 +559,8 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, static inline unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t vm_flags) + unsigned long flags, unsigned long align, + vm_flags_t vm_flags) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6cba1cb14b23a..ab2450b985171 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1155,12 +1155,12 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags) + unsigned long align, vm_flags_t vm_flags) { unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; - ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, 
vm_flags); + ret = __thp_get_unmapped_area(filp, addr, len, off, flags, align, vm_flags); if (ret) return ret; @@ -1171,7 +1171,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); + return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, + PMD_SIZE, 0); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); diff --git a/mm/mmap.c b/mm/mmap.c index 5fd3b80fda1d5..8fa397a18252e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -846,7 +846,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, - pgoff, flags, vm_flags); + pgoff, flags, PMD_SIZE, + vm_flags); } else { addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, flags, vm_flags); -- 2.50.1 Add one new file operation, get_mapping_order(). It can be used by file backends to report mapping order hints. By default, Linux assumed we will map in PAGE_SIZE chunks. With this hint, the driver can report the possibility of mapping chunks that are larger than PAGE_SIZE. Then, the VA allocator will try to use that as alignment when allocating the VA ranges. This is useful because when chunks to be mapped are larger than PAGE_SIZE, VA alignment matters and it needs to be aligned with the size of the chunk to be mapped. Said that, no matter what is the alignment used for the VA allocation, the driver can still decide which size to map the chunks. It is also not an issue if it keeps mapping in PAGE_SIZE. get_mapping_order() is defined to take three parameters. Besides the 1st parameter which will be the file object pointer, the 2nd + 3rd parameters being the pgoff + size of the mmap() request. 
Its retval is defined as the order, which must be non-negative to enable the alignment. When zero is returned, it should behave like when the hint is not provided, IOW, alignment will still be PAGE_SIZE. When the order is too big, ignore the hint. Normally drivers are trusted, so it's more of an extra layer of safety measure. Suggested-by: Jason Gunthorpe Signed-off-by: Peter Xu --- Documentation/filesystems/vfs.rst | 4 +++ include/linux/fs.h | 1 + mm/mmap.c | 59 +++++++++++++++++++++++++++---- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 4f13b01e42eb5..b707ddbebbf52 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1069,6 +1069,7 @@ This describes how the VFS can manipulate an open file. As of kernel int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*get_mapping_order)(struct file *, unsigned long, size_t); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); @@ -1165,6 +1166,9 @@ otherwise noted. 
``get_unmapped_area`` called by the mmap(2) system call +``get_mapping_order`` + called by the mmap(2) system call to get mapping order hint + ``check_flags`` called by the fcntl(2) system call for F_SETFL command diff --git a/include/linux/fs.h b/include/linux/fs.h index dd3b57cfadeeb..5ba373576bfe5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2287,6 +2287,7 @@ struct file_operations { int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*get_mapping_order)(struct file *file, unsigned long pgoff, size_t len); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); diff --git a/mm/mmap.c b/mm/mmap.c index 8fa397a18252e..be3dd0623f00c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -808,6 +808,33 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } +static inline bool file_has_mmap_order_hint(struct file *file) +{ + return file && file->f_op && file->f_op->get_mapping_order; +} + +static inline bool +mmap_should_align(struct file *file, unsigned long addr, unsigned long len) +{ + /* When THP not enabled at all, skip */ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return false; + + /* Never try any alignment if the mmap() address hint is provided */ + if (addr) + return false; + + /* Anonymous THP could use some better alignment when len aligned */ + if (!file) + return IS_ALIGNED(len, PMD_SIZE); + + /* + * It's a file mapping, no address hint provided by caller, try any + * alignment if the file backend would provide a hint + */ + return file_has_mmap_order_hint(file); +} + unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 
unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) @@ -815,8 +842,9 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; - unsigned long error = arch_mmap_check(addr, len, flags); + unsigned long align; + if (error) return error; @@ -841,13 +869,30 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file - && !addr /* no hint */ - && IS_ALIGNED(len, PMD_SIZE)) { - /* Ensures that larger anonymous mappings are THP aligned. */ + } else if (mmap_should_align(file, addr, len)) { + if (file_has_mmap_order_hint(file)) { + int order; + /* + * Allow driver to opt-in on the order hint. + * + * Sanity check on the order returned. Treating + * either negative or too big order to be invalid, + * where alignment will be skipped. + */ + order = file->f_op->get_mapping_order(file, pgoff, len); + if (order < 0) + order = 0; + if (check_shl_overflow(PAGE_SIZE, order, &align)) + /* No alignment applied */ + align = PAGE_SIZE; + } else { + /* Default alignment for anonymous THPs */ + align = PMD_SIZE; + } + addr = thp_get_unmapped_area_vmflags(file, addr, len, - pgoff, flags, PMD_SIZE, - vm_flags); + pgoff, flags, + align, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, flags, vm_flags); -- 2.50.1 Add a hook to vfio_device_ops to allow sub-modules provide mapping order hint for an mmap() request. When not available, use the default value (0). Note that this patch will change the code path for vfio on mmap() when allocating the virtual address range to be mapped, however it should not change the result of the VA allocated, because the default value (0) should be the old behavior. 
Signed-off-by: Peter Xu --- drivers/vfio/vfio_main.c | 14 ++++++++++++++ include/linux/vfio.h | 5 +++++ 2 files changed, 19 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60e..3f2107ff93e5d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1372,6 +1372,19 @@ static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep) } #endif +static int vfio_device_get_mapping_order(struct file *file, + unsigned long pgoff, + size_t len) +{ + struct vfio_device_file *df = file->private_data; + struct vfio_device *device = df->device; + + if (device->ops->get_mapping_order) + return device->ops->get_mapping_order(device, pgoff, len); + + return 0; +} + const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, .open = vfio_device_fops_cdev_open, @@ -1384,6 +1397,7 @@ const struct file_operations vfio_device_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = vfio_device_show_fdinfo, #endif + .get_mapping_order = vfio_device_get_mapping_order, }; static struct vfio_device *vfio_device_from_file(struct file *file) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee5..46a4d85fc4953 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -111,6 +111,8 @@ struct vfio_device { * @dma_unmap: Called when userspace unmaps IOVA from the container * this device is attached to. * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl + * @get_mapping_order: Optional, provide mapping order hints for mmap(). + * When unavailable, use the default order (zero). 
*/ struct vfio_device_ops { char *name; @@ -139,6 +141,9 @@ struct vfio_device_ops { void (*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length); int (*device_feature)(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); + int (*get_mapping_order)(struct vfio_device *device, + unsigned long pgoff, + size_t len); }; #if IS_ENABLED(CONFIG_IOMMUFD) -- 2.50.1 This patch enables best-effort mmap() for vfio-pci bars even without MAP_FIXED, so as to utilize huge pfnmaps as much as possible. It should also avoid userspace changes (switching to MAP_FIXED with pre-aligned VA addresses) to start enabling huge pfnmaps on VFIO bars. Here the trick is making sure the MMIO PFNs will be aligned with the VAs allocated from mmap() when !MAP_FIXED, so that whatever returned from mmap(!MAP_FIXED) of vfio-pci MMIO regions will be automatically suitable for huge pfnmaps as much as possible. To achieve that, a custom vfio_device's get_mapping_order() for vfio-pci devices is needed. Note that BAR's MMIO physical addresses should normally be guaranteed to be BAR-size aligned. It means the MMIO address will also always be aligned with vfio-pci's file offset address space, per VFIO_PCI_OFFSET_SHIFT. With that guaranteed, VA allocator can calculate the alignment with pgoff, which will be further aligned with the MMIO physical addresses to be mapped in the VMA later. So far, stick with the simple plan to rely on the hardware assumption that should always be true. Leave it for later if pgoff needs adjustments when there's a real demand for it when calculating the alignment. 
For discussion on the requirement of this feature, see: https://lore.kernel.org/linux-pci/20250529214414.1508155-1-amastro@fb.com/ Signed-off-by: Peter Xu --- drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 49 ++++++++++++++++++++++++++++++++ include/linux/vfio_pci_core.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2f..8f29037cee6eb 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -145,6 +145,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas, .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, + .get_mapping_order = vfio_pci_core_get_mapping_order, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc9..28ab37715acc0 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1640,6 +1640,55 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } +/* + * Hint function for mmap() about the size of mapping to be carried out. + * This helps to enable huge pfnmaps as much as possible on BAR mappings. + * + * This function does the minimum check on mmap() parameters to make the + * hint valid only. The majority of mmap() sanity check will be done later + * in mmap(). 
+ */ +int vfio_pci_core_get_mapping_order(struct vfio_device *device, + unsigned long pgoff, size_t len) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + struct pci_dev *pdev = vdev->pdev; + unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + unsigned long req_start; + size_t phys_len; + + /* Currently, only bars 0-5 supports huge pfnmap */ + if (index >= VFIO_PCI_ROM_REGION_INDEX) + return 0; + + /* + * NOTE: we're keeping things simple as of now, assuming the + * physical address of BARs (aka, pci_resource_start(pdev, index)) + * should always be aligned with pgoff in vfio-pci's address space. + */ + req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1); + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); + + /* + * If this happens, it will probably fail mmap() later.. mapping + * hint isn't important anymore. + */ + if (req_start >= phys_len) + return 0; + + phys_len = MIN(phys_len - req_start, len); + + if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP) && phys_len >= PUD_SIZE) + return PUD_ORDER; + + if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP) && phys_len >= PMD_SIZE) + return PMD_ORDER; + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_mapping_order); + static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned int order) { diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2a..d320dfacc5681 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -119,6 +119,8 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); +int vfio_pci_core_get_mapping_order(struct vfio_device *device, + unsigned long pgoff, size_t len); int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void 
vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); -- 2.50.1