From: Matthew Brost The core MM splits the folio before calling folio_free, restoring the zone pages associated with the folio to an initialized state (e.g., non-compound, pgmap valid, etc...). The order argument represents the folio’s order prior to the split which can be used driver side to know how many pages are being freed. Fixes: 3a5a06554566 ("mm/zone_device: rename page_free callback to folio_free") Cc: Zi Yan Cc: Madhavan Srinivasan Cc: Nicholas Piggin Cc: Michael Ellerman Cc: "Christophe Leroy (CS GROUP)" Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Cc: David Airlie Cc: Simona Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: Lyude Paul Cc: Danilo Krummrich Cc: Bjorn Helgaas Cc: Logan Gunthorpe Cc: David Hildenbrand Cc: Oscar Salvador Cc: Andrew Morton Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Balbir Singh Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Alistair Popple Cc: linuxppc-dev@lists.ozlabs.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: amd-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Cc: nouveau@lists.freedesktop.org Cc: linux-pci@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-cxl@vger.kernel.org Signed-off-by: Matthew Brost Signed-off-by: Francois Dugast --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/drm_pagemap.c | 3 ++- drivers/gpu/drm/nouveau/nouveau_dmem.c | 4 ++-- drivers/pci/p2pdma.c | 2 +- include/linux/memremap.h | 7 ++++++- lib/test_hmm.c | 4 +--- mm/memremap.c | 5 +++-- 8 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e5000bef90f2..b58f34eec6e5 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -1014,7 +1014,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) * to a normal PFN during H_SVM_PAGE_OUT. * Gets called with kvm->arch.uvmem_lock held. */ -static void kvmppc_uvmem_folio_free(struct folio *folio) +static void kvmppc_uvmem_folio_free(struct folio *folio, unsigned int order) { struct page *page = &folio->page; unsigned long pfn = page_to_pfn(page) - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index af53e796ea1b..a26e3c448e47 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -567,7 +567,7 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, return r < 0 ? r : 0; } -static void svm_migrate_folio_free(struct folio *folio) +static void svm_migrate_folio_free(struct folio *folio, unsigned int order) { struct page *page = &folio->page; struct svm_range_bo *svm_bo = page->zone_device_data; diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 03ee39a761a4..df253b13cf85 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -1144,11 +1144,12 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas, /** * drm_pagemap_folio_free() - Put GPU SVM zone device data associated with a folio * @folio: Pointer to the folio + * @order: Order of the folio prior to being split by core MM * * This function is a callback used to put the GPU SVM zone device data * associated with a page when it is being released. 
*/ -static void drm_pagemap_folio_free(struct folio *folio) +static void drm_pagemap_folio_free(struct folio *folio, unsigned int order) { drm_pagemap_zdd_put(folio->page.zone_device_data); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 58071652679d..545f316fca14 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -115,14 +115,14 @@ unsigned long nouveau_dmem_page_addr(struct page *page) return chunk->bo->offset + off; } -static void nouveau_dmem_folio_free(struct folio *folio) +static void nouveau_dmem_folio_free(struct folio *folio, unsigned int order) { struct page *page = &folio->page; struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); struct nouveau_dmem *dmem = chunk->drm->dmem; spin_lock(&dmem->lock); - if (folio_order(folio)) { + if (order) { page->zone_device_data = dmem->free_folios; dmem->free_folios = folio; } else { diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 4a2fc7ab42c3..a6fa7610f8a8 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -200,7 +200,7 @@ static const struct attribute_group p2pmem_group = { .name = "p2pmem", }; -static void p2pdma_folio_free(struct folio *folio) +static void p2pdma_folio_free(struct folio *folio, unsigned int order) { struct page *page = &folio->page; struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 713ec0435b48..97fcffeb1c1e 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -79,8 +79,13 @@ struct dev_pagemap_ops { * Called once the folio refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare * for handing out the folio again. + * + * The core MM splits the folio before calling folio_free, restoring the + * zone pages associated with the folio to an initialized state (e.g., + * non-compound, pgmap valid, etc...). The order argument represents the + * folio’s order prior to the split. */ - void (*folio_free)(struct folio *folio); + void (*folio_free)(struct folio *folio, unsigned int order); /* * Used for private (un-addressable) device memory only. 
Must migrate diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 8af169d3873a..e17c71d02a3a 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1580,13 +1580,11 @@ static const struct file_operations dmirror_fops = { .owner = THIS_MODULE, }; -static void dmirror_devmem_free(struct folio *folio) +static void dmirror_devmem_free(struct folio *folio, unsigned int order) { struct page *page = &folio->page; struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; - struct folio *rfolio = page_folio(rpage); - unsigned int order = folio_order(rfolio); if (rpage != page) { if (order) diff --git a/mm/memremap.c b/mm/memremap.c index 63c6ab4fdf08..39dc4bd190d0 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -417,6 +417,7 @@ void free_zone_device_folio(struct folio *folio) { struct dev_pagemap *pgmap = folio->pgmap; unsigned long nr = folio_nr_pages(folio); + unsigned int order = folio_order(folio); int i; if (WARN_ON_ONCE(!pgmap)) @@ -453,7 +454,7 @@ void free_zone_device_folio(struct folio *folio) case MEMORY_DEVICE_COHERENT: if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->folio_free(folio); + pgmap->ops->folio_free(folio, order); percpu_ref_put_many(&folio->pgmap->ref, nr); break; @@ -472,7 +473,7 @@ void free_zone_device_folio(struct folio *folio) case MEMORY_DEVICE_PCI_P2PDMA: if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->folio_free(folio); + pgmap->ops->folio_free(folio, order); break; } } -- 2.43.0 From: Matthew Brost Add free_zone_device_folio_prepare(), a helper that restores large ZONE_DEVICE folios to a sane, initial state before freeing them. Compound ZONE_DEVICE folios overwrite per-page state (e.g. pgmap and compound metadata). Before returning such pages to the device pgmap allocator, each constituent page must be reset to a standalone ZONE_DEVICE folio with a valid pgmap and no compound state. Use this helper prior to folio_free() for device-private and device-coherent folios to ensure consistent device page state for subsequent allocations. Fixes: d245f9b4ab80 ("mm/zone_device: support large zone device private folios") Cc: Zi Yan Cc: David Hildenbrand Cc: Oscar Salvador Cc: Andrew Morton Cc: Balbir Singh Cc: Lorenzo Stoakes Cc: Liam R. 
Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: linux-cxl@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Alistair Popple Signed-off-by: Matthew Brost Signed-off-by: Francois Dugast --- include/linux/memremap.h | 1 + mm/memremap.c | 55 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 97fcffeb1c1e..88e1d4707296 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -230,6 +230,7 @@ static inline bool is_fsdax_page(const struct page *page) #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page, unsigned int order); +void free_zone_device_folio_prepare(struct folio *folio); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/mm/memremap.c b/mm/memremap.c index 39dc4bd190d0..375a61e18858 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -413,6 +413,60 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_dev_pagemap); +/** + * free_zone_device_folio_prepare() - Prepare a ZONE_DEVICE folio for freeing. + * @folio: ZONE_DEVICE folio to prepare for release. + * + * ZONE_DEVICE pages/folios (e.g., device-private memory or fsdax-backed pages) + * can be compound. When freeing a compound ZONE_DEVICE folio, the tail pages + * must be restored to a sane ZONE_DEVICE state before they are released. + * + * This helper: + * - Clears @folio->mapping and, for compound folios, clears each page's + * compound-head state (ClearPageHead()/clear_compound_head()). + * - Resets the compound order metadata (folio_reset_order()) and then + * initializes each constituent page as a standalone ZONE_DEVICE folio: + * * clears ->mapping + * * restores ->pgmap (prep_compound_page() overwrites it) + * * clears ->share (only relevant for fsdax; unused for device-private) + * + * If @folio is order-0, only the mapping is cleared and no further work is + * required. + */ +void free_zone_device_folio_prepare(struct folio *folio) +{ + struct dev_pagemap *pgmap = page_pgmap(&folio->page); + int order, i; + + VM_WARN_ON_FOLIO(!folio_is_zone_device(folio), folio); + + folio->mapping = NULL; + order = folio_order(folio); + if (!order) + return; + + folio_reset_order(folio); + + for (i = 0; i < (1UL << order); i++) { + struct page *page = folio_page(folio, i); + struct folio *new_folio = (struct folio *)page; + + ClearPageHead(page); + clear_compound_head(page); + + new_folio->mapping = NULL; + /* + * Reset pgmap which was over-written by + * prep_compound_page(). 
+ */ + new_folio->pgmap = pgmap; + new_folio->share = 0; /* fsdax only, unused for device private */ + VM_WARN_ON_FOLIO(folio_ref_count(new_folio), new_folio); + VM_WARN_ON_FOLIO(!folio_is_zone_device(new_folio), new_folio); + } +} +EXPORT_SYMBOL_GPL(free_zone_device_folio_prepare); + void free_zone_device_folio(struct folio *folio) { struct dev_pagemap *pgmap = folio->pgmap; @@ -454,6 +508,7 @@ void free_zone_device_folio(struct folio *folio) case MEMORY_DEVICE_COHERENT: if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; + free_zone_device_folio_prepare(folio); pgmap->ops->folio_free(folio, order); percpu_ref_put_many(&folio->pgmap->ref, nr); break; -- 2.43.0 From: Matthew Brost Use free_zone_device_folio_prepare() to restore fsdax ZONE_DEVICE folios to a sane initial state upon the final put. Cc: Dan Williams Cc: Matthew Wilcox Cc: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: linux-mm@kvack.org Cc: linux-fsdevel@vger.kernel.org Cc: nvdimm@lists.linux.dev Cc: linux-kernel@vger.kernel.org Suggested-by: Alistair Popple Signed-off-by: Matthew Brost Signed-off-by: Francois Dugast --- fs/dax.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 289e6254aa30..d998f7615abb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -391,29 +391,7 @@ static inline unsigned long dax_folio_put(struct folio *folio) if (ref) return ref; - folio->mapping = NULL; - order = folio_order(folio); - if (!order) - return 0; - folio_reset_order(folio); - - for (i = 0; i < (1UL << order); i++) { - struct dev_pagemap *pgmap = page_pgmap(&folio->page); - struct page *page = folio_page(folio, i); - struct folio *new_folio = (struct folio *)page; - - ClearPageHead(page); - clear_compound_head(page); - - new_folio->mapping = NULL; - /* - * Reset pgmap which was over-written by - * prep_compound_page(). - */ - new_folio->pgmap = pgmap; - new_folio->share = 0; - WARN_ON_ONCE(folio_ref_count(new_folio)); - } + free_zone_device_folio_prepare(folio); return ref; } -- 2.43.0 If the page is part of a folio, unlock and put the whole folio at once instead of individual pages one after the other. This will reduce the amount of operations once device THP are in use. Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: linux-mm@kvack.org Suggested-by: Matthew Brost Reviewed-by: Matthew Brost Signed-off-by: Francois Dugast --- drivers/gpu/drm/drm_pagemap.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index df253b13cf85..bd9a4703fbce 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -154,15 +154,15 @@ static void drm_pagemap_zdd_put(struct drm_pagemap_zdd *zdd) } /** - * drm_pagemap_migration_unlock_put_page() - Put a migration page - * @page: Pointer to the page to put + * drm_pagemap_migration_unlock_put_folio() - Put a migration folio + * @folio: Pointer to the folio to put * - * This function unlocks and puts a page. + * This function unlocks and puts a folio. 
*/ -static void drm_pagemap_migration_unlock_put_page(struct page *page) +static void drm_pagemap_migration_unlock_put_folio(struct folio *folio) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } /** @@ -177,15 +177,23 @@ static void drm_pagemap_migration_unlock_put_pages(unsigned long npages, { unsigned long i; - for (i = 0; i < npages; ++i) { + for (i = 0; i < npages;) { struct page *page; + struct folio *folio; + unsigned int order = 0; if (!migrate_pfn[i]) - continue; + goto next; page = migrate_pfn_to_page(migrate_pfn[i]); - drm_pagemap_migration_unlock_put_page(page); + folio = page_folio(page); + order = folio_order(folio); + + drm_pagemap_migration_unlock_put_folio(folio); migrate_pfn[i] = 0; + +next: + i += NR_PAGES(order); } } -- 2.43.0 This new helper helps ensure all accesses to zone_device_data use the correct API whether the page is part of a folio or not. v2: - Move to drm_pagemap.h, stick to folio_zone_device_data (Matthew Brost) - Return struct drm_pagemap_zdd * (Matthew Brost) Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: linux-mm@kvack.org Suggested-by: Matthew Brost Reviewed-by: Matthew Brost Signed-off-by: Francois Dugast --- drivers/gpu/drm/drm_gpusvm.c | 7 +++++-- drivers/gpu/drm/drm_pagemap.c | 21 ++++++++++++--------- include/drm/drm_pagemap.h | 15 +++++++++++++++ 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c index aa9a0b60e727..585d913d3d19 100644 --- a/drivers/gpu/drm/drm_gpusvm.c +++ b/drivers/gpu/drm/drm_gpusvm.c @@ -1488,12 +1488,15 @@ int drm_gpusvm_get_pages(struct drm_gpusvm *gpusvm, order = drm_gpusvm_hmm_pfn_to_order(pfns[i], i, npages); if (is_device_private_page(page) || is_device_coherent_page(page)) { + struct drm_pagemap_zdd *__zdd = + drm_pagemap_page_zone_device_data(page); + if (!ctx->allow_mixed && - zdd != page->zone_device_data && i > 0) { + zdd != __zdd && i > 0) { err = -EOPNOTSUPP; goto err_unmap; } - zdd = page->zone_device_data; + zdd = __zdd; if (pagemap != page_pgmap(page)) { if (i > 0) { err = -EOPNOTSUPP; diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index bd9a4703fbce..308c14291eba 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -252,7 +252,7 @@ static int drm_pagemap_migrate_map_pages(struct device *dev, order = folio_order(folio); if (is_device_private_page(page)) { - struct drm_pagemap_zdd *zdd = page->zone_device_data; + struct drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(page); struct drm_pagemap *dpagemap = zdd->dpagemap; struct drm_pagemap_addr addr; @@ -323,7 +323,7 @@ static void drm_pagemap_migrate_unmap_pages(struct device *dev, goto next; if (is_zone_device_page(page)) { - struct drm_pagemap_zdd *zdd = page->zone_device_data; + struct drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(page); struct drm_pagemap *dpagemap = zdd->dpagemap; dpagemap->ops->device_unmap(dpagemap, dev, pagemap_addr[i]); @@ -611,7 +611,8 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, pages[i] = NULL; if (src_page && is_device_private_page(src_page)) { - struct drm_pagemap_zdd *src_zdd = src_page->zone_device_data; + struct drm_pagemap_zdd *src_zdd = + drm_pagemap_page_zone_device_data(src_page); if (page_pgmap(src_page) == pagemap && 
!mdetails->can_migrate_same_pagemap) { @@ -733,8 +734,8 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct vm_area_struct *vas, goto next; if (fault_page) { - if (src_page->zone_device_data != - fault_page->zone_device_data) + if (drm_pagemap_page_zone_device_data(src_page) != + drm_pagemap_page_zone_device_data(fault_page)) goto next; } @@ -1075,7 +1076,7 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas, void *buf; int i, err = 0; - zdd = page->zone_device_data; + zdd = drm_pagemap_page_zone_device_data(page); if (time_before64(get_jiffies_64(), zdd->devmem_allocation->timeslice_expiration)) return 0; @@ -1159,7 +1160,9 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas, */ static void drm_pagemap_folio_free(struct folio *folio, unsigned int order) { - drm_pagemap_zdd_put(folio->page.zone_device_data); + struct page *page = folio_page(folio, 0); + + drm_pagemap_zdd_put(drm_pagemap_page_zone_device_data(page)); } /** @@ -1175,7 +1178,7 @@ static void drm_pagemap_folio_free(struct folio *folio, unsigned int order) */ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf) { - struct drm_pagemap_zdd *zdd = vmf->page->zone_device_data; + struct drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(vmf->page); int err; err = __drm_pagemap_migrate_to_ram(vmf->vma, @@ -1241,7 +1244,7 @@ EXPORT_SYMBOL_GPL(drm_pagemap_devmem_init); */ struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page) { - struct drm_pagemap_zdd *zdd = page->zone_device_data; + struct drm_pagemap_zdd *zdd = drm_pagemap_page_zone_device_data(page); return zdd->devmem_allocation->dpagemap; } diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index 46e9c58f09e0..736fb6cb7b33 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -4,6 +4,7 @@ #include #include +#include #include #define NR_PAGES(order) (1U << (order)) @@ -359,4 +360,18 @@ int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap, void drm_pagemap_destroy(struct drm_pagemap *dpagemap, bool is_atomic_or_reclaim); int drm_pagemap_reinit(struct drm_pagemap *dpagemap); + +/** + * drm_pagemap_page_zone_device_data() - Page to zone_device_data + * @page: Pointer to the page + * + * Return: Page's zone_device_data + */ +static inline struct drm_pagemap_zdd *drm_pagemap_page_zone_device_data(struct page *page) +{ + struct folio *folio = page_folio(page); + + return folio_zone_device_data(folio); +} + #endif -- 2.43.0 From: Matthew Brost cpages returned from migrate_vma_setup represents the total number of individual pages found, not the number of 4K pages. The math in drm_pagemap_migrate_to_devmem for npages is based on the number of 4K pages, so cpages != npages can fail even if the entire memory range is found in migrate_vma_setup (e.g., when a single 2M page is found). Add drm_pagemap_cpages, which converts cpages to the number of 4K pages found. Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. 
Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: linux-mm@kvack.org Signed-off-by: Matthew Brost Signed-off-by: Francois Dugast --- drivers/gpu/drm/drm_pagemap.c | 38 ++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 308c14291eba..af2c8f4da00e 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -452,6 +452,41 @@ static int drm_pagemap_migrate_range(struct drm_pagemap_devmem *devmem, return ret; } +/** + * drm_pagemap_cpages() - Count collected pages + * @migrate_pfn: Array of migrate_pfn entries to account + * @npages: Number of entries in @migrate_pfn + * + * Compute the total number of minimum-sized pages represented by the + * collected entries in @migrate_pfn. The total is derived from the + * order encoded in each entry. + * + * Return: Total number of minimum-sized pages. + */ +static int drm_pagemap_cpages(unsigned long *migrate_pfn, unsigned long npages) +{ + unsigned long i, cpages = 0; + + for (i = 0; i < npages;) { + struct page *page = migrate_pfn_to_page(migrate_pfn[i]); + struct folio *folio; + unsigned int order = 0; + + if (page) { + folio = page_folio(page); + order = folio_order(folio); + cpages += NR_PAGES(order); + } else if (migrate_pfn[i] & MIGRATE_PFN_COMPOUND) { + order = HPAGE_PMD_ORDER; + cpages += NR_PAGES(order); + } + + i += NR_PAGES(order); + } + + return cpages; +} + /** * drm_pagemap_migrate_to_devmem() - Migrate a struct mm_struct range to device memory * @devmem_allocation: The device memory allocation to migrate to. @@ -564,7 +599,8 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, goto err_free; } - if (migrate.cpages != npages) { + if (migrate.cpages != npages && + drm_pagemap_cpages(migrate.src, npages) != npages) { /* * Some pages to migrate. But we want to migrate all or * nothing. Raced or unknown device pages. -- 2.43.0 This enables support for Transparent Huge Pages (THP) for device pages by using MIGRATE_VMA_SELECT_COMPOUND during migration. It removes the need to split folios and loop multiple times over all pages to perform required operations at page level. Instead, we rely on newly introduced support for higher orders in drm_pagemap and folio-level API. In Xe, this drastically improves performance when using SVM. The GT stats below collected after a 2MB page fault show overall servicing is more than 7 times faster, and thanks to reduced CPU overhead the time spent on the actual copy goes from 23% without THP to 80% with THP: Without THP: svm_2M_pagefault_us: 966 svm_2M_migrate_us: 942 svm_2M_device_copy_us: 223 svm_2M_get_pages_us: 9 svm_2M_bind_us: 10 With THP: svm_2M_pagefault_us: 132 svm_2M_migrate_us: 128 svm_2M_device_copy_us: 106 svm_2M_get_pages_us: 1 svm_2M_bind_us: 2 v2: - Fix one occurrence of drm_pagemap_get_devmem_page() (Matthew Brost) v3: - Remove migrate_device_split_page() and folio_split_lock, instead rely on free_zone_device_folio() to split folios before freeing (Matthew Brost) - Assert folio order is HPAGE_PMD_ORDER (Matthew Brost) - Always use folio_set_zone_device_data() in split (Matthew Brost) v4: - Warn on compound device page, s/continue/goto next/ (Matthew Brost) Cc: Matthew Brost Cc: Thomas Hellström Cc: Michal Mrozek Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. 
Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: linux-mm@kvack.org Signed-off-by: Francois Dugast --- drivers/gpu/drm/drm_pagemap.c | 77 ++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index af2c8f4da00e..bd2c9af51564 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -200,16 +200,20 @@ static void drm_pagemap_migration_unlock_put_pages(unsigned long npages, /** * drm_pagemap_get_devmem_page() - Get a reference to a device memory page * @page: Pointer to the page + * @order: Order * @zdd: Pointer to the GPU SVM zone device data * * This function associates the given page with the specified GPU SVM zone * device data and initializes it for zone device usage. */ static void drm_pagemap_get_devmem_page(struct page *page, + unsigned int order, struct drm_pagemap_zdd *zdd) { - page->zone_device_data = drm_pagemap_zdd_get(zdd); - zone_device_page_init(page, 0); + struct folio *folio = page_folio(page); + + folio_set_zone_device_data(folio, drm_pagemap_zdd_get(zdd)); + zone_device_page_init(page, order); } /** @@ -534,7 +538,8 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, * rare and only occur when the madvise attributes of memory are * changed or atomics are being used. */ - .flags = MIGRATE_VMA_SELECT_SYSTEM | MIGRATE_VMA_SELECT_DEVICE_COHERENT, + .flags = MIGRATE_VMA_SELECT_SYSTEM | MIGRATE_VMA_SELECT_DEVICE_COHERENT | + MIGRATE_VMA_SELECT_COMPOUND, }; unsigned long i, npages = npages_in_range(start, end); unsigned long own_pages = 0, migrated_pages = 0; @@ -640,11 +645,16 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, own_pages = 0; - for (i = 0; i < npages; ++i) { + for (i = 0; i < npages;) { + unsigned long j; struct page *page = pfn_to_page(migrate.dst[i]); struct page *src_page = migrate_pfn_to_page(migrate.src[i]); - cur.start = i; + unsigned int order = 0; + + drm_WARN_ONCE(dpagemap->drm, folio_order(page_folio(page)), + "Unexpected compound device page found\n"); + cur.start = i; pages[i] = NULL; if (src_page && is_device_private_page(src_page)) { struct drm_pagemap_zdd *src_zdd = @@ -654,7 +664,7 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, !mdetails->can_migrate_same_pagemap) { migrate.dst[i] = 0; own_pages++; - continue; + goto next; } if (mdetails->source_peer_migrates) { cur.dpagemap = src_zdd->dpagemap; @@ -670,7 +680,20 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, pages[i] = page; } migrate.dst[i] = migrate_pfn(migrate.dst[i]); - drm_pagemap_get_devmem_page(page, zdd); + + if (migrate.src[i] & MIGRATE_PFN_COMPOUND) { + drm_WARN_ONCE(dpagemap->drm, src_page && + folio_order(page_folio(src_page)) != HPAGE_PMD_ORDER, + "Unexpected folio order\n"); + + order = HPAGE_PMD_ORDER; + migrate.dst[i] |= MIGRATE_PFN_COMPOUND; + + for (j = 1; j < NR_PAGES(order) && i + j < npages; j++) + migrate.dst[i + j] = 0; + } + + drm_pagemap_get_devmem_page(page, order, zdd); /* If we switched the migrating drm_pagemap, migrate previous pages now */ err = drm_pagemap_migrate_range(devmem_allocation, migrate.src, migrate.dst, @@ -680,7 +703,11 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, npages = i + 1; goto err_finalize; } + +next: + i += NR_PAGES(order); } + cur.start = npages; cur.ops = 
NULL; /* Force migration */ err = drm_pagemap_migrate_range(devmem_allocation, migrate.src, migrate.dst, @@ -789,6 +816,8 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct vm_area_struct *vas, page = folio_page(folio, 0); mpfn[i] = migrate_pfn(page_to_pfn(page)); + if (order) + mpfn[i] |= MIGRATE_PFN_COMPOUND; next: if (page) addr += page_size(page); @@ -1044,8 +1073,15 @@ int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation) if (err) goto err_finalize; - for (i = 0; i < npages; ++i) + for (i = 0; i < npages;) { + unsigned int order = 0; + pages[i] = migrate_pfn_to_page(src[i]); + if (pages[i]) + order = folio_order(page_folio(pages[i])); + + i += NR_PAGES(order); + } err = ops->copy_to_ram(pages, pagemap_addr, npages, NULL); if (err) @@ -1098,7 +1134,8 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas, .vma = vas, .pgmap_owner = page_pgmap(page)->owner, .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | - MIGRATE_VMA_SELECT_DEVICE_COHERENT, + MIGRATE_VMA_SELECT_DEVICE_COHERENT | + MIGRATE_VMA_SELECT_COMPOUND, .fault_page = page, }; struct drm_pagemap_migrate_details mdetails = {}; @@ -1164,8 +1201,15 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas, if (err) goto err_finalize; - for (i = 0; i < npages; ++i) + for (i = 0; i < npages;) { + unsigned int order = 0; + pages[i] = migrate_pfn_to_page(migrate.src[i]); + if (pages[i]) + order = folio_order(page_folio(pages[i])); + + i += NR_PAGES(order); + } err = ops->copy_to_ram(pages, pagemap_addr, npages, NULL); if (err) @@ -1224,9 +1268,22 @@ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf) return err ? VM_FAULT_SIGBUS : 0; } +static void drm_pagemap_folio_split(struct folio *orig_folio, struct folio *new_folio) +{ + struct drm_pagemap_zdd *zdd; + + if (!new_folio) + return; + + new_folio->pgmap = orig_folio->pgmap; + zdd = folio_zone_device_data(orig_folio); + folio_set_zone_device_data(new_folio, drm_pagemap_zdd_get(zdd)); +} + static const struct dev_pagemap_ops drm_pagemap_pagemap_ops = { .folio_free = drm_pagemap_folio_free, .migrate_to_ram = drm_pagemap_migrate_to_ram, + .folio_split = drm_pagemap_folio_split, }; /** -- 2.43.0
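
For reference, a minimal sketch of what a driver-side folio_free() looks like under the new two-argument signature introduced by the first patch. This is not part of the series; struct my_devmem, its fields, and the my_devmem_* names are invented for illustration only. The point it shows: core MM has already split the folio back into initialized order-0 ZONE_DEVICE pages before the callback runs, so the driver only needs @order to know how many pages are being handed back to its allocator.

/* Hypothetical example, not from this series. */
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/spinlock.h>

struct my_devmem {			/* invented for this sketch */
	spinlock_t lock;
	unsigned long nr_free_pages;
};

static void my_devmem_folio_free(struct folio *folio, unsigned int order)
{
	struct my_devmem *devmem = folio->page.zone_device_data;

	spin_lock(&devmem->lock);
	/* All 1 << order constituent pages return to the driver allocator. */
	devmem->nr_free_pages += 1UL << order;
	spin_unlock(&devmem->lock);
}

static const struct dev_pagemap_ops my_devmem_pagemap_ops = {
	.folio_free = my_devmem_folio_free,
	/* .migrate_to_ram is also required for MEMORY_DEVICE_PRIVATE */
};

Because the split has already happened by the time folio_free() is called, the callback does not need to walk tail pages or reset compound state itself; it only sizes its bookkeeping by the pre-split order, as the converted drivers (nouveau, amdkfd, test_hmm, etc.) do in the first patch.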