To create a migrate entry from a given struct page, that page is first converted to its pfn, before passing the pfn to migrate_pfn(). A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have a pfn and must be handled separately. Prepare for this with a new helper: - migrate_pfn_from_page() This helper takes a struct page as parameter instead of a pfn. This will allow more flexibility for handling the mpfn differently for device private pages. Reviewed-by: Balbir Singh Acked-by: Felix Kuehling Signed-off-by: Jordan Niethe --- v2: New to series v3: No change --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/drm_pagemap.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 4 ++-- include/linux/migrate.h | 5 +++++ lib/test_hmm.c | 11 ++++++----- mm/migrate_device.c | 7 +++---- 7 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e5000bef90f2..67910900af7b 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -784,7 +784,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, } } - *mig.dst = migrate_pfn(page_to_pfn(dpage)); + *mig.dst = migrate_pfn_from_page(dpage); migrate_vma_pages(&mig); out_finalize: migrate_vma_finalize(&mig); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 7a8990b30fa0..1f03cf7342a5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -652,7 +652,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n", dst[i] >> PAGE_SHIFT, page_to_pfn(dpage)); - migrate->dst[i] = migrate_pfn(page_to_pfn(dpage)); + migrate->dst[i] = migrate_pfn_from_page(dpage); j++; } diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 03ee39a761a4..526105aa4b05 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -742,7 +742,7 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct vm_area_struct *vas, goto free_pages; page = folio_page(folio, 0); - mpfn[i] = migrate_pfn(page_to_pfn(page)); + mpfn[i] = migrate_pfn_from_page(page); next: if (page) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 58071652679d..a7edcdca9701 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -249,7 +249,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) goto done; } - args.dst[0] = migrate_pfn(page_to_pfn(dpage)); + args.dst[0] = migrate_pfn_from_page(dpage); if (order) args.dst[0] |= MIGRATE_PFN_COMPOUND; dfolio = page_folio(dpage); @@ -766,7 +766,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT); if (src & MIGRATE_PFN_WRITE) *pfn |= NVIF_VMM_PFNMAP_V0_W; - mpfn = migrate_pfn(page_to_pfn(dpage)); + mpfn = migrate_pfn_from_page(dpage); if (folio_order(page_folio(dpage))) mpfn |= MIGRATE_PFN_COMPOUND; return mpfn; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d269ec1400be 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -140,6 +140,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) return (pfn << MIGRATE_PFN_SHIFT) | 
MIGRATE_PFN_VALID; } +static inline unsigned long migrate_pfn_from_page(struct page *page) +{ + return migrate_pfn(page_to_pfn(page)); +} + enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 8af169d3873a..7e5248404d00 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -727,7 +727,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, rpage = BACKING_PAGE(dpage); rpage->zone_device_data = dmirror; - *dst = migrate_pfn(page_to_pfn(dpage)) | write; + *dst = migrate_pfn_from_page(dpage) | + write; src_page = pfn_to_page(spfn + i); if (spage) @@ -754,7 +755,7 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", page_to_pfn(spage), page_to_pfn(dpage)); - *dst = migrate_pfn(page_to_pfn(dpage)) | write; + *dst = migrate_pfn_from_page(dpage) | write; if (is_large) { int i; @@ -989,7 +990,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, if (dpage) { lock_page(dpage); - *dst |= migrate_pfn(page_to_pfn(dpage)); + *dst |= migrate_pfn_from_page(dpage); } for (i = 0; i < (1 << order); i++) { @@ -1000,7 +1001,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, if (!dpage && order) { dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); lock_page(dpage); - dst[i] = migrate_pfn(page_to_pfn(dpage)); + dst[i] = migrate_pfn_from_page(dpage); dst_page = pfn_to_page(page_to_pfn(dpage)); dpage = NULL; /* For the next iteration */ } else { @@ -1412,7 +1413,7 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) /* TODO Support splitting here */ lock_page(dpage); - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); + dst_pfns[i] = migrate_pfn_from_page(dpage); if (src_pfns[i] & MIGRATE_PFN_WRITE) dst_pfns[i] |= MIGRATE_PFN_WRITE; if (order) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..1a2067f830da 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -207,9 +207,8 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, .vma = walk->vma, }; - unsigned long pfn = page_to_pfn(folio_page(folio, 0)); - - migrate->src[migrate->npages] = migrate_pfn(pfn) | write + migrate->src[migrate->npages] = migrate_pfn_from_page(folio_page(folio, 0)) + | write | MIGRATE_PFN_MIGRATE | MIGRATE_PFN_COMPOUND; migrate->dst[migrate->npages++] = 0; @@ -328,7 +327,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, goto again; } - mpfn = migrate_pfn(page_to_pfn(page)) | + mpfn = migrate_pfn_from_page(page) | MIGRATE_PFN_MIGRATE; if (softleaf_is_device_private_write(entry)) mpfn |= MIGRATE_PFN_WRITE; -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have a pfn. A MIGRATE_PFN flag will be introduced that distinguishes between mpfns that contain a pfn vs an offset into device private memory. Replace usages of pfns and page_to_pfn() with mpfns and migrate_pfn_to_page() to prepare for handling this distinction. This will assist in continuing to use the same code paths for both MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_COHERENT devices. 
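As an illustration of the intended pattern, an mpfn is encoded once at the source and only decoded where a struct page is actually needed. The wrappers below are a hypothetical sketch using the <linux/migrate.h> helpers, not code from this series:

static unsigned long example_page_to_mpfn(struct page *page)
{
        /* Encode once; the MIGRATE_PFN_* flag bits travel with the value. */
        return migrate_pfn_from_page(page);
}

static struct page *example_mpfn_to_page(unsigned long mpfn)
{
        /* Decode only where a struct page is actually required. */
        return migrate_pfn_to_page(mpfn);
}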
Acked-by: Balbir Singh Reviewed-by: Felix Kuehling Signed-off-by: Jordan Niethe --- v2: - New to series v3: - No change v4: - No change --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 15 +++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 1f03cf7342a5..3dd7a35d19f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -210,17 +210,17 @@ svm_migrate_copy_done(struct amdgpu_device *adev, struct dma_fence *mfence) } unsigned long -svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr) +svm_migrate_addr_to_mpfn(struct amdgpu_device *adev, unsigned long addr) { - return (addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT; + return migrate_pfn((addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT); } static void -svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) +svm_migrate_get_vram_page(struct svm_range *prange, unsigned long mpfn) { struct page *page; - page = pfn_to_page(pfn); + page = migrate_pfn_to_page(mpfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; zone_device_page_init(page, 0); @@ -231,7 +231,7 @@ svm_migrate_put_vram_page(struct amdgpu_device *adev, unsigned long addr) { struct page *page; - page = pfn_to_page(svm_migrate_addr_to_pfn(adev, addr)); + page = migrate_pfn_to_page(svm_migrate_addr_to_mpfn(adev, addr)); unlock_page(page); put_page(page); } @@ -241,7 +241,7 @@ svm_migrate_addr(struct amdgpu_device *adev, struct page *page) { unsigned long addr; - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = (migrate_pfn_from_page(page) >> MIGRATE_PFN_SHIFT) << PAGE_SHIFT; return (addr - adev->kfd.pgmap.range.start); } @@ -307,9 +307,8 @@ svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, if (migrate->src[i] & MIGRATE_PFN_MIGRATE) { dst[i] = cursor.start + (j << PAGE_SHIFT); - migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); + migrate->dst[i] = svm_migrate_addr_to_mpfn(adev, dst[i]); svm_migrate_get_vram_page(prange, migrate->dst[i]); - migrate->dst[i] = migrate_pfn(migrate->dst[i]); mpages++; } spage = migrate_pfn_to_page(migrate->src[i]); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index 2b7fd442d29c..a80b72abe1e0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -48,7 +48,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, uint32_t trigger, struct page *fault_page); unsigned long -svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); +svm_migrate_addr_to_mpfn(struct amdgpu_device *adev, unsigned long addr); #endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */ -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have a pfn. This causes an issue for migrate_device_{pfns,range}() which take pfn parameters. Depending on if the device is MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_COHERENT will effect how that parameter should be interpreted. A MIGRATE_PFN flag will be introduced that distinguishes between mpfns that contain a pfn vs an offset into device private memory, we will take advantage of that here. Update migrate_device_{pfns,range}() to take a mpfn instead of pfn. Update the users of migrate_device_{pfns,range}() to pass in an mpfn. 
To support this change, update dpagemap_devmem_ops::populate_devmem_pfn() to instead return mpfns and rename accordingly. Signed-off-by: Jordan Niethe --- v2: New to series v3: No change --- drivers/gpu/drm/drm_pagemap.c | 9 +++--- drivers/gpu/drm/nouveau/nouveau_dmem.c | 5 +-- drivers/gpu/drm/xe/xe_svm.c | 9 +++--- include/drm/drm_pagemap.h | 8 ++--- lib/test_hmm.c | 2 +- mm/migrate_device.c | 45 ++++++++++++++------------ 6 files changed, 41 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 526105aa4b05..13072c8665b9 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -505,7 +505,7 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, mmap_assert_locked(mm); - if (!ops->populate_devmem_pfn || !ops->copy_to_devmem || + if (!ops->populate_devmem_mpfn || !ops->copy_to_devmem || !ops->copy_to_ram) return -EOPNOTSUPP; @@ -590,14 +590,14 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, goto err_aborted_migration; } - err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst); + err = ops->populate_devmem_mpfn(devmem_allocation, npages, migrate.dst); if (err) goto err_aborted_migration; own_pages = 0; for (i = 0; i < npages; ++i) { - struct page *page = pfn_to_page(migrate.dst[i]); + struct page *page = migrate_pfn_to_page(migrate.dst[i]); struct page *src_page = migrate_pfn_to_page(migrate.src[i]); cur.start = i; @@ -624,7 +624,6 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, cur.device = dpagemap->drm->dev; pages[i] = page; } - migrate.dst[i] = migrate_pfn(migrate.dst[i]); drm_pagemap_get_devmem_page(page, zdd); /* If we switched the migrating drm_pagemap, migrate previous pages now */ @@ -979,7 +978,7 @@ int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation) pagemap_addr = buf + (2 * sizeof(*src) * npages); pages = buf + (2 * sizeof(*src) + sizeof(*pagemap_addr)) * npages; - err = ops->populate_devmem_pfn(devmem_allocation, npages, src); + err = ops->populate_devmem_mpfn(devmem_allocation, npages, src); if (err) goto err_free; diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a7edcdca9701..bd3f7102c3f9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -483,8 +483,9 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); dma_info = kvcalloc(npages, sizeof(*dma_info), GFP_KERNEL | __GFP_NOFAIL); - migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT, - npages); + migrate_device_range(src_pfns, + migrate_pfn(chunk->pagemap.range.start >> PAGE_SHIFT), + npages); for (i = 0; i < npages; i++) { if (src_pfns[i] & MIGRATE_PFN_MIGRATE) { diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 213f0334518a..fbf5fd284616 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -10,6 +10,7 @@ #include #include +#include #include "xe_bo.h" #include "xe_exec_queue_types.h" #include "xe_gt_stats.h" @@ -752,8 +753,8 @@ static struct drm_buddy *vram_to_buddy(struct xe_vram_region *vram) return &vram->ttm.mm; } -static int xe_svm_populate_devmem_pfn(struct drm_pagemap_devmem *devmem_allocation, - unsigned long npages, unsigned long *pfn) +static int xe_svm_populate_devmem_mpfn(struct drm_pagemap_devmem *devmem_allocation, + unsigned long npages, unsigned 
long *pfn) { struct xe_bo *bo = to_xe_bo(devmem_allocation); struct ttm_resource *res = bo->ttm.resource; @@ -769,7 +770,7 @@ static int xe_svm_populate_devmem_pfn(struct drm_pagemap_devmem *devmem_allocati int i; for (i = 0; i < drm_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i) - pfn[j++] = block_pfn + i; + pfn[j++] = migrate_pfn(block_pfn + i); } return 0; @@ -777,7 +778,7 @@ static int xe_svm_populate_devmem_pfn(struct drm_pagemap_devmem *devmem_allocati static const struct drm_pagemap_devmem_ops dpagemap_devmem_ops = { .devmem_release = xe_svm_devmem_release, - .populate_devmem_pfn = xe_svm_populate_devmem_pfn, + .populate_devmem_mpfn = xe_svm_populate_devmem_mpfn, .copy_to_devmem = xe_svm_copy_to_devmem, .copy_to_ram = xe_svm_copy_to_ram, }; diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index 2baf0861f78f..bffc7fd5bef3 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -192,17 +192,17 @@ struct drm_pagemap_devmem_ops { void (*devmem_release)(struct drm_pagemap_devmem *devmem_allocation); /** - * @populate_devmem_pfn: Populate device memory PFN (required for migration) + * @populate_devmem_mpfn: Populate device memory PFN (required for migration) * @devmem_allocation: device memory allocation * @npages: Number of pages to populate - * @pfn: Array of page frame numbers to populate + * @mpfn: Array of migrate page frame numbers to populate * * Populate device memory page frame numbers (PFN). * * Return: 0 on success, a negative error code on failure. */ - int (*populate_devmem_pfn)(struct drm_pagemap_devmem *devmem_allocation, - unsigned long npages, unsigned long *pfn); + int (*populate_devmem_mpfn)(struct drm_pagemap_devmem *devmem_allocation, + unsigned long npages, unsigned long *pfn); /** * @copy_to_devmem: Copy to device memory (required for migration) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 7e5248404d00..a6ff292596f3 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1389,7 +1389,7 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); - migrate_device_range(src_pfns, start_pfn, npages); + migrate_device_range(src_pfns, migrate_pfn(start_pfn), npages); for (i = 0; i < npages; i++) { struct page *dpage, *spage; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 1a2067f830da..a2baaa2a81f9 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -1354,11 +1354,11 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); -static unsigned long migrate_device_pfn_lock(unsigned long pfn) +static unsigned long migrate_device_pfn_lock(unsigned long mpfn) { struct folio *folio; - folio = folio_get_nontail_page(pfn_to_page(pfn)); + folio = folio_get_nontail_page(migrate_pfn_to_page(mpfn)); if (!folio) return 0; @@ -1367,13 +1367,14 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) return 0; } - return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + return mpfn | MIGRATE_PFN_MIGRATE; } /** * migrate_device_range() - migrate device private pfns to normal memory. - * @src_pfns: array large enough to hold migrating source device private pfns. - * @start: starting pfn in the range to migrate. + * @src_mpfns: array large enough to hold migrating source device private + * migrate pfns. + * @start: starting migrate pfn in the range to migrate. * @npages: number of pages to migrate. 
* * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that @@ -1389,28 +1390,29 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) * allocate destination pages and start copying data from the device to CPU * memory before calling migrate_device_pages(). */ -int migrate_device_range(unsigned long *src_pfns, unsigned long start, +int migrate_device_range(unsigned long *src_mpfns, unsigned long start, unsigned long npages) { - unsigned long i, j, pfn; + unsigned long i, j, mpfn; - for (pfn = start, i = 0; i < npages; pfn++, i++) { - struct page *page = pfn_to_page(pfn); + for (mpfn = start, i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(mpfn); struct folio *folio = page_folio(page); unsigned int nr = 1; - src_pfns[i] = migrate_device_pfn_lock(pfn); + src_mpfns[i] = migrate_device_pfn_lock(mpfn); nr = folio_nr_pages(folio); if (nr > 1) { - src_pfns[i] |= MIGRATE_PFN_COMPOUND; + src_mpfns[i] |= MIGRATE_PFN_COMPOUND; for (j = 1; j < nr; j++) - src_pfns[i+j] = 0; + src_mpfns[i+j] = 0; i += j - 1; - pfn += j - 1; + mpfn += (j - 1) << MIGRATE_PFN_SHIFT; } + mpfn += 1 << MIGRATE_PFN_SHIFT; } - migrate_device_unmap(src_pfns, npages, NULL); + migrate_device_unmap(src_mpfns, npages, NULL); return 0; } @@ -1418,32 +1420,33 @@ EXPORT_SYMBOL(migrate_device_range); /** * migrate_device_pfns() - migrate device private pfns to normal memory. - * @src_pfns: pre-popluated array of source device private pfns to migrate. + * @src_mpfns: pre-popluated array of source device private migrate pfns to + * migrate. * @npages: number of pages to migrate. * * Similar to migrate_device_range() but supports non-contiguous pre-popluated * array of device pages to migrate. */ -int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) +int migrate_device_pfns(unsigned long *src_mpfns, unsigned long npages) { unsigned long i, j; for (i = 0; i < npages; i++) { - struct page *page = pfn_to_page(src_pfns[i]); + struct page *page = migrate_pfn_to_page(src_mpfns[i]); struct folio *folio = page_folio(page); unsigned int nr = 1; - src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); + src_mpfns[i] = migrate_device_pfn_lock(src_mpfns[i]); nr = folio_nr_pages(folio); if (nr > 1) { - src_pfns[i] |= MIGRATE_PFN_COMPOUND; + src_mpfns[i] |= MIGRATE_PFN_COMPOUND; for (j = 1; j < nr; j++) - src_pfns[i+j] = 0; + src_mpfns[i+j] = 0; i += j - 1; } } - migrate_device_unmap(src_pfns, npages, NULL); + migrate_device_unmap(src_mpfns, npages, NULL); return 0; } -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have normal PFN and must be handled separately. Prepare for this by adding a MIGRATE_PFN_DEVICE_PRIVATE flag to indicate that a migrate pfn contains a PFN for a device private page. 
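As a rough sketch of the driver-side pattern (the wrapper below is hypothetical and only mirrors the amdkfd and nouveau changes in this patch), an mpfn built from a raw device address is tagged explicitly, while an mpfn built from a struct page picks the flag up via migrate_pfn_from_page():

static unsigned long example_devmem_addr_to_mpfn(struct dev_pagemap *pgmap,
                                                 unsigned long addr)
{
        unsigned long mpfn = migrate_pfn((addr + pgmap->range.start) >> PAGE_SHIFT);

        /* Tell later code that the encoded value is not a normal pfn. */
        if (pgmap->type == MEMORY_DEVICE_PRIVATE)
                mpfn |= MIGRATE_PFN_DEVICE_PRIVATE;

        return mpfn;
}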
Acked-by: Felix Kuehling Signed-off-by: Jordan Niethe Signed-off-by: Alistair Popple --- v1: - Update for HMM huge page support - Update existing drivers to use MIGRATE_PFN_DEVICE v2: - Include changes to migrate_pfn_from_page() - Rename to MIGRATE_PFN_DEVICE_PRIVATE - drm/amd: Check adev->gmc.xgmi.connected_to_cpu - lib/test_hmm.c: Check chunk->pagemap.type == MEMORY_DEVICE_PRIVATE v3: - Use adev->kfd.pgmap.type == MEMORY_DEVICE_PRIVATE v4: - No change --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 7 ++++++- drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- drivers/gpu/drm/xe/xe_svm.c | 2 +- include/linux/migrate.h | 14 +++++++++----- lib/test_hmm.c | 6 +++++- 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 3dd7a35d19f7..5478e41877e5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -212,7 +212,12 @@ svm_migrate_copy_done(struct amdgpu_device *adev, struct dma_fence *mfence) unsigned long svm_migrate_addr_to_mpfn(struct amdgpu_device *adev, unsigned long addr) { - return migrate_pfn((addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT); + unsigned long flags = 0; + + if (adev->kfd.pgmap.type == MEMORY_DEVICE_PRIVATE) + flags |= MIGRATE_PFN_DEVICE_PRIVATE; + return migrate_pfn((addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT) | + flags; } static void diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index bd3f7102c3f9..adfa3df5cbc5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -484,7 +484,8 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk) dma_info = kvcalloc(npages, sizeof(*dma_info), GFP_KERNEL | __GFP_NOFAIL); migrate_device_range(src_pfns, - migrate_pfn(chunk->pagemap.range.start >> PAGE_SHIFT), + migrate_pfn(chunk->pagemap.range.start >> PAGE_SHIFT) | + MIGRATE_PFN_DEVICE_PRIVATE, npages); for (i = 0; i < npages; i++) { diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index fbf5fd284616..a8aad9e0b1fb 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -770,7 +770,7 @@ static int xe_svm_populate_devmem_mpfn(struct drm_pagemap_devmem *devmem_allocat int i; for (i = 0; i < drm_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i) - pfn[j++] = migrate_pfn(block_pfn + i); + pfn[j++] = migrate_pfn(block_pfn + i) | MIGRATE_PFN_DEVICE_PRIVATE; } return 0; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index d269ec1400be..5fd2ee080bc0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -122,11 +122,12 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node) * have enough bits to store all physical address and flags. So far we have * enough room for all our flags. 
*/ -#define MIGRATE_PFN_VALID (1UL << 0) -#define MIGRATE_PFN_MIGRATE (1UL << 1) -#define MIGRATE_PFN_WRITE (1UL << 3) -#define MIGRATE_PFN_COMPOUND (1UL << 4) -#define MIGRATE_PFN_SHIFT 6 +#define MIGRATE_PFN_VALID (1UL << 0) +#define MIGRATE_PFN_MIGRATE (1UL << 1) +#define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) +#define MIGRATE_PFN_DEVICE_PRIVATE (1UL << 5) +#define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) { @@ -142,6 +143,9 @@ static inline unsigned long migrate_pfn(unsigned long pfn) static inline unsigned long migrate_pfn_from_page(struct page *page) { + if (is_device_private_page(page)) + return migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_DEVICE_PRIVATE; return migrate_pfn(page_to_pfn(page)); } diff --git a/lib/test_hmm.c b/lib/test_hmm.c index a6ff292596f3..872d3846af7b 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1385,11 +1385,15 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) unsigned long *src_pfns; unsigned long *dst_pfns; unsigned int order = 0; + unsigned long flags = 0; src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); - migrate_device_range(src_pfns, migrate_pfn(start_pfn), npages); + if (chunk->pagemap.type == MEMORY_DEVICE_PRIVATE) + flags |= MIGRATE_PFN_DEVICE_PRIVATE; + + migrate_device_range(src_pfns, migrate_pfn(start_pfn) | flags, npages); for (i = 0; i < npages; i++) { struct page *dpage, *spage; -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have normal PFN and must be handled separately. Prepare for this by adding a PVMW_DEVICE_PRIVATE flag to page_vma_mapped_walk::flags. This indicates that page_vma_mapped_walk::pfn contains a device private offset rather than a normal pfn. Once the device private pages are removed from the physical address space this flag will be used to ensure a device private offset is returned. 
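Callers of DEFINE_FOLIO_VMA_WALK() do not need to set the new flag themselves; it is derived from the folio. A minimal hypothetical sketch of a caller, modelled on page_mapped_in_vma():

static bool example_folio_mapped_at(struct folio *folio,
                                    struct vm_area_struct *vma,
                                    unsigned long addr)
{
        /* page_vma_walk_flags() adds PVMW_DEVICE_PRIVATE for device private folios. */
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);

        if (!page_vma_mapped_walk(&pvmw))
                return false;

        page_vma_mapped_walk_done(&pvmw);
        return true;
}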
Reviewed-by: Zi Yan Signed-off-by: Jordan Niethe Signed-off-by: Alistair Popple --- v1: - Update for HMM huge page support v2: - Move adding device_private param to check_pmd() until final patch v3: - Track device private offset in pvmw::flags instead of pvmw::pfn v4: - No change --- include/linux/rmap.h | 24 ++++++++++++++++++++++-- mm/page_vma_mapped.c | 4 ++-- mm/rmap.c | 4 ++-- mm/vmscan.c | 2 +- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..1b03297f13dc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -921,6 +921,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, #define PVMW_SYNC (1 << 0) /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) +/* pvmw::pfn is a device private offset */ +#define PVMW_DEVICE_PRIVATE (1 << 2) /* Result flags */ @@ -939,14 +941,32 @@ struct page_vma_mapped_walk { unsigned int flags; }; +static inline unsigned long page_vma_walk_flags(const struct folio *folio, + unsigned long flags) +{ + if (folio_is_device_private(folio)) + return flags | PVMW_DEVICE_PRIVATE; + return flags; +} + +static inline unsigned long folio_page_vma_walk_pfn(const struct folio *folio) +{ + return folio_pfn(folio); +} + +static inline struct folio *page_vma_walk_pfn_to_folio(struct page_vma_mapped_walk *pvmw) +{ + return pfn_folio(pvmw->pfn); +} + #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ - .pfn = folio_pfn(_folio), \ + .pfn = folio_page_vma_walk_pfn(_folio), \ .nr_pages = folio_nr_pages(_folio), \ .pgoff = folio_pgoff(_folio), \ .vma = _vma, \ .address = _address, \ - .flags = _flags, \ + .flags = page_vma_walk_flags(_folio, _flags), \ } static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index b38a1d00c971..039a2d71e92f 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -350,10 +350,10 @@ unsigned long page_mapped_in_vma(const struct page *page, { const struct folio *folio = page_folio(page); struct page_vma_mapped_walk pvmw = { - .pfn = page_to_pfn(page), + .pfn = folio_page_vma_walk_pfn(folio), .nr_pages = 1, .vma = vma, - .flags = PVMW_SYNC, + .flags = page_vma_walk_flags(folio, PVMW_SYNC), }; pvmw.address = vma_address(vma, page_pgoff(folio, page), 1); diff --git a/mm/rmap.c b/mm/rmap.c index 7b9879ef442d..7fa976b7fb5a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1871,7 +1871,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) - pvmw.flags = PVMW_SYNC; + pvmw.flags = page_vma_walk_flags(folio, PVMW_SYNC); /* * For THP, we have to assume the worse case ie pmd for invalidation. @@ -2298,7 +2298,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) - pvmw.flags = PVMW_SYNC; + pvmw.flags = page_vma_walk_flags(folio, PVMW_SYNC); /* * For THP, we have to assume the worse case ie pmd for invalidation. 
diff --git a/mm/vmscan.c b/mm/vmscan.c index 614ccf39fe3f..101e1a16d75d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4203,7 +4203,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; - struct folio *folio = pfn_folio(pvmw->pfn); + struct folio *folio = page_vma_walk_pfn_to_folio(pvmw); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -- 2.34.1 To create a new migration entry for a given struct page, that page is first converted to its pfn, before passing the pfn to make_readable_migration_entry() (and friends). A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have a pfn and must be handled separately. Prepare for this with a new set of helpers: - make_readable_migration_entry_from_page() - make_readable_exclusive_migration_entry_from_page() - make_writable_migration_entry_from_page() These helpers take a struct page as parameter instead of a pfn. This will allow more flexibility for handling the swap offset field differently for device private pages. Signed-off-by: Jordan Niethe --- v1: - New to series v2: - Add flags param v3: - No change v4: - s/make_writeable_migration_entry_from_page/make_writable_migration_entry_from_page/ for the !CONFIG_MIGRATION case --- include/linux/leafops.h | 14 ++++++++++++++ include/linux/swapops.h | 34 ++++++++++++++++++++++++++++++++++ mm/huge_memory.c | 29 +++++++++++++++++------------ mm/hugetlb.c | 15 +++++++++------ mm/memory.c | 5 +++-- mm/migrate_device.c | 12 ++++++------ mm/mprotect.c | 10 +++++++--- mm/rmap.c | 12 ++++++------ 8 files changed, 96 insertions(+), 35 deletions(-) diff --git a/include/linux/leafops.h b/include/linux/leafops.h index a9ff94b744f2..52a1af3eb954 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,20 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +/** + * softleaf_to_flags() - Obtain flags encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: The flags associated with the leaf entry. + */ +static inline unsigned long softleaf_to_flags(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return swp_offset(entry) & (SWP_MIG_YOUNG | SWP_MIG_DIRTY); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. 
diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 8cfc966eae48..c1d3c0e8981b 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -173,16 +173,34 @@ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) return swp_entry(SWP_MIGRATION_READ, offset); } +static inline swp_entry_t make_readable_migration_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(SWP_MIGRATION_READ, page_to_pfn(page) | flags); +} + static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); } +static inline swp_entry_t make_readable_exclusive_migration_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, page_to_pfn(page) | flags); +} + static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); } +static inline swp_entry_t make_writable_migration_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(SWP_MIGRATION_WRITE, page_to_pfn(page) | flags); +} + /* * Returns whether the host has large enough swap offset field to support * carrying over pgtable A/D bits for page migrations. The result is @@ -222,11 +240,27 @@ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } +static inline swp_entry_t make_readable_migration_entry_from_page(struct page *page, pgoff_t flags) +{ + return swp_entry(0, 0); +} + +static inline swp_entry_t make_writable_migration_entry_from_page(struct page *page, pgoff_t flags) +{ + return swp_entry(0, 0); +} + static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } +static inline swp_entry_t make_readable_exclusive_migration_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(0, 0); +} + static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40cf59301c21..e3a448cdb34d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1800,7 +1800,8 @@ static void copy_huge_non_present_pmd( if (softleaf_is_migration_write(entry) || softleaf_is_migration_read_exclusive(entry)) { - entry = make_readable_migration_entry(swp_offset(entry)); + entry = make_readable_migration_entry_from_page(softleaf_to_page(entry), + softleaf_to_flags(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); @@ -2524,9 +2525,13 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, * just be safe and disable write */ if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + entry = make_readable_exclusive_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); else - entry = make_readable_migration_entry(swp_offset(entry)); + entry = make_readable_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); @@ -3183,14 +3188,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { if (write) - swp_entry = make_writable_migration_entry( - page_to_pfn(page + i)); + swp_entry = make_writable_migration_entry_from_page( + page + i, 0); else if (anon_exclusive) - swp_entry = 
make_readable_exclusive_migration_entry( - page_to_pfn(page + i)); + swp_entry = make_readable_exclusive_migration_entry_from_page( + page + i, 0); else - swp_entry = make_readable_migration_entry( - page_to_pfn(page + i)); + swp_entry = make_readable_migration_entry_from_page( + page + i, 0); if (young) swp_entry = make_migration_entry_young(swp_entry); if (dirty) @@ -4890,11 +4895,11 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_dirty(pmdval)) folio_mark_dirty(folio); if (pmd_write(pmdval)) - entry = make_writable_migration_entry(page_to_pfn(page)); + entry = make_writable_migration_entry_from_page(page, 0); else if (anon_exclusive) - entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); + entry = make_readable_exclusive_migration_entry_from_page(page, 0); else - entry = make_readable_migration_entry(page_to_pfn(page)); + entry = make_readable_migration_entry_from_page(page, 0); if (pmd_young(pmdval)) entry = make_migration_entry_young(entry); if (pmd_dirty(pmdval)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1832da0f623..a2f9ac8a3177 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4955,8 +4955,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * COW mappings require pages in both * parent and child to be set to read. */ - softleaf = make_readable_migration_entry( - swp_offset(softleaf)); + softleaf = make_readable_migration_entry_from_page( + softleaf_to_page(softleaf), + softleaf_to_flags(softleaf)); entry = swp_entry_to_pte(softleaf); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); @@ -6491,11 +6492,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (softleaf_is_migration_write(entry)) { if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry( - swp_offset(entry)); + entry = make_readable_exclusive_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); else - entry = make_readable_migration_entry( - swp_offset(entry)); + entry = make_readable_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); newpte = swp_entry_to_pte(entry); pages++; } diff --git a/mm/memory.c b/mm/memory.c index da360a6eb8a4..349f360d82b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -963,8 +963,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * to be set to read. A previously exclusive entry is * now shared. 
*/ - entry = make_readable_migration_entry( - swp_offset(entry)); + entry = make_readable_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a2baaa2a81f9..c876526ac6a3 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -432,14 +432,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, /* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) - entry = make_writable_migration_entry( - page_to_pfn(page)); + entry = make_writable_migration_entry_from_page( + page, 0); else if (anon_exclusive) - entry = make_readable_exclusive_migration_entry( - page_to_pfn(page)); + entry = make_readable_exclusive_migration_entry_from_page( + page, 0); else - entry = make_readable_migration_entry( - page_to_pfn(page)); + entry = make_readable_migration_entry_from_page( + page, 0); if (pte_present(pte)) { if (pte_young(pte)) entry = make_migration_entry_young(entry); diff --git a/mm/mprotect.c b/mm/mprotect.c index 283889e4f1ce..adfe1b7a4a19 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -328,10 +328,14 @@ static long change_pte_range(struct mmu_gather *tlb, * just be safe and disable write */ if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry( - swp_offset(entry)); + entry = make_readable_exclusive_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); else - entry = make_readable_migration_entry(swp_offset(entry)); + entry = make_readable_migration_entry_from_page( + softleaf_to_page(entry), + softleaf_to_flags(entry)); + newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); diff --git a/mm/rmap.c b/mm/rmap.c index 7fa976b7fb5a..79cba3d441c3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2536,14 +2536,14 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * pte is removed and then restart fault handling. */ if (writable) - entry = make_writable_migration_entry( - page_to_pfn(subpage)); + entry = make_writable_migration_entry_from_page( + subpage, 0); else if (anon_exclusive) - entry = make_readable_exclusive_migration_entry( - page_to_pfn(subpage)); + entry = make_readable_exclusive_migration_entry_from_page( + subpage, 0); else - entry = make_readable_migration_entry( - page_to_pfn(subpage)); + entry = make_readable_migration_entry_from_page( + subpage, 0); if (likely(pte_present(pteval))) { if (pte_young(pteval)) entry = make_migration_entry_young(entry); -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have pfns and must be handled separately. When migrating a device private page a migration entry is created for that page. This includes the pfn for that page. Once device private pages begin using device memory offsets instead of pfns we will need to be able to determine which kind of value is in the entry so we can associate it with the correct page. Introduce new swap types that correspond to the existing migration entries: - SWP_MIGRATION_DEVICE_READ -> SWP_MIGRATION_READ - SWP_MIGRATION_DEVICE_WRITE -> SWP_MIGRATION_WRITE - SWP_MIGRATION_DEVICE_READ_EXCLUSIVE -> SWP_MIGRATION_READ_EXCLUSIVE The SWP_MIGRATION_DEVICE swap types are meant as specializations of the SWP_MIGRATION types - they are equivalent except the new entries contain device private offsets. 
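For illustration, creating one of the new entries looks exactly like creating an existing migration entry, only the swap type differs. A minimal sketch using the helpers added below (the wrapper function itself is hypothetical):

static swp_entry_t example_device_private_migration_entry(pgoff_t dev_offset,
                                                          bool writable)
{
        /* Same shape as a normal migration entry, just a different swap type. */
        if (writable)
                return make_writable_migration_device_private_entry(dev_offset);

        return make_readable_migration_device_private_entry(dev_offset);
}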
Forgo creating new predicates for these new types in favour of new softleaf predicates that will be introduced in a subsequent patch. Currently the softleaf infrastructure does not have the means for creating new entries so provide swap entry helpers to that end. Actually using these creation helpers is deferred until a later patch when the softleaf predicates have been updated, otherwise the existing checks for migration entries would be broken. Note that SWP_DEVICE_NUM is increasing from 3 to 6. This reduces the maximum number of swap files in the worst case (i.e. CONFIG_DEVICE_PRIVATE, CONFIG_MIGRATION, CONFIG_MEMORY_FAILURE) from 24 to 21. Signed-off-by: Jordan Niethe Signed-off-by: Alistair Popple --- v1: - Update for softleaf infrastructure - Handle make_readable_migration_entry_from_page() and friends - s/make_device_migration_readable_exclusive_migration_entry/make_readable_exclusive_migration_device_private_entry - s/is_device_migration_readable_exclusive_entry/is_readable_exclusive_device_private_migration_entry/ v2: - Add softleaf_is_migration_device_private_read() v3: - Move softleaf changes to new patch - Update commit message to explain the change reduces the number of swap files. - Move creating the device private migration changes to a separate patch - Remove predicates - we'll rely on softleaf predicates entirely --- include/linux/swap.h | 8 +++++++- include/linux/swapops.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 38ca3df68716..c15e3b3067cd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,12 +74,18 @@ static inline int current_is_kswapd(void) * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to a special SWP_DEVICE_EXCLUSIVE entry. + * + * Because device private pages do not use regular PFNs, special migration + * entries are also needed. 
*/ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 3 +#define SWP_DEVICE_NUM 6 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) +#define SWP_MIGRATION_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_MIGRATION_DEVICE_READ_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+4) +#define SWP_MIGRATION_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+5) #else #define SWP_DEVICE_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c1d3c0e8981b..220627cb7fff 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -148,6 +148,21 @@ static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } +static inline swp_entry_t make_readable_migration_device_private_entry(pgoff_t offset) +{ + return swp_entry(SWP_MIGRATION_DEVICE_READ, offset); +} + +static inline swp_entry_t make_writable_migration_device_private_entry(pgoff_t offset) +{ + return swp_entry(SWP_MIGRATION_DEVICE_WRITE, offset); +} + +static inline swp_entry_t make_readable_exclusive_migration_device_private_entry(pgoff_t offset) +{ + return swp_entry(SWP_MIGRATION_DEVICE_READ_EXCLUSIVE, offset); +} + #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -164,6 +179,21 @@ static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) return swp_entry(0, 0); } +static inline swp_entry_t make_readable_migration_device_private_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + +static inline swp_entry_t make_writable_migration_device_private_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + +static inline swp_entry_t make_readable_exclusive_migration_device_private_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -- 2.34.1 Add equivalent softleaf entries for the device private migration swap entries: - SWP_MIGRATION_DEVICE_READ -> SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ - SWP_MIGRATION_DEVICE_WRITE -> SOFTLEAF_MIGRATION_DEVICE_PRIVATE_WRITE - SWP_MIGRATION_DEVICE_READ_EXCLUSIVE -> SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ_EXCLUSIVE As with their SWP_MIGRATION_DEVICE_xxx kin, the SOFTLEAF_MIGRATION_DEVICE_PRIVATE_xxx types are used as specializations of the SWP_MIGRATION_DEVICE_xxx types. The new entry types have the following relationships: - SOFTLEAF_MIGRATION_DEVICE_READ is-a SOFTLEAF_MIGRATION_READ - SOFTLEAF_MIGRATION_READ !is-a SOFTLEAF_MIGRATION_DEVICE_READ Update the existing softleaf_is_migration_xxx() predicates to reflect this relationship. It is possible to distinguish the between a SOFTLEAF_MIGRATION_DEVICE__xxx and a SOFTLEAF_MIGRATION_xxx type using the softleaf_is_migration_device_private_xxx() predicates. In practice, the only reason for introducing this new type is so we know when to not call pfn_to_page() on the swap entry offset once the device private pages are removed from the physical address space. That is the only time that the difference matters. Creating these new entries will occur in a subsequent patch. 
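To illustrate the resulting relationship, generic code can keep using the broad predicate and only consult the specific one when it matters how the offset should be decoded. A hypothetical sketch:

static bool example_offset_is_device_private(softleaf_t entry)
{
        /* The generic predicate matches both flavours of migration entry... */
        if (!softleaf_is_migration(entry))
                return false;

        /* ...the specific one says the offset must not be treated as a pfn. */
        return softleaf_is_migration_device_private(entry);
}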
Signed-off-by: Jordan Niethe --- v3: - Separated from previous patch - s/SOFTLEAF_MIGRATION_DEVICE_/SOFTLEAF_MIGRATION_DEVICE_PRIVATE_/ - Update comment for softleaf_is_migration_read() --- include/linux/leafops.h | 90 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 10 deletions(-) diff --git a/include/linux/leafops.h b/include/linux/leafops.h index 52a1af3eb954..60681ada7b8e 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -28,6 +28,9 @@ enum softleaf_type { SOFTLEAF_DEVICE_PRIVATE_READ, SOFTLEAF_DEVICE_PRIVATE_WRITE, SOFTLEAF_DEVICE_EXCLUSIVE, + SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ, + SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ_EXCLUSIVE, + SOFTLEAF_MIGRATION_DEVICE_PRIVATE_WRITE, /* H/W posion types. */ SOFTLEAF_HWPOISON, /* Marker types. */ @@ -165,6 +168,12 @@ static inline enum softleaf_type softleaf_type(softleaf_t entry) return SOFTLEAF_DEVICE_PRIVATE_READ; case SWP_DEVICE_EXCLUSIVE: return SOFTLEAF_DEVICE_EXCLUSIVE; + case SWP_MIGRATION_DEVICE_READ: + return SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ; + case SWP_MIGRATION_DEVICE_WRITE: + return SOFTLEAF_MIGRATION_DEVICE_PRIVATE_WRITE; + case SWP_MIGRATION_DEVICE_READ_EXCLUSIVE: + return SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ_EXCLUSIVE; #endif #ifdef CONFIG_MEMORY_FAILURE case SWP_HWPOISON: @@ -190,28 +199,88 @@ static inline bool softleaf_is_swap(softleaf_t entry) return softleaf_type(entry) == SOFTLEAF_SWAP; } +/** + * softleaf_is_migration_device_private() - Is this leaf entry a migration + * device private entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private entry, otherwise false. + */ +static inline bool softleaf_is_migration_device_private(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ: + case SOFTLEAF_MIGRATION_DEVICE_PRIVATE_WRITE: + case SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ_EXCLUSIVE: + return true; + default: + return false; + } +} + +/** + * softleaf_is_migration_device_private_write() - Is this leaf entry a writable + * device private migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a writable device private migration entry, + * otherwise false. + */ +static inline bool softleaf_is_migration_device_private_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_DEVICE_PRIVATE_WRITE; +} + +/** + * softleaf_is_migration_device_private_read() - Is this leaf entry a readable + * device private migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an readable device private migration + * entry, otherwise false. + */ +static inline bool softleaf_is_migration_device_private_read(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ; +} + +/** + * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive + * readable device private migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an exclusive readable device private + * migration entry, otherwise false. + */ +static inline bool softleaf_is_migration_device_private_read_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_DEVICE_PRIVATE_READ_EXCLUSIVE; +} + /** * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? * @entry: Leaf entry. * - * Returns: true if the leaf entry is a writable migration entry, otherwise - * false. 
+ * Returns: true if the leaf entry is a writable migration entry or a writable + * device private migration entry, otherwise false. */ static inline bool softleaf_is_migration_write(softleaf_t entry) { - return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; + return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE || + softleaf_is_migration_device_private_write(entry); } /** * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? * @entry: Leaf entry. * - * Returns: true if the leaf entry is a readable migration entry, otherwise - * false. + * Returns: true if the leaf entry is a readable migration entry or a readable + * device private migration entry, otherwise false. */ static inline bool softleaf_is_migration_read(softleaf_t entry) { - return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ || + softleaf_is_migration_device_private_read(entry); } /** @@ -219,12 +288,13 @@ static inline bool softleaf_is_migration_read(softleaf_t entry) * readable migration entry? * @entry: Leaf entry. * - * Returns: true if the leaf entry is an exclusive readable migration entry, - * otherwise false. + * Returns: true if the leaf entry is an exclusive readable migration entry or + * exclusive readable device private migration entry, otherwise false. */ static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) { - return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE || + softleaf_is_migration_device_private_read_exclusive(entry); } /** @@ -241,7 +311,7 @@ static inline bool softleaf_is_migration(softleaf_t entry) case SOFTLEAF_MIGRATION_WRITE: return true; default: - return false; + return softleaf_is_migration_device_private(entry); } } -- 2.34.1 Update the migration entry creation helpers to automatically create device private migration entries when invoked on device private pages. The corresponding softleaf predicates have already been updated to expect both migration and device private migration entries. 
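The effect on callers is that a single call site covers both kinds of page. A hypothetical sketch of the resulting calling convention:

static swp_entry_t example_migration_entry_for(struct page *page, bool writable)
{
        /*
         * Yields a SWP_MIGRATION_* entry for a normal page and a
         * SWP_MIGRATION_DEVICE_* entry for a device private page, without
         * the caller checking which kind of page it holds.
         */
        if (writable)
                return make_writable_migration_entry_from_page(page, 0);

        return make_readable_migration_entry_from_page(page, 0);
}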
Signed-off-by: Jordan Niethe --- v3: - Provided as an individual patch --- include/linux/swapops.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 220627cb7fff..8b39983792ea 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -206,6 +206,10 @@ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) static inline swp_entry_t make_readable_migration_entry_from_page(struct page *page, pgoff_t flags) { + if (is_device_private_page(page)) + return make_readable_migration_device_private_entry( + page_to_pfn(page) | flags); + return swp_entry(SWP_MIGRATION_READ, page_to_pfn(page) | flags); } @@ -217,6 +221,10 @@ static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset static inline swp_entry_t make_readable_exclusive_migration_entry_from_page(struct page *page, pgoff_t flags) { + if (is_device_private_page(page)) + return make_readable_exclusive_migration_device_private_entry( + page_to_pfn(page) | flags); + return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, page_to_pfn(page) | flags); } @@ -228,6 +236,10 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) static inline swp_entry_t make_writable_migration_entry_from_page(struct page *page, pgoff_t flags) { + if (is_device_private_page(page)) + return make_writable_migration_device_private_entry( + page_to_pfn(page) | flags); + return swp_entry(SWP_MIGRATION_WRITE, page_to_pfn(page) | flags); } -- 2.34.1 To create a new device private entry for a given struct page, that page is first converted to its pfn, before passing the pfn to make_writable_device_private_entry() (and friends). A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have a pfn and must be handled separately. Prepare for this with a new set of helpers: - make_readable_device_private_entry_from_page() - make_writable_device_private_entry_from_page() These helpers take a struct page as parameter instead of a pfn. This will allow more flexibility for handling the swap offset field differently for device private pages. 
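A minimal sketch of the intended calling convention, modelled on the migrate_vma_insert_page() change in this patch (the wrapper itself is hypothetical):

static pte_t example_device_private_pte(struct page *page, bool writable)
{
        swp_entry_t entry;

        if (writable)
                entry = make_writable_device_private_entry_from_page(page, 0);
        else
                entry = make_readable_device_private_entry_from_page(page, 0);

        /* How the swap offset is encoded is now hidden inside the helper. */
        return swp_entry_to_pte(entry);
}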
Signed-off-by: Jordan Niethe --- v1: - New to series v2: - Add flag param v3: - No change --- include/linux/swapops.h | 24 ++++++++++++++++++++++++ mm/huge_memory.c | 14 ++++++-------- mm/migrate.c | 6 ++---- mm/migrate_device.c | 12 ++++-------- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 8b39983792ea..080e27da6b47 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -138,11 +138,23 @@ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_READ, offset); } +static inline swp_entry_t make_readable_device_private_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(SWP_DEVICE_READ, page_to_pfn(page) | flags); +} + static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_WRITE, offset); } +static inline swp_entry_t make_writable_device_private_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(SWP_DEVICE_WRITE, page_to_pfn(page) | flags); +} + static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); @@ -169,11 +181,23 @@ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) return swp_entry(0, 0); } +static inline swp_entry_t make_readable_device_private_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(0, 0); +} + static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } +static inline swp_entry_t make_writable_device_private_entry_from_page(struct page *page, + pgoff_t flags) +{ + return swp_entry(0, 0); +} + static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e3a448cdb34d..03f1f13bb24c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3219,11 +3219,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * is false. 
*/ if (write) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page + i)); + swp_entry = make_writable_device_private_entry_from_page( + page + i, 0); else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page + i)); + swp_entry = make_readable_device_private_entry_from_page( + page + i, 0); /* * Young and dirty bits are not progated via swp_entry */ @@ -4950,11 +4950,9 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) swp_entry_t entry; if (pmd_write(pmde)) - entry = make_writable_device_private_entry( - page_to_pfn(new)); + entry = make_writable_device_private_entry_from_page(new, 0); else - entry = make_readable_device_private_entry( - page_to_pfn(new)); + entry = make_readable_device_private_entry_from_page(new, 0); pmde = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pvmw->pmd)) diff --git a/mm/migrate.c b/mm/migrate.c index 4688b9e38cd2..24e3ebbab1e1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -399,11 +399,9 @@ static bool remove_migration_pte(struct folio *folio, if (unlikely(is_device_private_page(new))) { if (pte_write(pte)) - entry = make_writable_device_private_entry( - page_to_pfn(new)); + entry = make_writable_device_private_entry_from_page(new, 0); else - entry = make_readable_device_private_entry( - page_to_pfn(new)); + entry = make_readable_device_private_entry_from_page(new, 0); pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index c876526ac6a3..0ca6f78df0e2 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -836,11 +836,9 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, swp_entry_t swp_entry; if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); + swp_entry = make_writable_device_private_entry_from_page(page, 0); else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); + swp_entry = make_readable_device_private_entry_from_page(page, 0); entry = swp_entry_to_pmd(swp_entry); } else { if (folio_is_zone_device(folio) && @@ -1033,11 +1031,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry_t swp_entry; if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); + swp_entry = make_writable_device_private_entry_from_page(page, 0); else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); + swp_entry = make_readable_device_private_entry_from_page(page, 0); entry = swp_entry_to_pte(swp_entry); } else { if (folio_is_zone_device(folio) && -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have normal pfns and must be handled separately. Add a new flag PAGE_SNAPSHOT_DEVICE_PRIVATE to track when the pfn of a page snapshot is a device private page. 
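Code that interprets page_snapshot::pfn as a physical pfn is expected to test the flag first. A hypothetical sketch mirroring the fs/proc/page.c change in this patch:

static bool example_snapshot_is_zero_page(const struct page_snapshot *ps)
{
        /* ps->pfn is only a real pfn when the snapshot is not device private. */
        if (ps->flags & PAGE_SNAPSHOT_DEVICE_PRIVATE)
                return false;

        return is_zero_pfn(ps->pfn);
}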
Signed-off-by: Jordan Niethe Signed-off-by: Alistair Popple --- v1: - No change v2: - No change v3: - No change v4: - Move logical continuation to previous line --- fs/proc/page.c | 6 ++++-- include/linux/mm.h | 7 ++++--- mm/util.c | 3 +++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index f9b2c2c906cd..bc14f7ebc369 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -191,10 +191,12 @@ u64 stable_page_flags(const struct page *page) folio_test_large_rmappable(folio)) { /* Note: we indicate any THPs here, not just PMD-sized ones */ u |= 1 << KPF_THP; - } else if (is_huge_zero_pfn(ps.pfn)) { + } else if (!(ps.flags & PAGE_SNAPSHOT_DEVICE_PRIVATE) && + is_huge_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; - } else if (is_zero_pfn(ps.pfn)) { + } else if (!(ps.flags & PAGE_SNAPSHOT_DEVICE_PRIVATE) && + is_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index f0d5be9dc736..a52979536a5e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4627,9 +4627,10 @@ static inline bool page_pool_page_is_pp(const struct page *page) } #endif -#define PAGE_SNAPSHOT_FAITHFUL (1 << 0) -#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) -#define PAGE_SNAPSHOT_PG_IDLE (1 << 2) +#define PAGE_SNAPSHOT_FAITHFUL (1 << 0) +#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) +#define PAGE_SNAPSHOT_PG_IDLE (1 << 2) +#define PAGE_SNAPSHOT_DEVICE_PRIVATE (1 << 3) struct page_snapshot { struct folio folio_snapshot; diff --git a/mm/util.c b/mm/util.c index 97cae40c0209..65e3f1a97d76 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1218,6 +1218,9 @@ static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, if (folio_test_idle(folio)) ps->flags |= PAGE_SNAPSHOT_PG_IDLE; + + if (is_device_private_page(page)) + ps->flags |= PAGE_SNAPSHOT_DEVICE_PRIVATE; } /** -- 2.34.1 A future change will remove device private pages from the physical address space. This will mean that device private pages no longer have normal pfns and must be handled separately. Prepare for this by adding a HMM_PFN_DEVICE_PRIVATE flag to indicate that a hmm_pfn contains a PFN for a device private page. Signed-off-by: Jordan Niethe Signed-off-by: Alistair Popple --- v1: - Update HMM_PFN_ORDER_SHIFT - Handle hmm_vma_handle_absent_pmd() v2: - No change v3: - No change --- include/linux/hmm.h | 4 +++- mm/hmm.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index db75ffc949a7..d8756c341620 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,7 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. ie poisoned memory, special pages, no vma, etc + * HMM_PFN_DEVICE_PRIVATE - the pfn field contains a DEVICE_PRIVATE pfn. 
* HMM_PFN_P2PDMA - P2P page * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer * HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation @@ -40,6 +41,7 @@ enum hmm_pfn_flags { HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), + HMM_PFN_DEVICE_PRIVATE = 1UL << (BITS_PER_LONG - 7), /* * Sticky flags, carried from input to output, * don't forget to update HMM_PFN_INOUT_FLAGS @@ -48,7 +50,7 @@ enum hmm_pfn_flags { HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5), HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6), - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11), + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 12), /* Input flags */ HMM_PFN_REQ_FAULT = HMM_PFN_VALID, diff --git a/mm/hmm.c b/mm/hmm.c index 4ec74c18bef6..14895fa6575f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -267,7 +267,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (softleaf_is_device_private(entry) && page_pgmap(softleaf_to_page(entry))->owner == range->dev_private_owner) { - cpu_flags = HMM_PFN_VALID; + cpu_flags = HMM_PFN_VALID | HMM_PFN_DEVICE_PRIVATE; if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; new_pfn_flags = softleaf_to_pfn(entry) | cpu_flags; @@ -347,7 +347,8 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, softleaf_to_folio(entry)->pgmap->owner == range->dev_private_owner) { unsigned long cpu_flags = HMM_PFN_VALID | - hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); + hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT) | + HMM_PFN_DEVICE_PRIVATE; unsigned long pfn = softleaf_to_pfn(entry); unsigned long i; -- 2.34.1
The existing design of device private memory imposes limitations that render it non-functional for certain systems and configurations where the physical address space is limited. Device private memory is implemented by first reserving a region of the physical address space. This is a problem. The physical address space is not a resource that is directly under the kernel's control. Whether suitable physical address space is available depends on the underlying hardware and firmware, and it may not always be available. Device private memory assumes that it will be able to reserve a device-memory-sized chunk of physical address space. However, there is nothing guaranteeing that this will succeed, and there are a number of factors that increase the likelihood of failure. We need to consider what else may exist in the physical address space. It is observed that certain VM configurations place very large PCI windows immediately after RAM, large enough that there is no physical address space available at all for device private memory. This is more likely to occur on 43-bit physical width systems, which have less physical address space. Instead of using the physical address space, introduce a device private address space and allocate device regions from there to represent the device private pages. Introduce a new interface memremap_device_private_pagemap() that allocates a requested amount of device private address space and creates the necessary device private pages. To support this new interface, struct dev_pagemap needs some changes: - Add a new dev_pagemap::nr_pages field as an input parameter. - Add a new dev_pagemap::pages array to store the device private pages. A driver-side usage sketch of the new interface is shown below.
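The sketch below is not part of the patch; it is a minimal driver-side usage example of the interface just described, modelled on the Documentation/mm/hmm.rst update further down. The mydrv_* names are placeholders, and the empty ops table stands in for a real driver's dev_pagemap_ops (which must at least provide migrate_to_ram()).

#include <linux/memremap.h>
#include <linux/printk.h>
#include <linux/topology.h>

/* Placeholder; a real driver fills in migrate_to_ram() and friends. */
static const struct dev_pagemap_ops mydrv_pagemap_ops;

static struct dev_pagemap mydrv_pagemap;

static int mydrv_init_device_private(unsigned long nr_pages)
{
	int ret;

	mydrv_pagemap.type = MEMORY_DEVICE_PRIVATE;
	mydrv_pagemap.nr_pages = nr_pages;	/* input: pages to create */
	mydrv_pagemap.ops = &mydrv_pagemap_ops;
	mydrv_pagemap.owner = &mydrv_pagemap;

	/*
	 * No request_free_mem_region() any more: the reserved device
	 * private range is allocated internally and returned in
	 * mydrv_pagemap.range.
	 */
	ret = memremap_device_private_pagemap(&mydrv_pagemap, numa_node_id());
	if (ret)
		return ret;

	pr_info("device private range [%#llx-%#llx]\n",
		mydrv_pagemap.range.start, mydrv_pagemap.range.end);
	return 0;
}

static void mydrv_fini_device_private(void)
{
	memunmap_device_private_pagemap(&mydrv_pagemap);
}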
When using memremap_device_private_pagemap(), rather than passing in dev_pagemap::nr_range ranges of physical address space in dev_pagemap::ranges[] to be remapped, dev_pagemap::nr_range will always be 1, and the device private range that is reserved is returned in dev_pagemap::range. Forbid calling memremap_pages() with dev_pagemap::type = MEMORY_DEVICE_PRIVATE. Represent this device private address space using a new device_private_pgmap_tree maple tree. This tree maps a given device private address to a struct dev_pagemap, where a specific device private page may then be looked up in that dev_pagemap::pages array. Device private address space can be reclaimed and the associated device private pages freed using the corresponding new memunmap_device_private_pagemap() interface. Because the device private pages now live outside the physical address space, they no longer have a normal PFN. This means that page_to_pfn(), et al. are no longer meaningful. Introduce helpers: - device_private_page_to_offset() - device_private_folio_to_offset() to take a given device private page / folio and return its offset within the device private address space. Update the places where we previously converted a device private page to a PFN to use these new helpers. When we encounter a device private offset, look up its page within the pagemap with device_private_offset_to_page() instead. Update the existing users: - lib/test_hmm.c - ppc ultravisor - drm/amd/amdkfd - gpu/drm/xe - gpu/drm/nouveau to use the new memremap_device_private_pagemap() interface. Acked-by: Felix Kuehling Reviewed-by: Zi Yan # for MM changes Signed-off-by: Jordan Niethe Signed-off-by: Alistair Popple --- v1: - Include NUMA node parameter for memremap_device_private_pagemap() - Add devm_memremap_device_private_pagemap() and friends - Update existing users of memremap_pages(): - ppc ultravisor - drm/amd/amdkfd - gpu/drm/xe - gpu/drm/nouveau - Update for HMM huge page support - Guard device_private_offset_to_page and friends with CONFIG_ZONE_DEVICE v2: - Make sure last member of struct dev_pagemap remains DECLARE_FLEX_ARRAY(struct range, ranges); v3: - Use numa_mem_id() if memremap_device_private_pagemap is called with NUMA_NO_NODE. This fixes a null pointer deref in lruvec_stat_mod_folio().
- drm/xe: Remove call to devm_release_mem_region() in xe_pagemap_destroy_work() - s/VM_BUG/VM_WARN/ v4: - Use devm_memunmap_device_private_pagemap() in xe_pagemap_destroy_work() - Replace ^ with != for PVMW_DEVICE_PRIVATE comparisions - Minor style changes - remove discussion of aarch64 from commit message - not relevant post eeb8fdfcf090 ("arm64: Expose the end of the linear map in PHYSMEM_END") --- Documentation/mm/hmm.rst | 11 +- arch/powerpc/kvm/book3s_hv_uvmem.c | 41 ++--- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 23 +-- drivers/gpu/drm/nouveau/nouveau_dmem.c | 35 ++-- drivers/gpu/drm/xe/xe_svm.c | 30 +--- include/linux/hmm.h | 3 + include/linux/leafops.h | 16 +- include/linux/memremap.h | 64 +++++++- include/linux/migrate.h | 6 +- include/linux/mm.h | 2 + include/linux/rmap.h | 5 + include/linux/swapops.h | 10 +- lib/test_hmm.c | 70 ++++---- mm/debug.c | 9 +- mm/memremap.c | 196 ++++++++++++++++++----- mm/mm_init.c | 8 +- mm/page_vma_mapped.c | 22 ++- mm/rmap.c | 43 +++-- mm/util.c | 5 +- 19 files changed, 397 insertions(+), 202 deletions(-) diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst index 7d61b7a8b65b..27067a6a2408 100644 --- a/Documentation/mm/hmm.rst +++ b/Documentation/mm/hmm.rst @@ -276,17 +276,12 @@ These can be allocated and freed with:: struct resource *res; struct dev_pagemap pagemap; - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; + pagemap.nr_pages = /* number of pages */; pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); + memremap_device_private_pagemap(&pagemap, numa_node_id()); - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); + memunmap_device_private_pagemap(&pagemap); There are also devm_request_free_mem_region(), devm_memremap_pages(), devm_memunmap_pages(), and devm_release_mem_region() when the resources can diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 67910900af7b..948747db8231 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -636,7 +636,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, mutex_lock(&kvm->arch.uvmem_lock); if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) { - uvmem_page = pfn_to_page(uvmem_pfn); + uvmem_page = device_private_offset_to_page(uvmem_pfn); pvt = uvmem_page->zone_device_data; pvt->skip_page_out = skip_page_out; pvt->remove_gfn = true; @@ -721,7 +721,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) pvt->gpa = gpa; pvt->kvm = kvm; - dpage = pfn_to_page(uvmem_pfn); + dpage = device_private_offset_to_page(uvmem_pfn); dpage->zone_device_data = pvt; zone_device_page_init(dpage, 0); return dpage; @@ -888,7 +888,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa, srcu_idx = srcu_read_lock(&kvm->srcu); mutex_lock(&kvm->arch.uvmem_lock); if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) { - uvmem_page = pfn_to_page(uvmem_pfn); + uvmem_page = device_private_offset_to_page(uvmem_pfn); pvt = uvmem_page->zone_device_data; pvt->skip_page_out = true; /* @@ -906,7 +906,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa, mutex_lock(&kvm->arch.uvmem_lock); if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) { - uvmem_page = pfn_to_page(uvmem_pfn); + uvmem_page = 
device_private_offset_to_page(uvmem_pfn); pvt = uvmem_page->zone_device_data; pvt->skip_page_out = true; pvt->remove_gfn = false; /* it continues to be a valid GFN */ @@ -1017,7 +1017,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) static void kvmppc_uvmem_folio_free(struct folio *folio) { struct page *page = &folio->page; - unsigned long pfn = page_to_pfn(page) - + unsigned long pfn = device_private_page_to_offset(page) - (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT); struct kvmppc_uvmem_page_pvt *pvt; @@ -1159,8 +1159,6 @@ int kvmppc_uvmem_init(void) { int ret = 0; unsigned long size; - struct resource *res; - void *addr; unsigned long pfn_last, pfn_first; size = kvmppc_get_secmem_size(); @@ -1174,27 +1172,18 @@ int kvmppc_uvmem_init(void) goto out; } - res = request_free_mem_region(&iomem_resource, size, "kvmppc_uvmem"); - if (IS_ERR(res)) { - ret = PTR_ERR(res); - goto out; - } - kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; - kvmppc_uvmem_pgmap.range.start = res->start; - kvmppc_uvmem_pgmap.range.end = res->end; kvmppc_uvmem_pgmap.nr_range = 1; + kvmppc_uvmem_pgmap.nr_pages = size / PAGE_SIZE; kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; /* just one global instance: */ kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap; - addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE); - if (IS_ERR(addr)) { - ret = PTR_ERR(addr); - goto out_free_region; - } + ret = memremap_device_private_pagemap(&kvmppc_uvmem_pgmap, NUMA_NO_NODE); + if (ret) + goto out; - pfn_first = res->start >> PAGE_SHIFT; - pfn_last = pfn_first + (resource_size(res) >> PAGE_SHIFT); + pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT; + pfn_last = pfn_first + (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT); kvmppc_uvmem_bitmap = bitmap_zalloc(pfn_last - pfn_first, GFP_KERNEL); if (!kvmppc_uvmem_bitmap) { ret = -ENOMEM; @@ -1204,9 +1193,7 @@ int kvmppc_uvmem_init(void) pr_info("KVMPPC-UVMEM: Secure Memory size 0x%lx\n", size); return ret; out_unmap: - memunmap_pages(&kvmppc_uvmem_pgmap); -out_free_region: - release_mem_region(res->start, size); + memunmap_device_private_pagemap(&kvmppc_uvmem_pgmap); out: return ret; } @@ -1216,8 +1203,6 @@ void kvmppc_uvmem_free(void) if (!kvmppc_uvmem_bitmap) return; - memunmap_pages(&kvmppc_uvmem_pgmap); - release_mem_region(kvmppc_uvmem_pgmap.range.start, - range_len(&kvmppc_uvmem_pgmap.range)); + memunmap_device_private_pagemap(&kvmppc_uvmem_pgmap); bitmap_free(kvmppc_uvmem_bitmap); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 5478e41877e5..7e1450e56531 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -1030,9 +1030,9 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) { struct amdgpu_kfd_dev *kfddev = &adev->kfd; struct dev_pagemap *pgmap; - struct resource *res = NULL; unsigned long size; void *r; + int ret; /* Page migration works on gfx9 or newer */ if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1)) @@ -1053,11 +1053,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1; pgmap->type = MEMORY_DEVICE_COHERENT; } else { - res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); - if (IS_ERR(res)) - return PTR_ERR(res); - pgmap->range.start = res->start; - pgmap->range.end = res->end; + pgmap->nr_pages = size / PAGE_SIZE; pgmap->type = MEMORY_DEVICE_PRIVATE; } @@ -1068,14 +1064,19 @@ int kgd2kfd_init_zone_device(struct amdgpu_device 
*adev) /* Device manager releases device-specific resources, memory region and * pgmap when driver disconnects from device. */ - r = devm_memremap_pages(adev->dev, pgmap); - if (IS_ERR(r)) { + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + ret = devm_memremap_device_private_pagemap(adev->dev, pgmap); + } else { + r = devm_memremap_pages(adev->dev, pgmap); + if (IS_ERR(r)) + ret = PTR_ERR(r); + } + + if (ret) { pr_err("failed to register HMM device memory\n"); - if (pgmap->type == MEMORY_DEVICE_PRIVATE) - devm_release_mem_region(adev->dev, res->start, resource_size(res)); /* Disable SVM support capability */ pgmap->type = 0; - return PTR_ERR(r); + return ret; } pr_debug("reserve %ldMB system memory for VRAM pages struct\n", diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index adfa3df5cbc5..37fe1cfba414 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -109,7 +109,7 @@ static struct nouveau_drm *page_to_drm(struct page *page) unsigned long nouveau_dmem_page_addr(struct page *page) { struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); - unsigned long off = (page_to_pfn(page) << PAGE_SHIFT) - + unsigned long off = (device_private_page_to_offset(page) << PAGE_SHIFT) - chunk->pagemap.range.start; return chunk->bo->offset + off; @@ -297,9 +297,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage, bool is_large) { struct nouveau_dmem_chunk *chunk; - struct resource *res; struct page *page; - void *ptr; unsigned long i, pfn_first, pfn; int ret; @@ -309,39 +307,28 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage, goto out; } - /* Allocate unused physical address space for device private pages. */ - res = request_free_mem_region(&iomem_resource, DMEM_CHUNK_SIZE * NR_CHUNKS, - "nouveau_dmem"); - if (IS_ERR(res)) { - ret = PTR_ERR(res); - goto out_free; - } - chunk->drm = drm; chunk->pagemap.type = MEMORY_DEVICE_PRIVATE; - chunk->pagemap.range.start = res->start; - chunk->pagemap.range.end = res->end; chunk->pagemap.nr_range = 1; + chunk->pagemap.nr_pages = DMEM_CHUNK_SIZE * NR_CHUNKS / PAGE_SIZE; chunk->pagemap.ops = &nouveau_dmem_pagemap_ops; chunk->pagemap.owner = drm->dev; ret = nouveau_bo_new_pin(&drm->client, NOUVEAU_GEM_DOMAIN_VRAM, DMEM_CHUNK_SIZE, &chunk->bo); if (ret) - goto out_release; + goto out_free; - ptr = memremap_pages(&chunk->pagemap, numa_node_id()); - if (IS_ERR(ptr)) { - ret = PTR_ERR(ptr); + ret = memremap_device_private_pagemap(&chunk->pagemap, numa_node_id()); + if (ret) goto out_bo_free; - } mutex_lock(&drm->dmem->mutex); list_add(&chunk->list, &drm->dmem->chunks); mutex_unlock(&drm->dmem->mutex); pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT; - page = pfn_to_page(pfn_first); + page = device_private_offset_to_page(pfn_first); spin_lock(&drm->dmem->lock); pfn = pfn_first; @@ -350,12 +337,12 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage, if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) || !is_large) { for (j = 0; j < DMEM_CHUNK_NPAGES - 1; j++, pfn++) { - page = pfn_to_page(pfn); + page = device_private_offset_to_page(pfn); page->zone_device_data = drm->dmem->free_pages; drm->dmem->free_pages = page; } } else { - page = pfn_to_page(pfn); + page = device_private_offset_to_page(pfn); page->zone_device_data = drm->dmem->free_folios; drm->dmem->free_folios = page_folio(page); pfn += DMEM_CHUNK_NPAGES; @@ -382,8 +369,6 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage, out_bo_free: 
nouveau_bo_unpin_del(&chunk->bo); -out_release: - release_mem_region(chunk->pagemap.range.start, range_len(&chunk->pagemap.range)); out_free: kfree(chunk); out: @@ -543,9 +528,7 @@ nouveau_dmem_fini(struct nouveau_drm *drm) nouveau_bo_unpin_del(&chunk->bo); WARN_ON(chunk->callocated); list_del(&chunk->list); - memunmap_pages(&chunk->pagemap); - release_mem_region(chunk->pagemap.range.start, - range_len(&chunk->pagemap.range)); + memunmap_device_private_pagemap(&chunk->pagemap); kfree(chunk); } diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index a8aad9e0b1fb..aadc73b6f951 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -452,7 +452,7 @@ static u64 xe_page_to_dpa(struct page *page) struct xe_pagemap *xpagemap = xe_page_to_pagemap(page); struct xe_vram_region *vr = xe_pagemap_to_vr(xpagemap); u64 hpa_base = xpagemap->hpa_base; - u64 pfn = page_to_pfn(page); + u64 pfn = device_private_page_to_offset(page); u64 offset; u64 dpa; @@ -1699,9 +1699,7 @@ static void xe_pagemap_destroy_work(struct work_struct *work) * will do shortly. */ if (drm_dev_enter(drm, &idx)) { - devm_memunmap_pages(drm->dev, pagemap); - devm_release_mem_region(drm->dev, pagemap->range.start, - pagemap->range.end - pagemap->range.start + 1); + devm_memunmap_device_private_pagemap(drm->dev, pagemap); drm_dev_exit(idx); } @@ -1745,8 +1743,6 @@ static struct xe_pagemap *xe_pagemap_create(struct xe_device *xe, struct xe_vram struct xe_pagemap *xpagemap; struct dev_pagemap *pagemap; struct drm_pagemap *dpagemap; - struct resource *res; - void *addr; int err; xpagemap = kzalloc(sizeof(*xpagemap), GFP_KERNEL); @@ -1763,36 +1759,24 @@ static struct xe_pagemap *xe_pagemap_create(struct xe_device *xe, struct xe_vram if (err) goto out_no_dpagemap; - res = devm_request_free_mem_region(dev, &iomem_resource, - vr->usable_size); - if (IS_ERR(res)) { - err = PTR_ERR(res); - goto out_err; - } - err = drm_pagemap_acquire_owner(&xpagemap->peer, &xe_owner_list, xe_has_interconnect); if (err) - goto out_no_owner; + goto out_err; pagemap->type = MEMORY_DEVICE_PRIVATE; - pagemap->range.start = res->start; - pagemap->range.end = res->end; pagemap->nr_range = 1; + pagemap->nr_pages = vr->usable_size / PAGE_SIZE; pagemap->owner = xpagemap->peer.owner; pagemap->ops = drm_pagemap_pagemap_ops_get(); - addr = devm_memremap_pages(dev, pagemap); - if (IS_ERR(addr)) { - err = PTR_ERR(addr); + err = devm_memremap_device_private_pagemap(dev, pagemap); + if (err) goto out_no_pages; - } - xpagemap->hpa_base = res->start; + xpagemap->hpa_base = pagemap->range.start; return xpagemap; out_no_pages: drm_pagemap_release_owner(&xpagemap->peer); -out_no_owner: - devm_release_mem_region(dev, res->start, res->end - res->start + 1); out_err: drm_pagemap_put(dpagemap); return ERR_PTR(err); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index d8756c341620..25bb4df298f7 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -68,6 +68,9 @@ enum hmm_pfn_flags { */ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) { + if (hmm_pfn & HMM_PFN_DEVICE_PRIVATE) + return device_private_offset_to_page(hmm_pfn & ~HMM_PFN_FLAGS); + return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS); } diff --git a/include/linux/leafops.h b/include/linux/leafops.h index 60681ada7b8e..612c8e91b775 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -455,7 +455,13 @@ static inline unsigned long softleaf_to_flags(softleaf_t entry) */ static inline struct page *softleaf_to_page(softleaf_t entry) { - struct page 
*page = pfn_to_page(softleaf_to_pfn(entry)); + struct page *page; + + if (softleaf_is_migration_device_private(entry) || + softleaf_is_device_private(entry)) + page = device_private_entry_to_page(entry); + else + page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); /* @@ -475,7 +481,13 @@ static inline struct page *softleaf_to_page(softleaf_t entry) */ static inline struct folio *softleaf_to_folio(softleaf_t entry) { - struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); + struct folio *folio; + + if (softleaf_is_migration_device_private(entry) || + softleaf_is_device_private(entry)) + folio = page_folio(device_private_entry_to_page(entry)); + else + folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); /* diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 713ec0435b48..d702eb52dab9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -37,6 +37,7 @@ struct vmem_altmap { * backing the device memory. Doing so simplifies the implementation, but it is * important to remember that there are certain points at which the struct page * must be treated as an opaque object, rather than a "normal" struct page. + * Unlike "normal" struct pages, the page_to_pfn() is invalid. * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. @@ -126,8 +127,12 @@ struct dev_pagemap_ops { * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. - * @nr_range: number of ranges to be mapped - * @range: range to be mapped when nr_range == 1 + * @nr_pages: number of pages requested to be mapped for MEMORY_DEVICE_PRIVATE. + * @pages: array of nr_pages initialized for MEMORY_DEVICE_PRIVATE. + * @nr_range: number of ranges to be mapped. Always == 1 for + * MEMORY_DEVICE_PRIVATE. + * @range: range to be mapped when nr_range == 1. Used as an output param for + * MEMORY_DEVICE_PRIVATE. 
* @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { @@ -139,6 +144,8 @@ struct dev_pagemap { unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; + unsigned long nr_pages; + struct page *pages; int nr_range; union { struct range range; @@ -224,7 +231,14 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE +void __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap); void zone_device_page_init(struct page *page, unsigned int order); +unsigned long memremap_device_private_pagemap(struct dev_pagemap *pgmap, int nid); +void memunmap_device_private_pagemap(struct dev_pagemap *pgmap); +int devm_memremap_device_private_pagemap(struct device *dev, struct dev_pagemap *pgmap); +void devm_memunmap_device_private_pagemap(struct device *dev, struct dev_pagemap *pgmap); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -234,6 +248,15 @@ bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); +struct page *device_private_offset_to_page(unsigned long offset); +struct page *device_private_entry_to_page(softleaf_t entry); +pgoff_t device_private_page_to_offset(const struct page *page); + +static inline pgoff_t device_private_folio_to_offset(const struct folio *folio) +{ + return device_private_page_to_offset((const struct page *)&folio->page); +} + static inline void zone_device_folio_init(struct folio *folio, unsigned int order) { zone_device_page_init(&folio->page, order); @@ -276,6 +299,23 @@ static inline void devm_memunmap_pages(struct device *dev, { } +static inline int devm_memremap_device_private_pagemap(struct device *dev, + struct dev_pagemap *pgmap) +{ + /* + * Fail attempts to call devm_memremap_device_private_pagemap() without + * ZONE_DEVICE support enabled, this requires callers to fall + * back to plain devm_memremap() based on config + */ + WARN_ON_ONCE(1); + return -ENXIO; +} + +static inline void devm_memunmap_device_private_pagemap(struct device *dev, + struct dev_pagemap *pgmap) +{ +} + static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { return NULL; @@ -296,6 +336,26 @@ static inline void zone_device_private_split_cb(struct folio *original_folio, struct folio *new_folio) { } + +static inline struct page *device_private_offset_to_page(unsigned long offset) +{ + return NULL; +} + +static inline struct page *device_private_entry_to_page(softleaf_t entry) +{ + return NULL; +} + +static inline pgoff_t device_private_page_to_offset(const struct page *page) +{ + return 0; +} + +static inline pgoff_t device_private_folio_to_offset(const struct folio *folio) +{ + return 0; +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 5fd2ee080bc0..2921b3abddf3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -133,6 +133,10 @@ static inline struct page *migrate_pfn_to_page(unsigned long mpfn) { if (!(mpfn & MIGRATE_PFN_VALID)) return NULL; + + if (mpfn & MIGRATE_PFN_DEVICE_PRIVATE) + return device_private_offset_to_page(mpfn >> MIGRATE_PFN_SHIFT); + return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT); } @@ -144,7 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) static inline unsigned long 
migrate_pfn_from_page(struct page *page) { if (is_device_private_page(page)) - return migrate_pfn(page_to_pfn(page)) | + return migrate_pfn(device_private_page_to_offset(page)) | MIGRATE_PFN_DEVICE_PRIVATE; return migrate_pfn(page_to_pfn(page)); } diff --git a/include/linux/mm.h b/include/linux/mm.h index a52979536a5e..27089ca4b0c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2042,6 +2042,8 @@ static inline unsigned long memdesc_section(memdesc_flags_t mdf) */ static inline unsigned long folio_pfn(const struct folio *folio) { + VM_WARN_ON(folio_is_device_private(folio)); + return page_to_pfn(&folio->page); } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1b03297f13dc..fffa5fadcbbf 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -951,11 +951,16 @@ static inline unsigned long page_vma_walk_flags(const struct folio *folio, static inline unsigned long folio_page_vma_walk_pfn(const struct folio *folio) { + if (folio_is_device_private(folio)) + return device_private_folio_to_offset(folio); + return folio_pfn(folio); } static inline struct folio *page_vma_walk_pfn_to_folio(struct page_vma_mapped_walk *pvmw) { + if (pvmw->flags & PVMW_DEVICE_PRIVATE) + return page_folio(device_private_offset_to_page(pvmw->pfn)); return pfn_folio(pvmw->pfn); } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 080e27da6b47..2f83e647d6c5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -141,7 +141,7 @@ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) static inline swp_entry_t make_readable_device_private_entry_from_page(struct page *page, pgoff_t flags) { - return swp_entry(SWP_DEVICE_READ, page_to_pfn(page) | flags); + return swp_entry(SWP_DEVICE_READ, device_private_page_to_offset(page) | flags); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) @@ -152,7 +152,7 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) static inline swp_entry_t make_writable_device_private_entry_from_page(struct page *page, pgoff_t flags) { - return swp_entry(SWP_DEVICE_WRITE, page_to_pfn(page) | flags); + return swp_entry(SWP_DEVICE_WRITE, device_private_page_to_offset(page) | flags); } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) @@ -232,7 +232,7 @@ static inline swp_entry_t make_readable_migration_entry_from_page(struct page *p { if (is_device_private_page(page)) return make_readable_migration_device_private_entry( - page_to_pfn(page) | flags); + device_private_page_to_offset(page) | flags); return swp_entry(SWP_MIGRATION_READ, page_to_pfn(page) | flags); } @@ -247,7 +247,7 @@ static inline swp_entry_t make_readable_exclusive_migration_entry_from_page(stru { if (is_device_private_page(page)) return make_readable_exclusive_migration_device_private_entry( - page_to_pfn(page) | flags); + device_private_page_to_offset(page) | flags); return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, page_to_pfn(page) | flags); } @@ -262,7 +262,7 @@ static inline swp_entry_t make_writable_migration_entry_from_page(struct page *p { if (is_device_private_page(page)) return make_writable_migration_device_private_entry( - page_to_pfn(page) | flags); + device_private_page_to_offset(page) | flags); return swp_entry(SWP_MIGRATION_WRITE, page_to_pfn(page) | flags); } diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 872d3846af7b..e1eddb4aaeb1 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -497,7 +497,7 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice, struct page **ppage, bool is_large) { struct dmirror_chunk *devmem; - struct resource *res = NULL; + bool device_private = false; unsigned long pfn; unsigned long pfn_first; unsigned long pfn_last; @@ -510,13 +510,9 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, switch (mdevice->zone_device_type) { case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE: - res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, - "hmm_dmirror"); - if (IS_ERR_OR_NULL(res)) - goto err_devmem; - devmem->pagemap.range.start = res->start; - devmem->pagemap.range.end = res->end; + device_private = true; devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.nr_pages = DEVMEM_CHUNK_SIZE / PAGE_SIZE; break; case HMM_DMIRROR_MEMORY_DEVICE_COHERENT: devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ? @@ -525,13 +521,13 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, devmem->pagemap.range.end = devmem->pagemap.range.start + DEVMEM_CHUNK_SIZE - 1; devmem->pagemap.type = MEMORY_DEVICE_COHERENT; + devmem->pagemap.nr_range = 1; break; default: ret = -EINVAL; goto err_devmem; } - devmem->pagemap.nr_range = 1; devmem->pagemap.ops = &dmirror_devmem_ops; devmem->pagemap.owner = mdevice; @@ -551,13 +547,20 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, mdevice->devmem_capacity = new_capacity; mdevice->devmem_chunks = new_chunks; } - ptr = memremap_pages(&devmem->pagemap, numa_node_id()); - if (IS_ERR_OR_NULL(ptr)) { - if (ptr) - ret = PTR_ERR(ptr); - else - ret = -EFAULT; - goto err_release; + + if (device_private) { + ret = memremap_device_private_pagemap(&devmem->pagemap, numa_node_id()); + if (ret) + goto err_release; + } else { + ptr = memremap_pages(&devmem->pagemap, numa_node_id()); + if (IS_ERR_OR_NULL(ptr)) { + if (ptr) + ret = PTR_ERR(ptr); + else + ret = -EFAULT; + goto err_release; + } } devmem->mdevice = mdevice; @@ -567,15 +570,21 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, mutex_unlock(&mdevice->devmem_lock); - pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n", + pr_info("added new %u MB chunk (total %u chunks, %u MB) %sPFNs [0x%lx 0x%lx)\n", DEVMEM_CHUNK_SIZE / (1024 * 1024), mdevice->devmem_count, mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)), + device_private ? 
"device " : "", pfn_first, pfn_last); spin_lock(&mdevice->lock); for (pfn = pfn_first; pfn < pfn_last; ) { - struct page *page = pfn_to_page(pfn); + struct page *page; + + if (device_private) + page = device_private_offset_to_page(pfn); + else + page = pfn_to_page(pfn); if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR) && (pfn + HPAGE_PMD_NR <= pfn_last)) { @@ -616,9 +625,6 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, err_release: mutex_unlock(&mdevice->devmem_lock); - if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) - release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); err_devmem: kfree(devmem); @@ -696,8 +702,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, */ spage = migrate_pfn_to_page(*src); if (WARN(spage && is_zone_device_page(spage), - "page already in device spage pfn: 0x%lx\n", - page_to_pfn(spage))) + "page already in device spage mpfn: 0x%lx\n", + migrate_pfn_from_page(spage))) goto next; if (dmirror->flags & HMM_DMIRROR_FLAG_FAIL_ALLOC) { @@ -752,8 +758,9 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, */ rpage->zone_device_data = dmirror; - pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", - page_to_pfn(spage), page_to_pfn(dpage)); + pr_debug("migrating from sys to mpfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), + migrate_pfn_from_page(dpage)); *dst = migrate_pfn_from_page(dpage) | write; @@ -1462,10 +1469,10 @@ static void dmirror_device_remove_chunks(struct dmirror_device *mdevice) spin_unlock(&mdevice->lock); dmirror_device_evict_chunk(devmem); - memunmap_pages(&devmem->pagemap); if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) - release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); + memunmap_device_private_pagemap(&devmem->pagemap); + else + memunmap_pages(&devmem->pagemap); kfree(devmem); } mdevice->devmem_count = 0; @@ -1710,7 +1717,13 @@ static void dmirror_devmem_folio_split(struct folio *head, struct folio *tail) return; } - offset = folio_pfn(tail) - folio_pfn(head); + tail->pgmap = head->pgmap; + + if (folio_is_device_private(head)) + offset = device_private_folio_to_offset(tail) - + device_private_folio_to_offset(head); + else + offset = folio_pfn(tail) - folio_pfn(head); rpage_tail = folio_page(rfolio, offset); tail->page.zone_device_data = rpage_tail; @@ -1719,7 +1732,6 @@ static void dmirror_devmem_folio_split(struct folio *head, struct folio *tail) rpage_tail->mapping = NULL; folio_page(tail, 0)->mapping = folio_page(head, 0)->mapping; - tail->pgmap = head->pgmap; folio_set_count(page_folio(rpage_tail), 1); } diff --git a/mm/debug.c b/mm/debug.c index 77fa8fe1d641..04fcc62d440f 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -77,9 +77,11 @@ static void __dump_folio(const struct folio *folio, const struct page *page, if (page_mapcount_is_type(mapcount)) mapcount = 0; - pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx %spfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, - folio->index + idx, pfn); + folio->index + idx, + folio_is_device_private(folio) ? "device " : "", + pfn); if (folio_test_large(folio)) { int pincount = 0; @@ -113,7 +115,8 @@ static void __dump_folio(const struct folio *folio, const struct page *page, * inaccuracy here due to racing. */ pr_warn("%sflags: %pGp%s\n", type, &folio->flags, - is_migrate_cma_folio(folio, pfn) ? 
" CMA" : ""); + (!folio_is_device_private(folio) && + is_migrate_cma_folio(folio, pfn)) ? " CMA" : ""); if (page_has_type(&folio->page)) pr_warn("page_type: %x(%s)\n", folio->page.page_type >> 24, page_type_name(folio->page.page_type)); diff --git a/mm/memremap.c b/mm/memremap.c index 63c6ab4fdf08..d09569764f95 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -12,9 +12,12 @@ #include #include #include +#include #include "internal.h" static DEFINE_XARRAY(pgmap_array); +static struct maple_tree device_private_pgmap_tree = + MTREE_INIT(device_private_pgmap_tree, MT_FLAGS_ALLOC_RANGE); /* * The memremap() and memremap_pages() interfaces are alternately used @@ -113,9 +116,10 @@ void memunmap_pages(struct dev_pagemap *pgmap) { int i; + WARN_ONCE(pgmap->type == MEMORY_DEVICE_PRIVATE, "Type should not be MEMORY_DEVICE_PRIVATE\n"); + percpu_ref_kill(&pgmap->ref); - if (pgmap->type != MEMORY_DEVICE_PRIVATE && - pgmap->type != MEMORY_DEVICE_COHERENT) + if (pgmap->type != MEMORY_DEVICE_COHERENT) for (i = 0; i < pgmap->nr_range; i++) percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); @@ -144,7 +148,6 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, int range_id, int nid) { - const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE; struct range *range = &pgmap->ranges[range_id]; struct dev_pagemap *conflict_pgmap; int error, is_ram; @@ -190,7 +193,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (error) goto err_pfn_remap; - if (!mhp_range_allowed(range->start, range_len(range), !is_private)) { + if (!mhp_range_allowed(range->start, range_len(range), true)) { error = -EINVAL; goto err_kasan; } @@ -198,30 +201,19 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, mem_hotplug_begin(); /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For all other device memory types, which are accessible by - * the CPU, we do want the linear mapping and thus use + * All device memory types except device private memory are accessible + * by the CPU, so we want the linear mapping and thus use * arch_add_memory(). 
*/ - if (is_private) { - error = add_pages(nid, PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), params); - } else { - error = kasan_add_zero_shadow(__va(range->start), range_len(range)); - if (error) { - mem_hotplug_done(); - goto err_kasan; - } - - error = arch_add_memory(nid, range->start, range_len(range), - params); + error = kasan_add_zero_shadow(__va(range->start), range_len(range)); + if (error) { + mem_hotplug_done(); + goto err_kasan; } + error = arch_add_memory(nid, range->start, range_len(range), + params); + if (!error) { struct zone *zone; @@ -248,8 +240,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, return 0; err_add_memory: - if (!is_private) - kasan_remove_zero_shadow(__va(range->start), range_len(range)); + kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); err_pfn_remap: @@ -281,22 +272,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: - if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { - WARN(1, "Device private memory not supported\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { - WARN(1, "Missing migrate_to_ram method\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->ops->folio_free) { - WARN(1, "Missing folio_free method\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->owner) { - WARN(1, "Missing owner\n"); - return ERR_PTR(-EINVAL); - } + WARN(1, "Use memremap_device_private_pagemap()\n"); + return ERR_PTR(-EINVAL); break; case MEMORY_DEVICE_COHERENT: if (!pgmap->ops->folio_free) { @@ -394,6 +371,31 @@ void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) } EXPORT_SYMBOL_GPL(devm_memunmap_pages); +static void devm_memremap_device_private_pagemap_release(void *data) +{ + memunmap_device_private_pagemap(data); +} + +int devm_memremap_device_private_pagemap(struct device *dev, struct dev_pagemap *pgmap) +{ + int ret; + + ret = memremap_device_private_pagemap(pgmap, dev_to_node(dev)); + if (ret) + return ret; + + ret = devm_add_action_or_reset(dev, devm_memremap_device_private_pagemap_release, + pgmap); + return ret; +} +EXPORT_SYMBOL_GPL(devm_memremap_device_private_pagemap); + +void devm_memunmap_device_private_pagemap(struct device *dev, struct dev_pagemap *pgmap) +{ + devm_release_action(dev, devm_memremap_device_private_pagemap_release, pgmap); +} +EXPORT_SYMBOL_GPL(devm_memunmap_device_private_pagemap); + /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map @@ -493,3 +495,113 @@ void zone_device_page_init(struct page *page, unsigned int order) prep_compound_page(page, order); } EXPORT_SYMBOL_GPL(zone_device_page_init); + +unsigned long memremap_device_private_pagemap(struct dev_pagemap *pgmap, int nid) +{ + unsigned long dpfn, dpfn_first, dpfn_last = 0; + unsigned long start; + int rc; + + if (pgmap->type != MEMORY_DEVICE_PRIVATE) { + WARN(1, "Not device private memory\n"); + return -EINVAL; + } + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return -EINVAL; + } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return -EINVAL; + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return -EINVAL; + } + + pgmap->pages = kvzalloc(sizeof(*pgmap->pages) * pgmap->nr_pages, + GFP_KERNEL); + if (!pgmap->pages) + return -ENOMEM; + + rc = 
mtree_alloc_range(&device_private_pgmap_tree, &start, pgmap, + pgmap->nr_pages * PAGE_SIZE, 0, + 1ull << MAX_PHYSMEM_BITS, GFP_KERNEL); + if (rc < 0) + goto err_mtree_alloc; + + pgmap->range.start = start; + pgmap->range.end = pgmap->range.start + (pgmap->nr_pages * PAGE_SIZE) - 1; + pgmap->nr_range = 1; + + init_completion(&pgmap->done); + rc = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0, + GFP_KERNEL); + if (rc < 0) + goto err_ref_init; + + if (nid < 0) + nid = numa_mem_id(); + + dpfn_first = pgmap->range.start >> PAGE_SHIFT; + dpfn_last = dpfn_first + (range_len(&pgmap->range) >> PAGE_SHIFT); + for (dpfn = dpfn_first; dpfn < dpfn_last; dpfn++) { + struct page *page = device_private_offset_to_page(dpfn); + + __init_zone_device_page(page, dpfn, ZONE_DEVICE, nid, pgmap); + page_folio(page)->pgmap = (void *)pgmap; + } + + return 0; + +err_ref_init: + mtree_erase(&device_private_pgmap_tree, pgmap->range.start); +err_mtree_alloc: + kvfree(pgmap->pages); + return rc; +} +EXPORT_SYMBOL_GPL(memremap_device_private_pagemap); + +void memunmap_device_private_pagemap(struct dev_pagemap *pgmap) +{ + percpu_ref_kill(&pgmap->ref); + wait_for_completion(&pgmap->done); + percpu_ref_exit(&pgmap->ref); + kvfree(pgmap->pages); + mtree_erase(&device_private_pgmap_tree, pgmap->range.start); +} +EXPORT_SYMBOL_GPL(memunmap_device_private_pagemap); + +struct page *device_private_offset_to_page(unsigned long offset) +{ + struct dev_pagemap *pgmap; + + pgmap = mtree_load(&device_private_pgmap_tree, offset << PAGE_SHIFT); + if (WARN_ON_ONCE(!pgmap)) + return NULL; + + return &pgmap->pages[offset - (pgmap->range.start >> PAGE_SHIFT)]; +} +EXPORT_SYMBOL_GPL(device_private_offset_to_page); + +struct page *device_private_entry_to_page(softleaf_t entry) +{ + unsigned long offset; + + if (!(softleaf_is_device_private(entry) || + softleaf_is_migration_device_private(entry))) + return NULL; + + offset = softleaf_to_pfn(entry); + return device_private_offset_to_page(offset); +} + +pgoff_t device_private_page_to_offset(const struct page *page) +{ + struct dev_pagemap *pgmap = (struct dev_pagemap *)page_pgmap(page); + + VM_WARN_ON(!is_device_private_page(page)); + + return (pgmap->range.start >> PAGE_SHIFT) + ((page - pgmap->pages)); +} +EXPORT_SYMBOL_GPL(device_private_page_to_offset); diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..4a9420cb610c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1004,9 +1004,9 @@ static void __init memmap_init(void) } #ifdef CONFIG_ZONE_DEVICE -static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, - unsigned long zone_idx, int nid, - struct dev_pagemap *pgmap) +void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) { __init_single_page(page, pfn, zone_idx, nid); @@ -1038,7 +1038,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (pageblock_aligned(pfn)) { + if (pgmap->type != MEMORY_DEVICE_PRIVATE && pageblock_aligned(pfn)) { init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); cond_resched(); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 039a2d71e92f..d1c4fcce1e83 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -107,6 +107,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long 
pte_nr) { unsigned long pfn; + bool device_private = false; pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { @@ -115,6 +116,9 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) if (!softleaf_is_migration(entry)) return false; + if (softleaf_is_migration_device_private(entry)) + device_private = true; + pfn = softleaf_to_pfn(entry); } else if (pte_present(ptent)) { pfn = pte_pfn(ptent); @@ -127,8 +131,14 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) return false; pfn = softleaf_to_pfn(entry); + + if (softleaf_is_device_private(entry)) + device_private = true; } + if (device_private != (bool)(pvmw->flags & PVMW_DEVICE_PRIVATE)) + return false; + if ((pfn + pte_nr - 1) < pvmw->pfn) return false; if (pfn > (pvmw->pfn + pvmw->nr_pages - 1)) @@ -137,8 +147,11 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) } /* Returns true if the two ranges overlap. Careful to not overflow. */ -static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) +static bool check_pmd(unsigned long pfn, bool device_private, struct page_vma_mapped_walk *pvmw) { + if (device_private != (bool)(pvmw->flags & PVMW_DEVICE_PRIVATE)) + return false; + if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) return false; if (pfn > pvmw->pfn + pvmw->nr_pages - 1) @@ -254,14 +267,17 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) entry = softleaf_from_pmd(pmde); if (!softleaf_is_migration(entry) || - !check_pmd(softleaf_to_pfn(entry), pvmw)) + !check_pmd(softleaf_to_pfn(entry), + softleaf_is_device_private(entry) || + softleaf_is_migration_device_private(entry), + pvmw)) return not_found(pvmw); return true; } if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (!check_pmd(pmd_pfn(pmde), pvmw)) + if (!check_pmd(pmd_pfn(pmde), false, pvmw)) return not_found(pvmw); return true; } diff --git a/mm/rmap.c b/mm/rmap.c index 79cba3d441c3..a2725b3896d6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1860,7 +1860,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long nr_pages = 1, end_addr; - unsigned long pfn; + unsigned long nr; unsigned long hsz = 0; int ptes = 0; @@ -1967,15 +1967,20 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { - pfn = pte_pfn(pteval); + nr = pte_pfn(pteval) - folio_pfn(folio); } else { const softleaf_t entry = softleaf_from_pte(pteval); - pfn = softleaf_to_pfn(entry); + if (softleaf_is_device_private(entry) || + softleaf_is_migration_device_private(entry)) + nr = softleaf_to_pfn(entry) - device_private_folio_to_offset(folio); + else + nr = softleaf_to_pfn(entry) - folio_pfn(folio); + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } - subpage = folio_page(folio, pfn - folio_pfn(folio)); + subpage = folio_page(folio, nr); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -2288,7 +2293,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; - unsigned long pfn; + unsigned long nr; unsigned long hsz = 0; /* @@ -2327,7 +2332,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, while 
(page_vma_mapped_walk(&pvmw)) { /* PMD-mapped THP migration entry */ if (!pvmw.pte) { - __maybe_unused unsigned long pfn; + __maybe_unused softleaf_t entry; __maybe_unused pmd_t pmdval; if (flags & TTU_SPLIT_HUGE_PMD) { @@ -2339,12 +2344,17 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION pmdval = pmdp_get(pvmw.pmd); + entry = softleaf_from_pmd(pmdval); if (likely(pmd_present(pmdval))) - pfn = pmd_pfn(pmdval); - else - pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); + nr = pmd_pfn(pmdval) - folio_pfn(folio); + else if (softleaf_is_device_private(entry) || + softleaf_is_migration_device_private(entry)) { + nr = softleaf_to_pfn(entry) - device_private_folio_to_offset(folio); + } else { + nr = softleaf_to_pfn(entry) - folio_pfn(folio); + } - subpage = folio_page(folio, pfn - folio_pfn(folio)); + subpage = folio_page(folio, nr); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); @@ -2367,15 +2377,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { - pfn = pte_pfn(pteval); + nr = pte_pfn(pteval) - folio_pfn(folio); } else { const softleaf_t entry = softleaf_from_pte(pteval); - pfn = softleaf_to_pfn(entry); + if (softleaf_is_device_private(entry) || + softleaf_is_migration_device_private(entry)) + nr = softleaf_to_pfn(entry) - device_private_folio_to_offset(folio); + else + nr = softleaf_to_pfn(entry) - folio_pfn(folio); + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } - subpage = folio_page(folio, pfn - folio_pfn(folio)); + subpage = folio_page(folio, nr); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -2433,7 +2448,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, folio_mark_dirty(folio); writable = pte_write(pteval); } else if (likely(pte_present(pteval))) { - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(pteval)); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* diff --git a/mm/util.c b/mm/util.c index 65e3f1a97d76..8482ebc5c394 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1244,7 +1244,10 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page) struct folio *foliop; int loops = 5; - ps->pfn = page_to_pfn(page); + if (is_device_private_page(page)) + ps->pfn = device_private_page_to_offset(page); + else + ps->pfn = page_to_pfn(page); ps->flags = PAGE_SNAPSHOT_FAITHFUL; again: -- 2.34.1
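Not from the series itself: a hedged sketch of how the helpers introduced above compose, assuming 'page' comes from a pagemap created with memremap_device_private_pagemap(). The mydrv_check_round_trip() name is made up for illustration.

#include <linux/memremap.h>
#include <linux/migrate.h>
#include <linux/bug.h>

/*
 * Device private pages no longer have a pfn, so the page <-> offset
 * helpers and the MIGRATE_PFN_DEVICE_PRIVATE-aware mpfn encoding are
 * used in place of page_to_pfn()/pfn_to_page().
 */
static void mydrv_check_round_trip(struct page *page)
{
	pgoff_t offset = device_private_page_to_offset(page);
	unsigned long mpfn = migrate_pfn_from_page(page);

	/* offset -> page gives back the same struct page */
	WARN_ON(device_private_offset_to_page(offset) != page);

	/* the mpfn carries the device private marker and decodes back */
	WARN_ON(!(mpfn & MIGRATE_PFN_DEVICE_PRIVATE));
	WARN_ON(migrate_pfn_to_page(mpfn) != page);
}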