From: Ackerley Tng The dequeue_hugetlb_folio_vma() function currently handles the gbl_chg parameter to determine if a folio can be dequeued based on global page availability. This leaks reservation-specific logic into the dequeueing path. Relocate this logic to alloc_hugetlb_folio() so that dequeue_hugetlb_folio_vma() focuses solely on selecting and dequeuing a folio. In alloc_hugetlb_folio(), only attempt to dequeue a folio if a reservation exists (gbl_chg == 0) or if there are available huge pages in the global pool. No functional change intended. Reviewed-by: James Houghton Acked-by: Oscar Salvador Reviewed-by: Joshua Hahn Signed-off-by: Ackerley Tng --- mm/hugetlb.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f24bf49be047e..190ab539a97d4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1336,7 +1336,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, long gbl_chg) + unsigned long address) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1344,13 +1344,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, nodemask_t *nodemask; int nid; - /* - * gbl_chg==1 means the allocation requires a new page that was not - * reserved before. Making sure there's at least one free page. 
- */ - if (gbl_chg && !available_huge_pages(h)) - goto err; - gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); @@ -1368,9 +1361,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, mpol_cond_put(mpol); return folio; - -err: - return NULL; } #if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && defined(CONFIG_CONTIG_ALLOC) @@ -2939,12 +2929,17 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, goto out_uncharge_cgroup_reservation; spin_lock_irq(&hugetlb_lock); + /* - * glb_chg is passed to indicate whether or not a page must be taken - * from the global free pool (global change). gbl_chg == 0 indicates - * a reservation exists for the allocation. + * gbl_chg == 0 indicates a reservation exists for the + * allocation, so try dequeuing a page. In case there was no + * reservation, try dequeuing a page if there are available + * pages in the global pool. */ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); + folio = NULL; + if (!gbl_chg || available_huge_pages(h)) + folio = dequeue_hugetlb_folio_vma(h, vma, addr); + if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); -- 2.54.0.563.g4f69b47b94-goog From: Ackerley Tng Move memory policy interpretation out of alloc_buddy_hugetlb_folio_with_mpol() and into alloc_hugetlb_folio() to separate reading and interpretation of memory policy from actual allocation. This will later allow memory policy to be interpreted outside of the process of allocating a hugetlb folio entirely. This opens doors for other callers of the HugeTLB folio allocation function, such as guest_memfd, where memory may not always be mapped and hence may not have an associated vma. Introduce struct mempolicy_interpreted to hold all the components of an interpreted memory policy. Rename alloc_buddy_hugetlb_folio_with_mpol() to alloc_buddy_hugetlb_folio() since the function no longer interprets memory policy. 
No functional change intended. Reviewed-by: James Houghton Acked-by: Oscar Salvador Signed-off-by: Ackerley Tng --- include/uapi/linux/mempolicy.h | 2 +- mm/hugetlb.c | 50 +++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 6c962d866e864..7f6fc9599693b 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -16,7 +16,7 @@ */ /* Policies */ -enum { +enum mempolicy_mode { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 190ab539a97d4..6a5f69b3b1cb4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1334,6 +1334,12 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } +struct mempolicy_interpreted { + int nid; + nodemask_t *nodemask; + enum mempolicy_mode mode; +}; + static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -2155,32 +2161,28 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas return folio; } -/* - * Use the VMA's mpolicy to allocate a huge page from the buddy. 
- */ static -struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr) +struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, struct mempolicy_interpreted *mpoli) { struct folio *folio = NULL; - struct mempolicy *mpol; - gfp_t gfp_mask = htlb_alloc_mask(h); - int nid; - nodemask_t *nodemask; + nodemask_t *nodemask = mpoli->nodemask; - nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - if (mpol_is_preferred_many(mpol)) { + if (mpoli->mode == MPOL_PREFERRED_MANY) { gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); + folio = alloc_surplus_hugetlb_folio(h, gfp, mpoli->nid, + nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!folio) - folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); - mpol_cond_put(mpol); + if (!folio) { + folio = alloc_surplus_hugetlb_folio(h, gfp_mask, mpoli->nid, + nodemask); + } + return folio; } @@ -2869,7 +2871,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + gfp_t gfp = htlb_alloc_mask(h); idx = hstate_index(h); @@ -2941,8 +2943,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, folio = dequeue_hugetlb_folio_vma(h, vma, addr); if (!folio) { + struct mempolicy_interpreted mpoli; + struct mempolicy *mpol; + nodemask_t *nodemask; + int nid; + spin_unlock_irq(&hugetlb_lock); - folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); + nid = huge_node(vma, addr, gfp, &mpol, &nodemask); + mpoli = (struct mempolicy_interpreted){ + .nid = nid, + .mode = mpol->mode, + .nodemask = nodemask, + }; + folio = alloc_buddy_hugetlb_folio(h, gfp, &mpoli); + mpol_cond_put(mpol); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); @@ -2998,7 +3012,7 @@ struct folio 
*alloc_hugetlb_folio(struct vm_area_struct *vma, } } - ret = mem_cgroup_charge_hugetlb(folio, gfp); + ret = mem_cgroup_charge_hugetlb(folio, gfp | __GFP_RETRY_MAYFAIL); /* * Unconditionally increment NR_HUGETLB here. If it turns out that * mem_cgroup_charge_hugetlb failed, then immediately free the page and -- 2.54.0.563.g4f69b47b94-goog From: Ackerley Tng Move memory policy interpretation out of dequeue_hugetlb_folio_vma() and into alloc_hugetlb_folio() to separate reading and interpretation of memory policy from actual allocation. Also rename dequeue_hugetlb_folio_vma() to dequeue_hugetlb_folio() to remove association with vma and to align with alloc_buddy_hugetlb_folio(). This will later allow memory policy to be interpreted outside of the process of allocating a hugetlb folio entirely. This opens doors for other callers of the HugeTLB folio allocation function, such as guest_memfd, where memory may not always be mapped and hence may not have an associated vma. No functional change intended.
Signed-off-by: Ackerley Tng Reviewed-by: James Houghton --- mm/hugetlb.c | 57 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6a5f69b3b1cb4..9807bbe0d70df 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1340,32 +1340,26 @@ struct mempolicy_interpreted { enum mempolicy_mode mode; }; -static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, - struct vm_area_struct *vma, - unsigned long address) +static struct folio *dequeue_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, + struct mempolicy_interpreted *mpoli) { + nodemask_t *nodemask = mpoli->nodemask; struct folio *folio = NULL; - struct mempolicy *mpol; - gfp_t gfp_mask; - nodemask_t *nodemask; - int nid; - gfp_mask = htlb_alloc_mask(h); - nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - - if (mpol_is_preferred_many(mpol)) { + if (mpoli->mode == MPOL_PREFERRED_MANY) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, - nid, nodemask); + mpoli->nid, + nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!folio) + if (!folio) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, - nid, nodemask); - - mpol_cond_put(mpol); + mpoli->nid, + nodemask); + } return folio; } @@ -2871,7 +2865,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mempolicy_interpreted mpoli; gfp_t gfp = htlb_alloc_mask(h); + struct mempolicy *mpol; + nodemask_t *nodemask; + int nid; idx = hstate_index(h); @@ -2930,6 +2928,14 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; + /* Takes reference on mpol. 
*/ + nid = huge_node(vma, addr, gfp, &mpol, &nodemask); + mpoli = (struct mempolicy_interpreted){ + .nid = nid, + .mode = mpol->mode, + .nodemask = nodemask, + }; + spin_lock_irq(&hugetlb_lock); /* @@ -2940,31 +2946,24 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ folio = NULL; if (!gbl_chg || available_huge_pages(h)) - folio = dequeue_hugetlb_folio_vma(h, vma, addr); + folio = dequeue_hugetlb_folio(h, gfp, &mpoli); if (!folio) { - struct mempolicy_interpreted mpoli; - struct mempolicy *mpol; - nodemask_t *nodemask; - int nid; - spin_unlock_irq(&hugetlb_lock); - nid = huge_node(vma, addr, gfp, &mpol, &nodemask); - mpoli = (struct mempolicy_interpreted){ - .nid = nid, - .mode = mpol->mode, - .nodemask = nodemask, - }; folio = alloc_buddy_hugetlb_folio(h, gfp, &mpoli); mpol_cond_put(mpol); - if (!folio) + if (!folio) { + mpol_cond_put(mpol); goto out_uncharge_cgroup; + } spin_lock_irq(&hugetlb_lock); list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + mpol_cond_put(mpol); + /* * Either dequeued or buddy-allocated folio needs to add special * mark to the folio when it consumes a global reservation. -- 2.54.0.563.g4f69b47b94-goog From: Ackerley Tng Refactor alloc_hugetlb_folio to use a local variable for returning error codes. Instead of returning ERR_PTR(-ENOSPC) at the end of the error path, assign -ENOSPC to a return variable at each failure point and return that variable at the end. This allows the cleanup goto targets to be used with other errors in a later patch. No functional change intended. 
Signed-off-by: Ackerley Tng --- mm/hugetlb.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9807bbe0d70df..ad07e72d6fac3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2903,8 +2903,10 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) + if (gbl_chg < 0) { + ret = -ENOSPC; goto out_end_reservation; + } } else { /* * If we have the vma reservation ready, no need for extra @@ -2920,13 +2922,17 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); - if (ret) + if (ret) { + ret = -ENOSPC; goto out_subpool_put; + } } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); - if (ret) + if (ret) { + ret = -ENOSPC; goto out_uncharge_cgroup_reservation; + } /* Takes reference on mpol. */ nid = huge_node(vma, addr, gfp, &mpol, &nodemask); @@ -2954,6 +2960,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, mpol_cond_put(mpol); if (!folio) { mpol_cond_put(mpol); + ret = -ENOSPC; goto out_uncharge_cgroup; } spin_lock_irq(&hugetlb_lock); @@ -3046,7 +3053,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); + return ERR_PTR(ret); } static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) -- 2.54.0.563.g4f69b47b94-goog From: Ackerley Tng Move mem_cgroup_charge_hugetlb() earlier in the folio allocation process. This change draws a cleaner line between memcg charging and the subsequent hugetlb-specific reservation logic for VMAs and subpools. While it would be ideal to make all accounting and reservations perfectly symmetric, mem_cgroup_charge_hugetlb() is a complex operation that cannot be performed under the hugetlb_lock. 
Moving the charge to this earlier point ensures that memcg charging is handled before the code begins manipulating subpool and VMA-specific state. These two types of accounting will be separated in a future patch. If mem_cgroup_charge_hugetlb() fails, the code now branches to out_subpool_put to ensure the folio is freed and the subpool references are handled correctly. Signed-off-by: Ackerley Tng --- mm/hugetlb.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ad07e72d6fac3..81e73186dff09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2991,6 +2991,24 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, spin_unlock_irq(&hugetlb_lock); + ret = mem_cgroup_charge_hugetlb(folio, gfp | __GFP_RETRY_MAYFAIL); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. + */ + lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); + + if (ret == -ENOMEM) { + free_huge_folio(folio); + /* + * Skip uncharging hugetlb_cgroup since the charges + * were committed to the folio and freeing the folio + * would have cleared those up. + */ + goto out_subpool_put; + } + hugetlb_set_folio_subpool(folio, spool); if (map_chg != MAP_CHG_ENFORCED) { @@ -3018,19 +3036,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, } } - ret = mem_cgroup_charge_hugetlb(folio, gfp | __GFP_RETRY_MAYFAIL); - /* - * Unconditionally increment NR_HUGETLB here. If it turns out that - * mem_cgroup_charge_hugetlb failed, then immediately free the page and - * decrement NR_HUGETLB. 
- */ - lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); - - if (ret == -ENOMEM) { - free_huge_folio(folio); - return ERR_PTR(-ENOMEM); - } - return folio; out_uncharge_cgroup: -- 2.54.0.563.g4f69b47b94-goog From: Ackerley Tng Refactor hugetlb_alloc_folio() out of alloc_hugetlb_folio(). The new function handles allocation of a folio together with memcg and HugeTLB cgroup charging. This refactoring decouples the HugeTLB page allocation from VMAs, specifically: 1. Reservations (as in resv_map) are stored in the vma 2. mpol is stored at vma->vm_policy 3. A vma must be used for allocation even if the pages are not meant to be used by the host process. Without this coupling, VMAs are no longer a requirement for allocation. This opens up the allocation routine for usage without VMAs, which will allow guest_memfd to use HugeTLB as a more generic allocator of huge pages, since guest_memfd memory may not have any associated VMAs by design. In addition, direct allocations from HugeTLB could possibly be refactored to avoid the use of a pseudo-VMA. Also, this decouples HugeTLB page allocation from HugeTLBfs, where the subpool is stored at the fs mount. This is also a requirement for guest_memfd, where the plan is to have a subpool created per-fd and stored on the inode. Provide and use alloc_flags to allow more allocation knobs in the future without expanding the number of parameters in hugetlb_alloc_folio(). No functional change intended.
Signed-off-by: Ackerley Tng --- include/linux/hugetlb.h | 19 +++++ mm/hugetlb.c | 188 +++++++++++++++++++++++++----------------------- 2 files changed, 117 insertions(+), 90 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 93418625d3c5f..9a0222851573d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H +#include #include #include #include @@ -705,6 +706,24 @@ bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); + +struct mempolicy_interpreted { + int nid; + nodemask_t *nodemask; + enum mempolicy_mode mode; +}; + +enum hugetlb_alloc_flag { + HUGETLB_ALLOC_CHARGE_CGROUP_RSVD_BIT = 0, + HUGETLB_ALLOC_USE_GLOBAL_RESERVATIONS_BIT, +}; + +#define HUGETLB_ALLOC_CHARG_CGROUP_RSVD BIT(HUGETLB_ALLOC_CHARGE_CGROUP_RSVD_BIT) +#define HUGETLB_ALLOC_USE_GLOBAL_RESERVATIONS BIT(HUGETLB_ALLOC_USE_GLOBAL_RESERVATIONS_BIT) + +struct folio *hugetlb_alloc_folio(struct hstate *h, struct hugepage_subpool *spool, + gfp_t gfp, struct mempolicy_interpreted *mpoli, + u8 alloc_flags); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 81e73186dff09..abce2ca76fb9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1334,12 +1334,6 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } -struct mempolicy_interpreted { - int nid; - nodemask_t *nodemask; - enum mempolicy_mode mode; -}; - static struct folio *dequeue_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, struct mempolicy_interpreted *mpoli) { @@ -2829,6 +2823,90 @@ void 
wait_for_freed_hugetlb_folios(void) flush_work(&free_hpage_work); } +struct folio *hugetlb_alloc_folio(struct hstate *h, struct hugepage_subpool *spool, + gfp_t gfp, struct mempolicy_interpreted *mpoli, + u8 alloc_flags) +{ + bool charge_hugetlb_cgroup_rsvd = alloc_flags & + HUGETLB_ALLOC_CHARG_CGROUP_RSVD; + bool use_global_reservation = alloc_flags & + HUGETLB_ALLOC_USE_GLOBAL_RESERVATIONS; + size_t nr_pages = pages_per_huge_page(h); + struct hugetlb_cgroup *h_cg = NULL; + int idx = hstate_index(h); + struct folio *folio; + int ret; + + if (charge_hugetlb_cgroup_rsvd && + hugetlb_cgroup_charge_cgroup_rsvd(idx, nr_pages, &h_cg)) + return ERR_PTR(-ENOSPC); + + if (hugetlb_cgroup_charge_cgroup(idx, nr_pages, &h_cg)) { + ret = -ENOSPC; + goto err_uncharge_hugetlb_cgroup_rsvd; + } + + spin_lock_irq(&hugetlb_lock); + + folio = NULL; + if (use_global_reservation || available_huge_pages(h)) + folio = dequeue_hugetlb_folio(h, gfp, mpoli); + + if (!folio) { + spin_unlock_irq(&hugetlb_lock); + folio = alloc_buddy_hugetlb_folio(h, gfp, mpoli); + if (!folio) { + ret = -ENOSPC; + goto err_uncharge_hugetlb_cgroup; + } + spin_lock_irq(&hugetlb_lock); + list_add(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); + } + + if (use_global_reservation) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); + + if (charge_hugetlb_cgroup_rsvd) { + hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), + h_cg, folio); + } + + spin_unlock_irq(&hugetlb_lock); + + ret = mem_cgroup_charge_hugetlb(folio, gfp | __GFP_RETRY_MAYFAIL); + /* + * Unconditionally increment NR_HUGETLB here because if + * mem_cgroup_charge_hugetlb failed, freeing the page will + * decrement NR_HUGETLB. 
+ */ + lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); + + if (ret == -ENOMEM) { + free_huge_folio(folio); + /* + * Skip uncharging hugetlb_cgroup since the charges + * were committed to the folio and freeing the folio + * would have cleared those up. + */ + return ERR_PTR(ret); + } + + return folio; + + err_uncharge_hugetlb_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg); + err_uncharge_hugetlb_cgroup_rsvd: + if (charge_hugetlb_cgroup_rsvd) + hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, h_cg); + + return ERR_PTR(ret); +} + typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv @@ -2864,11 +2942,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; - struct hugetlb_cgroup *h_cg = NULL; struct mempolicy_interpreted mpoli; gfp_t gfp = htlb_alloc_mask(h); struct mempolicy *mpol; nodemask_t *nodemask; + u8 alloc_flags = 0; int nid; idx = hstate_index(h); @@ -2916,23 +2994,18 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, } /* - * If this allocation is not consuming a per-vma reservation, - * charge the hugetlb cgroup now. + * If allocation doesn't reuse a reservation in the resv_map, + * charge for the reservation. */ - if (map_chg) { - ret = hugetlb_cgroup_charge_cgroup_rsvd( - idx, pages_per_huge_page(h), &h_cg); - if (ret) { - ret = -ENOSPC; - goto out_subpool_put; - } - } + if (map_chg != MAP_CHG_REUSE) + alloc_flags |= HUGETLB_ALLOC_CHARG_CGROUP_RSVD; - ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); - if (ret) { - ret = -ENOSPC; - goto out_uncharge_cgroup_reservation; - } + /* + * gbl_chg == 0 indicates a reservation exists for this + * allocation, so try to use it. + */ + if (gbl_chg == 0) + alloc_flags |= HUGETLB_ALLOC_USE_GLOBAL_RESERVATIONS; /* Takes reference on mpol. 
*/ nid = huge_node(vma, addr, gfp, &mpol, &nodemask); @@ -2942,70 +3015,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, .nodemask = nodemask, }; - spin_lock_irq(&hugetlb_lock); - - /* - * gbl_chg == 0 indicates a reservation exists for the - * allocation, so try dequeuing a page. In case there was no - * reservation, try dequeuing a page if there are available - * pages in the global pool. - */ - folio = NULL; - if (!gbl_chg || available_huge_pages(h)) - folio = dequeue_hugetlb_folio(h, gfp, &mpoli); - - if (!folio) { - spin_unlock_irq(&hugetlb_lock); - folio = alloc_buddy_hugetlb_folio(h, gfp, &mpoli); - mpol_cond_put(mpol); - if (!folio) { - mpol_cond_put(mpol); - ret = -ENOSPC; - goto out_uncharge_cgroup; - } - spin_lock_irq(&hugetlb_lock); - list_add(&folio->lru, &h->hugepage_activelist); - folio_ref_unfreeze(folio, 1); - /* Fall through */ - } + folio = hugetlb_alloc_folio(h, spool, gfp, &mpoli, alloc_flags); mpol_cond_put(mpol); - /* - * Either dequeued or buddy-allocated folio needs to add special - * mark to the folio when it consumes a global reservation. - */ - if (!gbl_chg) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); - /* If allocation is not consuming a reservation, also store the - * hugetlb_cgroup pointer on the page. - */ - if (map_chg) { - hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, folio); - } - - spin_unlock_irq(&hugetlb_lock); - - ret = mem_cgroup_charge_hugetlb(folio, gfp | __GFP_RETRY_MAYFAIL); - /* - * Unconditionally increment NR_HUGETLB here. If it turns out that - * mem_cgroup_charge_hugetlb failed, then immediately free the page and - * decrement NR_HUGETLB. 
- */ - lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); - - if (ret == -ENOMEM) { - free_huge_folio(folio); - /* - * Skip uncharging hugetlb_cgroup since the charges - * were committed to the folio and freeing the folio - * would have cleared those up. - */ + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out_subpool_put; } @@ -3038,12 +3053,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, return folio; -out_uncharge_cgroup: - hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); -out_uncharge_cgroup_reservation: - if (map_chg) - hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), - h_cg); out_subpool_put: /* * put page to subpool iff the quota of subpool's rsv_hpages is used @@ -3054,7 +3063,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_acct_memory(h, -gbl_reserve); } - out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); -- 2.54.0.563.g4f69b47b94-goog