On PowerPC Book3S64, MMU is selected at runtime, so macros like PMD_SHIFT are effectively runtime variables in the Book3S64 code. THP swap code uses these macros to size some of its array data structures based on PMD_ORDER e.g. SWAPFILE_CLUSTER macro is used for this very purpose. Hence this patch initializes SWAPFILE_CLUSTER at runtime and also modifies swap_table and swap_memcg_table which were earlier using this macro for defining the number of table entries. Signed-off-by: Ritesh Harjani (IBM) --- mm/swap.h | 5 +++-- mm/swap_table.h | 6 ++---- mm/swapfile.c | 27 ++++++++++++++++++++++----- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 77d2d14eda42..956879a69ddd 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -26,11 +26,12 @@ extern int page_cluster; #define SWAP_TABLE_HAS_ZEROFLAG ((BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - \ SWAP_CACHE_PFN_BITS) > SWAP_COUNT_MIN_BITS) +extern unsigned int swap_slots_in_cluster __read_mostly; +#define SWAPFILE_CLUSTER swap_slots_in_cluster + #ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else -#define SWAPFILE_CLUSTER 256 #define swap_entry_order(order) 0 #endif diff --git a/mm/swap_table.h b/mm/swap_table.h index e6613e62f8d0..90e2a7852300 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -8,16 +8,14 @@ /* A typical flat array in each cluster as swap table */ struct swap_table { - atomic_long_t entries[SWAPFILE_CLUSTER]; + DECLARE_FLEX_ARRAY(atomic_long_t, entries); }; /* For storing memcg private id */ struct swap_memcg_table { - unsigned short id[SWAPFILE_CLUSTER]; + DECLARE_FLEX_ARRAY(unsigned short, id); }; -#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) - /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. 
The swap table in each cluster is a diff --git a/mm/swapfile.c b/mm/swapfile.c index 78b49b0658ad..80846bfb5f76 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -129,6 +129,17 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; +unsigned int swap_slots_in_cluster __read_mostly; +static bool swap_table_use_page __read_mostly; + +static unsigned int generic_swap_slots_in_clusters(void) +{ + if (IS_ENABLED(CONFIG_THP_SWAP)) + return HPAGE_PMD_NR; + else + return 256; +} + /* May return NULL on invalid type, caller must check for NULL return */ static struct swap_info_struct *swap_type_to_info(int type) { @@ -437,7 +448,7 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci) return; rcu_assign_pointer(ci->table, NULL); - if (!SWP_TABLE_USE_PAGE) { + if (!swap_table_use_page) { kmem_cache_free(swap_table_cachep, table); return; } @@ -456,7 +467,7 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) if (rcu_access_pointer(ci->table)) return 0; - if (SWP_TABLE_USE_PAGE) { + if (swap_table_use_page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (folio) table = folio_address(folio); @@ -471,7 +482,8 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) #ifdef CONFIG_MEMCG if (!mem_cgroup_disabled()) { VM_WARN_ON_ONCE(ci->memcg_table); - ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp); + ci->memcg_table = kzalloc_flex(*ci->memcg_table, id, + SWAPFILE_CLUSTER, gfp); if (!ci->memcg_table) { swap_cluster_free_table(ci); return -ENOMEM; @@ -3912,14 +3924,19 @@ static int __init swapfile_init(void) { swapfile_maximum_size = arch_max_swapfile_size(); + swap_slots_in_cluster = generic_swap_slots_in_clusters(); + swap_table_use_page = + (swap_slots_in_cluster * sizeof(atomic_long_t) == PAGE_SIZE); + /* * Once a cluster is freed, it's swap table content is read * only, and all swap cache readers (swap_cache_*) verifies * the content before use. 
So it's safe to use RCU slab here. */ - if (!SWP_TABLE_USE_PAGE) + if (!swap_table_use_page) swap_table_cachep = kmem_cache_create("swap_table", - sizeof(struct swap_table), + struct_size_t(struct swap_table, entries, + SWAPFILE_CLUSTER), 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); #ifdef CONFIG_MIGRATION -- 2.39.5 SWAP_NR_ORDERS sizes a few small bounded arrays inside THP swap allocator code (nofull/frag cluster lists, percpu_swap_cluster's si/offset arrays, next array for rotational device). This currently expands to PMD_ORDER+1, which only works when PMD_ORDER is a compile-time constant. However, on architectures like PowerPC Book3S64, PMD_ORDER is a runtime variable which depends upon which MMU is selected (Radix / Hash), so in that case, PMD_ORDER cannot be used to size the static arrays. This patch provides an optional ARCH_MAX_PMD_ORDER (upper-bound) override for such architectures. The memory overhead on enabling this override is negligible. Even if we made SWAP_NR_ORDERS a runtime allocation, default slab padding could still cause some memory waste. We would also lose the per-cpu cacheline benefits (for percpu_swap_cluster), because fetching si[order]/offset[order] in swap_alloc_fast() might cost an extra cacheline indirection.
Note that a fully runtime SWAP_NR_ORDERS was considered in a previous version but was dropped for these reasons [1]. [1]: https://lore.kernel.org/linuxppc-dev/pl1zdksc.ritesh.list@gmail.com/ Suggested-by: YoungJun Park Signed-off-by: Ritesh Harjani (IBM) --- arch/powerpc/include/asm/book3s/64/pgtable.h | 7 +++++++ include/linux/swap.h | 12 +++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e67e64ac6e8c..7f22d5d5fbdf 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -204,6 +204,13 @@ extern unsigned long __pmd_frag_size_shift; #define MAX_PTRS_PER_PGD (1 << (H_PGD_INDEX_SIZE > RADIX_PGD_INDEX_SIZE ? \ H_PGD_INDEX_SIZE : RADIX_PGD_INDEX_SIZE)) +/* + * Compile-time upper bound on PMD_ORDER across hash and radix MMUs. + * Used by THP SWAP code. Check include/linux/swap.h + */ +#define ARCH_MAX_PMD_ORDER ((H_PTE_INDEX_SIZE > RADIX_PTE_INDEX_SIZE) ? \ + H_PTE_INDEX_SIZE : RADIX_PTE_INDEX_SIZE) + /* PMD_SHIFT determines what a second-level page table entry can map */ #define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) #define PMD_SIZE (1UL << PMD_SHIFT) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8f0f68e245ba..317168aa2db5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -229,11 +229,21 @@ enum { */ #define SWAP_ENTRY_INVALID 0 +/* + * ARCH_MAX_PMD_ORDER is an optional arch hook: a compile-time upper bound for + * PMD_ORDER across all possible MMU configurations of that arch. It is used to + * size SWAP_NR_ORDERS on architectures (e.g. powerpc book3s64) where PMD_ORDER + * is selected at boot rather than at compile time.
+ */ #ifdef CONFIG_THP_SWAP +#ifdef ARCH_MAX_PMD_ORDER +#define SWAP_NR_ORDERS (ARCH_MAX_PMD_ORDER + 1) +#else #define SWAP_NR_ORDERS (PMD_ORDER + 1) +#endif /* ARCH_MAX_PMD_ORDER */ #else #define SWAP_NR_ORDERS 1 -#endif +#endif /* CONFIG_THP_SWAP */ /* * We keep using same cluster for rotational device so IO will be sequential. -- 2.39.5 THP_SWAP avoids splitting of a transparent huge folio into 32 smaller 64K folios (Radix-64K pagesize / 2M PMD) or into 256 smaller 64K folios (Hash-64K pagesize / 16M PMD), during swapout. This improves the swapping performance since all the bookkeeping & I/O submission happen once per large folio. More details at [1]. PowerPC Book3S64 could not enable this before because PMD_ORDER is selected at runtime depending upon the chosen MMU. The earlier patches in this series turn SWAPFILE_CLUSTER into a runtime value and introduce an ARCH_MAX_PMD_ORDER upper-bound override for SWAP_NR_ORDERS. With those changes, we can now enable THP SWAP for Book3S64. This increases swapout throughput with the zram backend by 40-50% with Radix and 100-130% with Hash (Tested by Sayali). [1]: https://lore.kernel.org/all/20170515112522.32457-2-ying.huang@intel.com/ Tested-by: Sayali Patil Signed-off-by: Ritesh Harjani (IBM) --- arch/powerpc/platforms/Kconfig.cputype | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index bac02c83bb3e..48f74bd22343 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -113,6 +113,7 @@ config PPC_THP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE + select ARCH_WANTS_THP_SWAP if TRANSPARENT_HUGEPAGE choice prompt "CPU selection" -- 2.39.5