SWAP_NR_ORDERS sizes a few small bounded arrays inside the THP swap allocator code (nofull/frag cluster lists, percpu_swap_cluster's si/offset arrays, and the next array for rotational devices). This currently expands to PMD_ORDER+1, which only works when PMD_ORDER is a compile-time constant. However, on architectures like PowerPC Book3S64, PMD_ORDER is a runtime variable which depends upon which MMU is selected (Radix / Hash), so in that case PMD_ORDER cannot be used to size the static arrays. This patch provides an optional ARCH_MAX_PMD_ORDER (upper-bound) override for such architectures. The memory overhead of enabling this override is negligible. By contrast, if the SWAP_NR_ORDERS-sized arrays were allocated at runtime, default slab padding could still cause some memory waste, and we would also lose the per-cpu cacheline benefits (for percpu_swap_cluster), because fetching si[order]/offset[order] in swap_alloc_fast() might cost an extra cacheline indirection. Note that a fully runtime SWAP_NR_ORDERS was considered in a previous version but was dropped for this reason [1]. [1]: https://lore.kernel.org/linuxppc-dev/pl1zdksc.ritesh.list@gmail.com/ Suggested-by: YoungJun Park Signed-off-by: Ritesh Harjani (IBM) --- arch/powerpc/include/asm/book3s/64/pgtable.h | 7 +++++++ include/linux/swap.h | 12 +++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e67e64ac6e8c..7f22d5d5fbdf 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -204,6 +204,13 @@ extern unsigned long __pmd_frag_size_shift; #define MAX_PTRS_PER_PGD (1 << (H_PGD_INDEX_SIZE > RADIX_PGD_INDEX_SIZE ? \ H_PGD_INDEX_SIZE : RADIX_PGD_INDEX_SIZE)) +/* + * Compile-time upper bound on PMD_ORDER across hash and radix MMUs. + * Used by THP SWAP code. Check include/linux/swap.h + */ +#define ARCH_MAX_PMD_ORDER ((H_PTE_INDEX_SIZE > RADIX_PTE_INDEX_SIZE) ? 
\ + H_PTE_INDEX_SIZE : RADIX_PTE_INDEX_SIZE) + /* PMD_SHIFT determines what a second-level page table entry can map */ #define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) #define PMD_SIZE (1UL << PMD_SHIFT) diff --git a/include/linux/swap.h b/include/linux/swap.h index 46c25523d7b8..4e1701b4a565 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -223,11 +223,21 @@ enum { */ #define SWAP_ENTRY_INVALID 0 +/* + * ARCH_MAX_PMD_ORDER is an optional arch hook: a compile-time upper bound for + * PMD_ORDER across all possible MMU configurations of that arch. It is used to + * size SWAP_NR_ORDERS on architectures (e.g. powerpc book3s64) where PMD_ORDER + * is selected at boot rather than at compile time. + */ #ifdef CONFIG_THP_SWAP +#ifdef ARCH_MAX_PMD_ORDER +#define SWAP_NR_ORDERS (ARCH_MAX_PMD_ORDER + 1) +#else #define SWAP_NR_ORDERS (PMD_ORDER + 1) +#endif /* ARCH_MAX_PMD_ORDER */ #else #define SWAP_NR_ORDERS 1 -#endif +#endif /* CONFIG_THP_SWAP */ /* * We keep using same cluster for rotational device so IO will be sequential. -- 2.39.5