By default, one byte per PFN is used to store hotness information. Limited number of bits are used to store the access time leading to coarse-grained time tracking. Also there aren't enough bits to track the toptier NID explicitly and hence the default target_nid is used for promotion. This precise mode relaxes the above situation by storing the hotness information in 4 bytes per PFN. More fine-grained access time tracking and toptier NID tracking becomes possible in this mode. Typically useful when toptier consists of more than one node. Signed-off-by: Bharata B Rao --- Documentation/admin-guide/mm/pghot.txt | 4 +- include/linux/mmzone.h | 2 +- include/linux/pghot.h | 31 ++++++++++++ mm/Kconfig | 11 ++++ mm/Makefile | 7 ++- mm/pghot-precise.c | 70 ++++++++++++++++++++++++++ mm/pghot.c | 13 +++-- 7 files changed, 130 insertions(+), 8 deletions(-) create mode 100644 mm/pghot-precise.c diff --git a/Documentation/admin-guide/mm/pghot.txt b/Documentation/admin-guide/mm/pghot.txt index 01291b72e7ab..b329e692ef89 100644 --- a/Documentation/admin-guide/mm/pghot.txt +++ b/Documentation/admin-guide/mm/pghot.txt @@ -38,7 +38,7 @@ Path: /sys/kernel/debug/pghot/ 3. **freq_threshold** - Minimum access frequency before a page is marked ready for promotion. - - Range: 1 to 3 + - Range: 1 to 3 in default mode, 1 to 7 in precision mode. - Default: 2 - Example: # echo 3 > /sys/kernel/debug/pghot/freq_threshold @@ -60,7 +60,7 @@ Path: /proc/sys/vm/pghot_promote_freq_window_ms - Controls the time window (in ms) for counting access frequency. A page is considered hot only when **freq_threshold** number of accesses occur with this time period. -- Default: 4000 (4 seconds) +- Default: 4000 (4 seconds) in default mode and 5000 (5s) in precision mode. - Example: # sysctl vm.pghot_promote_freq_window_ms=3000 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 22e08befb096..49c374064fc2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1924,7 +1924,7 @@ struct mem_section { #ifdef CONFIG_PGHOT /* * Per-PFN hotness data for this section. - * Array of phi_t (u8 in default mode). + * Array of phi_t (u8 in default mode, u32 in precision mode). * LSB is used as PGHOT_SECTION_HOT_BIT flag. */ void *hot_map; diff --git a/include/linux/pghot.h b/include/linux/pghot.h index 88e57aab697b..d3d59b0c0cf6 100644 --- a/include/linux/pghot.h +++ b/include/linux/pghot.h @@ -48,6 +48,36 @@ enum pghot_src_enabled { #define PGHOT_DEFAULT_NODE 0 +#if defined(CONFIG_PGHOT_PRECISE) +#define PGHOT_DEFAULT_FREQ_WINDOW (5 * MSEC_PER_SEC) + +/* + * Bits 0-26 are used to store nid, frequency and time. + * Bits 27-30 are unused now. + * Bit 31 is used to indicate the page is ready for migration. + */ +#define PGHOT_MIGRATE_READY 31 + +#define PGHOT_NID_WIDTH 10 +#define PGHOT_FREQ_WIDTH 3 +/* time is stored in 14 bits which can represent up to 16s with HZ=1000 */ +#define PGHOT_TIME_WIDTH 14 + +#define PGHOT_NID_SHIFT 0 +#define PGHOT_FREQ_SHIFT (PGHOT_NID_SHIFT + PGHOT_NID_WIDTH) +#define PGHOT_TIME_SHIFT (PGHOT_FREQ_SHIFT + PGHOT_FREQ_WIDTH) + +#define PGHOT_NID_MASK GENMASK(PGHOT_NID_WIDTH - 1, 0) +#define PGHOT_FREQ_MASK GENMASK(PGHOT_FREQ_WIDTH - 1, 0) +#define PGHOT_TIME_MASK GENMASK(PGHOT_TIME_WIDTH - 1, 0) + +#define PGHOT_NID_MAX ((1 << PGHOT_NID_WIDTH) - 1) +#define PGHOT_FREQ_MAX ((1 << PGHOT_FREQ_WIDTH) - 1) +#define PGHOT_TIME_MAX ((1 << PGHOT_TIME_WIDTH) - 1) + +typedef u32 phi_t; + +#else /* !CONFIG_PGHOT_PRECISE */ #define PGHOT_DEFAULT_FREQ_WINDOW (4 * MSEC_PER_SEC) /* @@ -74,6 +104,7 @@ enum pghot_src_enabled { #define PGHOT_TIME_MAX ((1 << PGHOT_TIME_WIDTH) - 1) typedef u8 phi_t; +#endif /* CONFIG_PGHOT_PRECISE */ #define PGHOT_RECORD_SIZE sizeof(phi_t) diff --git a/mm/Kconfig b/mm/Kconfig index f4f0147faac5..fde5aee3e16f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1478,6 +1478,17 @@ config PGHOT This adds 1 byte of metadata overhead per page in lower-tier memory nodes. +config PGHOT_PRECISE + bool "Hot page tracking precision mode" + def_bool n + depends on PGHOT + help + Enables precision mode for tracking hot pages with pghot sub-system. + Adds fine-grained access time tracking and explicit toptier target + NID tracking. Precise hot page tracking comes at the cost of using + 4 bytes per page against the default one byte per page. Preferable + to enable this on systems with multiple nodes in toptier. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 655a27f3a215..89f999647752 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,4 +147,9 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o -obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o pghot-default.o +obj-$(CONFIG_PGHOT) += pghot.o pghot-tunables.o +ifdef CONFIG_PGHOT_PRECISE +obj-$(CONFIG_PGHOT) += pghot-precise.o +else +obj-$(CONFIG_PGHOT) += pghot-default.o +endif diff --git a/mm/pghot-precise.c b/mm/pghot-precise.c new file mode 100644 index 000000000000..d8d4f15b3f9f --- /dev/null +++ b/mm/pghot-precise.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * pghot: Precision mode + * + * 4 byte hotness record per PFN (u32) + * NID, time and frequency tracked as part of the record. + */ + +#include +#include + +unsigned long pghot_access_latency(unsigned long old_time, unsigned long time) +{ + return jiffies_to_msecs((time - old_time) & PGHOT_TIME_MASK); +} + +bool pghot_update_record(phi_t *phi, int nid, unsigned long now) +{ + phi_t freq, old_freq, hotness, old_hotness, old_time, old_nid; + phi_t time = now & PGHOT_TIME_MASK; + + old_hotness = READ_ONCE(*phi); + do { + bool new_window = false; + + hotness = old_hotness; + old_nid = (hotness >> PGHOT_NID_SHIFT) & PGHOT_NID_MASK; + old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK; + old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK; + + if (pghot_access_latency(old_time, time) > sysctl_pghot_freq_window) + new_window = true; + + if (new_window) + freq = 1; + else if (old_freq < PGHOT_FREQ_MAX) + freq = old_freq + 1; + else + freq = old_freq; + nid = (nid == NUMA_NO_NODE) ? pghot_target_nid : nid; + + hotness &= ~(PGHOT_NID_MASK << PGHOT_NID_SHIFT); + hotness &= ~(PGHOT_FREQ_MASK << PGHOT_FREQ_SHIFT); + hotness &= ~(PGHOT_TIME_MASK << PGHOT_TIME_SHIFT); + + hotness |= (nid & PGHOT_NID_MASK) << PGHOT_NID_SHIFT; + hotness |= (freq & PGHOT_FREQ_MASK) << PGHOT_FREQ_SHIFT; + hotness |= (time & PGHOT_TIME_MASK) << PGHOT_TIME_SHIFT; + + if (freq >= pghot_freq_threshold) + hotness |= BIT(PGHOT_MIGRATE_READY); + } while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness))); + return !!(hotness & BIT(PGHOT_MIGRATE_READY)); +} + +int pghot_get_record(phi_t *phi, int *nid, int *freq, unsigned long *time) +{ + phi_t old_hotness, hotness = 0; + + old_hotness = READ_ONCE(*phi); + do { + if (!(old_hotness & BIT(PGHOT_MIGRATE_READY))) + return -EINVAL; + } while (unlikely(!try_cmpxchg(phi, &old_hotness, hotness))); + + *nid = (old_hotness >> PGHOT_NID_SHIFT) & PGHOT_NID_MASK; + *freq = (old_hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK; + *time = (old_hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK; + return 0; +} diff --git a/mm/pghot.c b/mm/pghot.c index 95b5012d5b99..bf1d9029cbaa 100644 --- a/mm/pghot.c +++ b/mm/pghot.c @@ -10,6 +10,9 @@ * the frequency of access and last access time. Promotions are done * to a default toptier NID. * + * In the precision mode, 4 bytes are used to store the frequency + * of access, last access time and the accessing NID. + * * A kernel thread named kmigrated is provided to migrate or promote * the hot pages. kmigrated runs for each lower tier node. It iterates * over the node's PFNs and migrates pages marked for migration into @@ -52,13 +55,15 @@ static bool kmigrated_started __ro_after_init; * for the purpose of tracking page hotness and subsequent promotion. * * @pfn: PFN of the page - * @nid: Unused + * @nid: Target NID to where the page needs to be migrated in precision + * mode but unused in default mode * @src: The identifier of the sub-system that reports the access * @now: Access time in jiffies * - * Updates the frequency and time of access and marks the page as - * ready for migration if the frequency crosses a threshold. The pages - * marked for migration are migrated by kmigrated kernel thread. + * Updates the NID (in precision mode only), frequency and time of access + * and marks the page as ready for migration if the frequency crosses a + * threshold. The pages marked for migration are migrated by kmigrated + * kernel thread. * * Return: 0 on success and -EINVAL on failure to record the access. */ -- 2.34.1