The idle and iowait statistics in /proc/stat are obtained through get_idle_time and get_iowait_time. Assuming CONFIG_NO_HZ_COMMON is enabled, when CPU is online, the idle and iowait values use the idle_sleeptime and iowait_sleeptime statistics from tick_cpu_sched, but use CPUTIME_IDLE and CPUTIME_IOWAIT items from kernel_cpustat when CPU is offline. Although /proc/stat do not print statistics of offline CPU, it still print aggregated statistics for all possible CPUs. tick_cpu_sched and kernel_cpustat are maintained by different logic, leading to a significant gap. The first line of the data below shows the /proc/stat output when only one CPU remains after CPU offline, the second line shows the /proc/stat output after all CPUs are brought back online: cpu 2408558 2 916619 4275883 5403 123758 64685 0 0 0 cpu 2408588 2 916693 4200737 4184 123762 64686 0 0 0 Obviously, other values do not experience significant fluctuations, while idle/iowait statistics show a substantial decrease, which make system CPU monitoring troublesome. Introduce get_cpu_idle_time_us_raw and get_cpu_iowait_time_us_raw, so that /proc/stat logic can use them to get the last raw value of idle_sleeptime and iowait_sleeptime from tick_cpu_sched without any calculation when CPU is offline. It avoids /proc/stat idle/iowait fluctuation when cpu hotplug. Signed-off-by: Xin Zhao --- fs/proc/stat.c | 4 ++++ include/linux/tick.h | 4 ++++ kernel/time/tick-sched.c | 46 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 8b444e862..de13a2e1c 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -28,6 +28,8 @@ u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); + else + idle_usecs = get_cpu_idle_time_us_raw(cpu); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ @@ -44,6 +46,8 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); + else + iowait_usecs = get_cpu_iowait_time_us_raw(cpu); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ diff --git a/include/linux/tick.h b/include/linux/tick.h index ac76ae9fa..bdd38d270 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -139,7 +139,9 @@ extern ktime_t tick_nohz_get_next_hrtimer(void); extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); +extern u64 get_cpu_idle_time_us_raw(int cpu); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); +extern u64 get_cpu_iowait_time_us_raw(int cpu); #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } @@ -161,7 +163,9 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) return *delta_next; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } +static inline u64 get_cpu_idle_time_us_raw(int cpu) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } +static inline u64 get_cpu_iowait_time_us_raw(int cpu) { return -1; } #endif /* !CONFIG_NO_HZ_COMMON */ /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4d089b290..607fc22d4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -829,6 +829,29 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); +/** + * get_cpu_idle_time_us_raw - get the raw total idle time of a CPU + * @cpu: CPU number to query + * + * Return the raw idle time (since boot) for a given CPU, in + * microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_idle_time_us_raw(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_active) + return -1; + + return ktime_to_us(ts->idle_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us_raw); + /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query @@ -855,6 +878,29 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +/** + * get_cpu_iowait_time_us_raw - get the raw total iowait time of a CPU + * @cpu: CPU number to query + * + * Return the raw iowait time (since boot) for a given CPU, in + * microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_iowait_time_us_raw(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_active) + return -1; + + return ktime_to_us(ts->iowait_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us_raw); + static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); -- 2.34.1