From bcb2cf41f964e9ed4d27abe4dee8daa249f659bb Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 7 Apr 2026 12:48:07 -0400 Subject: [PATCH] Allow retrieving x86 TSC frequency/flags from CPUID This adds additional x86 specific CPUID checks for flags needed for determining whether the Time-Stamp Counter (TSC) is usable on a given system, as well as a helper function to retrieve the TSC frequency from CPUID. This is intended for a future patch that will utilize the TSC to lower the overhead of timing instrumentation. In passing, always make pg_cpuid_subleaf reset the variables used for its result, to avoid accidentally using stale results if __get_cpuid_count errors out and the caller doesn't check for it. Author: Lukas Fittl Author: David Geier Author: Andres Freund Reviewed-by: Andres Freund Reviewed-by: David Geier Reviewed-by: John Naylor Reviewed-by: Jakub Wartak (in an earlier version) Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- src/include/port/pg_cpu.h | 12 +++- src/port/pg_cpu_x86.c | 138 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 145 insertions(+), 5 deletions(-) diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index c5d96bb4f47..a5d42f1b68d 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -32,8 +32,16 @@ typedef enum X86FeatureId PG_AVX512_VL, PG_AVX512_VPCLMULQDQ, PG_AVX512_VPOPCNTDQ, + + /* identification */ + PG_HYPERVISOR, + + /* Time-Stamp Counter (TSC) flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, } X86FeatureId; -#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1) +#define X86FeaturesSize (PG_TSC_ADJUST + 1) extern PGDLLIMPORT bool X86Features[]; @@ -48,6 +56,8 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ #endif /* PG_CPU_H */ diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c 
index 40ff78633ca..3844da511fd 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -80,13 +80,13 @@ pg_cpuid(int leaf, unsigned int *reg) static inline bool pg_cpuid_subleaf(int leaf, int subleaf, unsigned int *reg) { + memset(reg, 0, 4 * sizeof(unsigned int)); #if defined(HAVE__GET_CPUID_COUNT) return __get_cpuid_count(leaf, subleaf, &reg[EAX], &reg[EBX], &reg[ECX], &reg[EDX]) == 1; #elif defined(HAVE__CPUIDEX) __cpuidex((int *) reg, leaf, subleaf); return true; #else - memset(reg, 0, 4 * sizeof(unsigned int)); return false; #endif } @@ -101,19 +101,24 @@ void set_x86_features(void) { unsigned int reg[4] = {0}; + bool have_osxsave; pg_cpuid(0x01, reg); X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1; X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1; + X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1; + have_osxsave = reg[ECX] >> 27 & 1; + + pg_cpuid_subleaf(0x07, 0, reg); + + X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1; /* leaf 7 features that depend on OSXSAVE */ - if (reg[ECX] & (1 << 27)) + if (have_osxsave) { uint32 xcr0_val = 0; - pg_cpuid_subleaf(0x07, 0, reg); - #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ xcr0_val = _xgetbv(0); @@ -135,7 +140,132 @@ set_x86_features(void) } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, reg); + X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1; + + pg_cpuid(0x80000007, reg); + X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU through CPUID, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates the frequency information was not accessible via CPUID. 
+ */ +uint32 +x86_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + + if (x86_feature_available(PG_HYPERVISOR)) + { + uint32 freq = x86_hypervisor_tsc_frequency_khz(); + + /* + * If the hypervisor specific logic didn't figure out the frequency, + * it's possible (although not likely, as often that's hidden from + * guests) that the non-virtualized logic can figure out the + * frequency. + */ + if (freq > 0) + return freq; + } + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. + * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer's Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. + */ + pg_cpuid(0x15, reg); + if (reg[ECX] > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero. + */ + if (reg[EAX] == 0 || reg[EBX] == 0) + return 0; + + return reg[ECX] / 1000 * reg[EBX] / reg[EAX]; + } + + /* + * When CPUID.15H is not available/incomplete, we can instead try to get + * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor + * Frequency Information Leaf". 
+ */ + pg_cpuid(0x16, reg); + if (reg[EAX] > 0) + return reg[EAX] * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access a model-specific register (MSR) to get the frequency. MSRs are + * separate from CPUID and typically not available for unprivileged processes, + * so we can't get the frequency this way. + */ +#define CPUID_HYPERVISOR_VMWARE(r) (r[EBX] == 0x61774d56 && r[ECX] == 0x4d566572 && r[EDX] == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r[EBX] == 0x4b4d564b && r[ECX] == 0x564b4d56 && r[EDX] == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + +#if defined(HAVE__CPUIDEX) + + /* + * The hypervisor is determined using the 0x40000000 Hypervisor + * information leaf, which requires use of __cpuidex to set ECX to 0 to + * access it. + * + * The similar __get_cpuid_count function does not work as expected since + * it contains a check for __get_cpuid_max, which has been observed to be + * lower than the special Hypervisor leaf, despite it being available. + */ + __cpuidex((int *) reg, 0x40000000, 0); + + if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg))) + { + __cpuidex((int *) reg, 0x40000010, 0); + if (reg[EAX] > 0) + return reg[EAX]; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */