Allow retrieving x86 TSC frequency/flags from CPUID

This adds additional x86-specific CPUID checks for flags needed for
determining whether the Time-Stamp Counter (TSC) is usable on a given system,
as well as a helper function to retrieve the TSC frequency from CPUID.

This is intended for a future patch that will utilize the TSC to lower the
overhead of timing instrumentation.

In passing, always make pg_cpuid_subleaf reset the variables used for its
result, to avoid accidentally using stale results if __get_cpuid_count errors
out and the caller doesn't check for it.

Author: Lukas Fittl <lukas@fittl.com>
Author: David Geier <geidav.pg@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: John Naylor <john.naylor@postgresql.org>
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> (in an earlier version)
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
This commit is contained in:
Andres Freund 2026-04-07 12:48:07 -04:00
parent 0022622c93
commit bcb2cf41f9
2 changed files with 145 additions and 5 deletions

View file

@ -32,8 +32,16 @@ typedef enum X86FeatureId
PG_AVX512_VL,
PG_AVX512_VPCLMULQDQ,
PG_AVX512_VPOPCNTDQ,
/* identification */
PG_HYPERVISOR,
/* Time-Stamp Counter (TSC) flags */
PG_RDTSCP,
PG_TSC_INVARIANT,
PG_TSC_ADJUST,
} X86FeatureId;
#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
#define X86FeaturesSize (PG_TSC_ADJUST + 1)
extern PGDLLIMPORT bool X86Features[];
@ -48,6 +56,8 @@ x86_feature_available(X86FeatureId feature)
return X86Features[feature];
}
/*
 * Returns the TSC frequency in kHz as determined through CPUID, or 0 if the
 * frequency information was not accessible that way.
 */
extern uint32 x86_tsc_frequency_khz(void);
#endif /* defined(USE_SSE2) || defined(__i386__) */
#endif /* PG_CPU_H */

View file

@ -80,13 +80,13 @@ pg_cpuid(int leaf, unsigned int *reg)
/*
 * Execute CPUID for the given leaf and subleaf, storing the result in reg[],
 * which must have room for four entries (indexed by EAX, EBX, ECX, EDX).
 *
 * reg[] is always zeroed before the query is attempted, so a caller that
 * does not check the return value sees zeroes rather than stale data when
 * the query fails.
 *
 * Returns true if the query was performed, false if it failed or if no
 * subleaf-capable CPUID primitive is available in this build.
 */
static inline bool
pg_cpuid_subleaf(int leaf, int subleaf, unsigned int *reg)
{
	memset(reg, 0, 4 * sizeof(unsigned int));

#if defined(HAVE__GET_CPUID_COUNT)
	return __get_cpuid_count(leaf, subleaf, &reg[EAX], &reg[EBX], &reg[ECX], &reg[EDX]) == 1;
#elif defined(HAVE__CPUIDEX)
	__cpuidex((int *) reg, leaf, subleaf);
	return true;
#else
	/* reg[] was already cleared above, nothing else to do */
	return false;
#endif
}
@ -101,19 +101,24 @@ void
set_x86_features(void)
{
unsigned int reg[4] = {0};
bool have_osxsave;
pg_cpuid(0x01, reg);
X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1;
X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1;
X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1;
have_osxsave = reg[ECX] >> 27 & 1;
pg_cpuid_subleaf(0x07, 0, reg);
X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1;
/* leaf 7 features that depend on OSXSAVE */
if (reg[ECX] & (1 << 27))
if (have_osxsave)
{
uint32 xcr0_val = 0;
pg_cpuid_subleaf(0x07, 0, reg);
#ifdef HAVE_XSAVE_INTRINSICS
/* get value of Extended Control Register */
xcr0_val = _xgetbv(0);
@ -135,7 +140,132 @@ set_x86_features(void)
}
}
/* Check for other TSC related flags */
pg_cpuid(0x80000001, reg);
X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1;
pg_cpuid(0x80000007, reg);
X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1;
X86Features[INIT_PG_X86] = true;
}
/* TSC (Time-stamp Counter) handling code */

/*
 * Forward declaration: x86_tsc_frequency_khz() calls this helper before its
 * definition appears in the file.
 */
static uint32 x86_hypervisor_tsc_frequency_khz(void);
/*
* Determine the TSC frequency of the CPU through CPUID, where supported.
*
* Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of
* 0 indicates the frequency information was not accessible via CPUID.
*/
uint32
x86_tsc_frequency_khz(void)
{
unsigned int reg[4] = {0};
if (x86_feature_available(PG_HYPERVISOR))
{
uint32 freq = x86_hypervisor_tsc_frequency_khz();
/*
* If the hypervisor specific logic didn't figure out the frequency,
* it's possible (although not likely, as often that's hidden from
* guests) that the non-virtualized logic can figure out the
* frequency.
*/
if (freq > 0)
return freq;
}
/*
* On modern Intel CPUs, the TSC is implemented by invariant timekeeping
* hardware, also called "Always Running Timer", or ART. The ART stays
* consistent even if the CPU changes frequency due to changing power
* levels.
*
* As documented in "Determining the Processor Base Frequency" in the
* "Intel® 64 and IA-32 Architectures Software Developer's Manual",
* February 2026 Edition, we can get the TSC frequency as follows:
*
* Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) /
* CPUID.15H:EAX[31:0]
*
* With CPUID.15H:ECX representing the nominal core crystal clock
* frequency, and EAX/EBX representing values used to translate the TSC
* value to that frequency, see "Chapter 20.17 "Time-Stamp Counter" of
* that manual.
*
* Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as
* such we fall back to alternate approaches.
*/
pg_cpuid(0x15, reg);
if (reg[ECX] > 0)
{
/*
* EBX not being set indicates invariant TSC is not available. Require
* EAX being non-zero too, to avoid a theoretical divide by zero.
*/
if (reg[EAX] == 0 || reg[EBX] == 0)
return 0;
return reg[ECX] / 1000 * reg[EBX] / reg[EAX];
}
/*
* When CPUID.15H is not available/incomplete, we can instead try to get
* the processor base frequency in MHz from CPUID.16H:EAX, the "Processor
* Frequency Information Leaf".
*/
pg_cpuid(0x16, reg);
if (reg[EAX] > 0)
return reg[EAX] * 1000;
return 0;
}
/*
 * Read the TSC frequency from a hypervisor that passes it to its guest VMs.
 *
 * VMware and KVM are the two hypervisors known to report the TSC frequency,
 * in KHz, in the EAX register of the vendor-specific CPUID leaf 0x40000010.
 *
 * Some other hypervisors with an invariant TSC (e.g. HyperV) would require
 * reading a model-specific register (MSR) instead.  MSRs are separate from
 * CPUID and typically not available to unprivileged processes, so that route
 * cannot be used here.
 *
 * Returns the frequency in KHz, or 0 if it could not be determined.
 */

/* Hypervisor vendor signature "VMwareVMware", spread over EBX/ECX/EDX */
#define CPUID_HYPERVISOR_VMWARE(r) \
	((r)[EBX] == 0x61774d56 && (r)[ECX] == 0x4d566572 && (r)[EDX] == 0x65726177)

/* Hypervisor vendor signature "KVMKVMKVM", spread over EBX/ECX/EDX */
#define CPUID_HYPERVISOR_KVM(r) \
	((r)[EBX] == 0x4b4d564b && (r)[ECX] == 0x564b4d56 && (r)[EDX] == 0x0000004d)

static uint32
x86_hypervisor_tsc_frequency_khz(void)
{
#if defined(HAVE__CPUIDEX)
	unsigned int regs[4] = {0};

	/*
	 * The hypervisor is identified through the 0x40000000 Hypervisor
	 * information leaf, which must be queried with ECX set to 0, hence the
	 * use of __cpuidex.
	 *
	 * The similar __get_cpuid_count function does not work as expected since
	 * it contains a check for __get_cpuid_max, which has been observed to be
	 * lower than the special Hypervisor leaf, despite it being available.
	 */
	__cpuidex((int *) regs, 0x40000000, 0);

	/* EAX must show that the leaf range reaches the frequency leaf */
	if (regs[EAX] < 0x40000010)
		return 0;

	/* Only trust the frequency leaf on hypervisors known to provide it */
	if (!CPUID_HYPERVISOR_VMWARE(regs) && !CPUID_HYPERVISOR_KVM(regs))
		return 0;

	/* EAX now holds the frequency; zero means it was not reported */
	__cpuidex((int *) regs, 0x40000010, 0);
	return regs[EAX];
#else
	return 0;
#endif							/* HAVE__CPUIDEX */
}
#endif /* defined(USE_SSE2) || defined(__i386__) */