PGSS CPU time tracking.

Alex Weisberger 2026-02-13 14:42:25 -05:00
parent aa082bed0b
commit 13f897fbc8
16 changed files with 891 additions and 226 deletions

View file

@ -19,26 +19,9 @@ PGFILEDESC = "pg_stat_statements - execution statistics of SQL statements"
LDFLAGS_SL += $(filter -lm, $(LIBS))
REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_stat_statements/pg_stat_statements.conf
# Note: Test "cleanup" is kept second to last, removing the extension.
REGRESS = \
select \
dml \
cursors \
utility \
level_tracking \
planning \
user_activity \
wal \
entry_timestamp \
privileges \
extended \
parallel \
plancache \
squashing \
cleanup \
oldextversions
REGRESS = select dml cursors utility level_tracking planning \
user_activity wal entry_timestamp privileges extended \
parallel cpu_time cleanup oldextversions
# Disabled because these tests require "shared_preload_libraries=pg_stat_statements",
# which typical installcheck users do not have (e.g. buildfarm clients).
NO_INSTALLCHECK = 1

View file

@ -0,0 +1,163 @@
/*
* bench_cpu_time.c
*
* Benchmark to demonstrate the difference between wall clock time
* and CPU time (via getrusage) under varying system load.
*
* Under no load: wall ~= cpu
* Under CPU saturation (e.g. stress-ng): wall >> cpu (scheduling delay)
* Under I/O saturation: wall >> cpu (I/O wait)
*
* Usage:
* cc -O2 -o bench_cpu_time bench_cpu_time.c -lm
* ./bench_cpu_time [iterations]
*
* Then compare with:
* stress-ng --cpu $(nproc) --timeout 30s &
* ./bench_cpu_time
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <fcntl.h>
#define DEFAULT_ITERATIONS 10
#define WORK_SIZE 5000000 /* number of math ops per iteration */
static double
wall_time_ms(struct timespec *start, struct timespec *end)
{
return (end->tv_sec - start->tv_sec) * 1000.0
+ (end->tv_nsec - start->tv_nsec) / 1000000.0;
}
static double
rusage_cpu_ms(struct rusage *start, struct rusage *end)
{
double user_ms = (end->ru_utime.tv_sec - start->ru_utime.tv_sec) * 1000.0
+ (end->ru_utime.tv_usec - start->ru_utime.tv_usec) / 1000.0;
double sys_ms = (end->ru_stime.tv_sec - start->ru_stime.tv_sec) * 1000.0
+ (end->ru_stime.tv_usec - start->ru_stime.tv_usec) / 1000.0;
return user_ms + sys_ms;
}
/*
* Pure CPU-bound work: compute a bunch of sqrt/sin to keep the CPU busy.
* volatile to prevent the compiler from optimizing it away.
*/
static volatile double sink;
static void
cpu_bound_work(void)
{
double acc = 0.0;
for (int i = 0; i < WORK_SIZE; i++)
acc += sqrt((double)i) * sin((double)i);
sink = acc;
}
/*
* Mixed I/O + CPU work: write to a temp file between CPU bursts.
*/
static void
io_bound_work(void)
{
char buf[4096];
char path[] = "/tmp/bench_cpu_time_XXXXXX";
int fd;
double acc = 0.0;
memset(buf, 'x', sizeof(buf));
fd = mkstemp(path);
if (fd < 0)
{
perror("mkstemp");
return;
}
unlink(path);
for (int i = 0; i < WORK_SIZE / 10; i++)
{
acc += sqrt((double)i);
if (i % 1000 == 0)
{
write(fd, buf, sizeof(buf));
fsync(fd);
}
}
close(fd);
sink = acc;
}
static void
run_benchmark(const char *label, void (*workfn)(void), int iterations)
{
double total_wall = 0.0;
double total_cpu = 0.0;
printf("\n=== %s (%d iterations) ===\n", label, iterations);
printf("%4s %10s %10s %10s %7s\n",
"#", "wall(ms)", "cpu(ms)", "off-cpu(ms)", "cpu%");
for (int i = 0; i < iterations; i++)
{
struct timespec ts_start, ts_end;
struct rusage ru_start, ru_end;
getrusage(RUSAGE_SELF, &ru_start);
clock_gettime(CLOCK_MONOTONIC, &ts_start);
workfn();
clock_gettime(CLOCK_MONOTONIC, &ts_end);
getrusage(RUSAGE_SELF, &ru_end);
double wall = wall_time_ms(&ts_start, &ts_end);
double cpu = rusage_cpu_ms(&ru_start, &ru_end);
double off = wall - cpu;
double pct = (wall > 0) ? (cpu / wall) * 100.0 : 0.0;
printf("%4d %10.2f %10.2f %10.2f %6.1f%%\n",
i + 1, wall, cpu, off, pct);
total_wall += wall;
total_cpu += cpu;
}
double avg_wall = total_wall / iterations;
double avg_cpu = total_cpu / iterations;
double avg_off = avg_wall - avg_cpu;
double avg_pct = (avg_wall > 0) ? (avg_cpu / avg_wall) * 100.0 : 0.0;
printf("---- ---------- ---------- ---------- -------\n");
printf(" avg %10.2f %10.2f %10.2f %6.1f%%\n",
avg_wall, avg_cpu, avg_off, avg_pct);
}
int
main(int argc, char *argv[])
{
int iterations = DEFAULT_ITERATIONS;
if (argc > 1)
iterations = atoi(argv[1]);
if (iterations < 1)
iterations = 1;
printf("PID: %d\n", getpid());
printf("Work size: %d ops per iteration\n", WORK_SIZE);
run_benchmark("CPU-bound work", cpu_bound_work, iterations);
run_benchmark("I/O-bound work", io_bound_work, iterations);
return 0;
}
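
A quick way to sanity-check the premise stated in the file header above (wall and CPU time agree while the process stays on-CPU, and diverge as soon as it is scheduled out or waiting) is to sleep inside the measured window. The sketch below is editorial, not part of the commit; it assumes only the POSIX clock_gettime/getrusage interfaces that bench_cpu_time.c already uses, and the file name wall_vs_cpu_sleep.c is invented for illustration.

/* wall_vs_cpu_sleep.c (illustrative): cc -O2 -o wall_vs_cpu_sleep wall_vs_cpu_sleep.c */
#include <stdio.h>
#include <sys/resource.h>
#include <time.h>

int
main(void)
{
    struct timespec w0, w1;
    struct timespec nap = {0, 200 * 1000 * 1000};   /* 200 ms */
    struct rusage r0, r1;
    double wall_ms, cpu_ms;

    getrusage(RUSAGE_SELF, &r0);
    clock_gettime(CLOCK_MONOTONIC, &w0);

    nanosleep(&nap, NULL);      /* off-CPU: consumes wall time, almost no CPU time */

    clock_gettime(CLOCK_MONOTONIC, &w1);
    getrusage(RUSAGE_SELF, &r1);

    wall_ms = (w1.tv_sec - w0.tv_sec) * 1000.0 + (w1.tv_nsec - w0.tv_nsec) / 1e6;
    cpu_ms = (r1.ru_utime.tv_sec - r0.ru_utime.tv_sec) * 1000.0
        + (r1.ru_utime.tv_usec - r0.ru_utime.tv_usec) / 1e3
        + (r1.ru_stime.tv_sec - r0.ru_stime.tv_sec) * 1000.0
        + (r1.ru_stime.tv_usec - r0.ru_stime.tv_usec) / 1e3;

    /* Expect wall around 200 ms and cpu near 0: the gap is off-CPU time. */
    printf("wall=%.2f ms cpu=%.2f ms off-cpu=%.2f ms\n",
           wall_ms, cpu_ms, wall_ms - cpu_ms);
    return 0;
}

Running it next to stress-ng, as the header comment suggests, shifts the same gap from voluntary sleep to scheduling delay.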

View file

@ -0,0 +1,142 @@
#!/bin/bash
#
# run_benchmark.sh - Benchmark pg_stat_statements CPU time tracking
#
# Runs pgbench at various concurrency levels, with and without CPU saturation,
# then queries pg_stat_statements to compare wall time vs CPU time.
#
set -euo pipefail
export PATH="$HOME/pginstall/bin:$PATH"
DB="bench"
NCPU=$(nproc)
DURATION=30 # seconds per pgbench run
PGBENCH_SCALE=10 # matches -s 10 from setup
RESULTS_FILE="$HOME/benchmark_results.txt"
# Concurrency levels to test
CLIENTS=(1 4 8 $((NCPU * 2)) $((NCPU * 4)))
# Stress-ng intensity: 0 = no load, N = N CPU-burning workers
STRESS_LEVELS=(0 $NCPU $((NCPU * 2)))
header() {
echo ""
echo "================================================================"
echo " $1"
echo "================================================================"
}
run_one() {
local clients=$1
local stress=$2
local label="clients=$clients, stress=$stress"
local stress_pid=""
echo ""
echo "--- $label ---"
# Reset stats
psql "$DB" -qc "SELECT pg_stat_statements_reset();" > /dev/null
# Start CPU stress if requested
if [ "$stress" -gt 0 ]; then
stress-ng --cpu "$stress" --timeout $((DURATION + 10))s &>/dev/null &
stress_pid=$!
sleep 1
fi
# Run pgbench
pgbench -c "$clients" -j "$clients" -T "$DURATION" "$DB" 2>&1 | grep -E "^(tps|latency|number)"
# Stop stress
if [ -n "$stress_pid" ]; then
kill "$stress_pid" 2>/dev/null || true
wait "$stress_pid" 2>/dev/null || true
fi
# Query pg_stat_statements for the pgbench queries
psql "$DB" -P pager=off -c "
SELECT
left(query, 60) AS query,
calls,
round(total_exec_time::numeric, 2) AS wall_ms,
round(total_exec_cpu_time::numeric, 2) AS cpu_ms,
round((total_exec_time - total_exec_cpu_time)::numeric, 2) AS off_cpu_ms,
CASE WHEN total_exec_time > 0
THEN round((total_exec_cpu_time / total_exec_time * 100)::numeric, 1)
ELSE 0
END AS cpu_pct
FROM pg_stat_statements
WHERE query NOT LIKE '%pg_stat_statements%'
AND query NOT LIKE '%BEGIN%'
AND query NOT LIKE '%END%'
AND calls > 10
ORDER BY total_exec_time DESC
LIMIT 8;
"
# Summary line for the aggregate
psql "$DB" -t -A -c "
SELECT
'$clients' AS clients,
'$stress' AS stress_workers,
round(sum(total_exec_time)::numeric, 2) AS total_wall_ms,
round(sum(total_exec_cpu_time)::numeric, 2) AS total_cpu_ms,
round(sum(total_exec_time - total_exec_cpu_time)::numeric, 2) AS total_off_cpu_ms,
CASE WHEN sum(total_exec_time) > 0
THEN round((sum(total_exec_cpu_time) / sum(total_exec_time) * 100)::numeric, 1)
ELSE 0
END AS cpu_pct
FROM pg_stat_statements
WHERE query NOT LIKE '%pg_stat_statements%'
AND calls > 0;
" >> "$RESULTS_FILE"
}
# ============================================================
# Main
# ============================================================
echo "Benchmark: pg_stat_statements CPU time tracking"
echo "Instance: $(uname -n) ($(nproc) vCPUs)"
echo "Duration per run: ${DURATION}s"
echo ""
# Initialize results file with a header row (pipe-separated to match psql -A output)
echo "clients|stress_workers|total_wall_ms|total_cpu_ms|total_off_cpu_ms|cpu_pct" > "$RESULTS_FILE"
header "Verifying CPU time columns exist"
psql "$DB" -c "
SELECT total_plan_cpu_time, total_exec_cpu_time
FROM pg_stat_statements
LIMIT 1;
"
for stress in "${STRESS_LEVELS[@]}"; do
if [ "$stress" -eq 0 ]; then
header "NO CPU STRESS"
else
header "CPU STRESS: $stress workers on $NCPU vCPUs"
fi
for clients in "${CLIENTS[@]}"; do
run_one "$clients" "$stress"
done
done
header "SUMMARY"
echo ""
echo "Instance: $(uname -n) ($(nproc) vCPUs)"
echo ""
echo "clients | stress | wall_ms | cpu_ms | off_cpu_ms | cpu%"
echo "--------|--------|---------------|---------------|---------------|------"
while IFS='|' read -r c s w cpu off pct; do
printf "%-8s| %-7s| %13s | %13s | %13s | %s%%\n" "$c" "$s" "$w" "$cpu" "$off" "$pct"
done < <(tail -n +2 "$RESULTS_FILE")  # skip the header row
echo ""
echo "Full results saved to: $RESULTS_FILE"
echo ""
echo "Key insight: as stress workers increase, cpu% should decrease"
echo "(more off-CPU time = scheduling delay from CPU contention)"

View file

@ -0,0 +1,70 @@
#!/bin/bash
#
# setup.sh - Build PostgreSQL from source and initialize a pgbench database
# with pg_stat_statements enabled.
#
# Run from the postgres source root: bash contrib/pg_stat_statements/benchmark/setup.sh
#
set -euo pipefail
PGDATA="$HOME/pgdata"
PGINSTALL="$HOME/pginstall"
NCPU=$(nproc)
echo "=== Building PostgreSQL (using $NCPU cores) ==="
# Wait for user_data to finish installing deps
while [ ! -f /tmp/setup_done ]; do
echo "Waiting for build dependencies to be installed..."
sleep 5
done
./configure \
--prefix="$PGINSTALL" \
--without-icu \
--with-openssl \
2>&1 | tail -3
make -j"$NCPU" 2>&1 | tail -5
make install 2>&1 | tail -3
echo "=== Building pg_stat_statements ==="
make -C contrib/pg_stat_statements install 2>&1 | tail -3
# Also build pgbench
make -C contrib/pgbench install 2>&1 | tail -3 || \
make -C src/bin/pgbench install 2>&1 | tail -3
export PATH="$PGINSTALL/bin:$PATH"
echo "=== Initializing database ==="
# Stop any existing server before wiping data
pg_ctl -D "$PGDATA" stop 2>/dev/null || true
rm -rf "$PGDATA"
initdb -D "$PGDATA" 2>&1 | tail -3
# Configure pg_stat_statements
cat >> "$PGDATA/postgresql.conf" <<EOF
# pg_stat_statements config
shared_preload_libraries = 'pg_stat_statements'
pg_stat_statements.max = 10000
pg_stat_statements.track = top
pg_stat_statements.track_utility = on
pg_stat_statements.track_planning = on
compute_query_id = on
EOF
echo "=== Starting PostgreSQL ==="
pg_ctl -D "$PGDATA" -l "$PGDATA/logfile" start
sleep 2
# Create the extension and initialize pgbench
createdb bench
psql bench -c "CREATE EXTENSION pg_stat_statements;"
pgbench -i -s 10 bench 2>&1 | tail -3
echo ""
echo "=== Setup complete ==="
echo "PostgreSQL is running with pg_stat_statements enabled."
echo "Run: bash contrib/pg_stat_statements/benchmark/run_benchmark.sh"

View file

@ -0,0 +1,5 @@
*.pem
*.tfstate
*.tfstate.backup
.terraform/
.terraform.lock.hcl

View file

@ -0,0 +1,128 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
http = {
source = "hashicorp/http"
version = "~> 3.0"
}
tls = {
source = "hashicorp/tls"
version = "~> 4.0"
}
local = {
source = "hashicorp/local"
version = "~> 2.0"
}
}
}
provider "aws" {
region = var.region
}
# Auto-detect the caller's public IP if allowed_ssh_cidr is "auto"
data "http" "my_ip" {
count = var.allowed_ssh_cidr == "auto" ? 1 : 0
url = "https://checkip.amazonaws.com"
}
locals {
ssh_cidr = var.allowed_ssh_cidr == "auto" ? "${trimspace(data.http.my_ip[0].response_body)}/32" : var.allowed_ssh_cidr
}
# Generate an SSH key pair
resource "tls_private_key" "bench" {
algorithm = "ED25519"
}
resource "aws_key_pair" "bench" {
key_name_prefix = "pgss-bench-"
public_key = tls_private_key.bench.public_key_openssh
}
# Write the private key to a local file
resource "local_file" "ssh_key" {
content = tls_private_key.bench.private_key_openssh
filename = "${path.module}/pgss-bench-key.pem"
file_permission = "0600"
}
# Use the default VPC
data "aws_vpc" "default" {
default = true
}
# Find the latest Amazon Linux 2023 AMI
data "aws_ami" "al2023" {
most_recent = true
owners = ["amazon"]
filter {
name = "name"
values = ["al2023-ami-*-x86_64"]
}
filter {
name = "virtualization-type"
values = ["hvm"]
}
}
resource "aws_security_group" "pgss_bench" {
name_prefix = "pgss-bench-"
description = "SSH access for pg_stat_statements benchmark"
vpc_id = data.aws_vpc.default.id
ingress {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = [local.ssh_cidr]
description = "SSH"
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
description = "All outbound"
}
tags = {
Name = "pgss-cpu-time-benchmark"
}
}
resource "aws_instance" "bench" {
ami = data.aws_ami.al2023.id
instance_type = var.instance_type
key_name = aws_key_pair.bench.key_name
vpc_security_group_ids = [aws_security_group.pgss_bench.id]
root_block_device {
volume_size = 30
volume_type = "gp3"
}
user_data = <<-USERDATA
#!/bin/bash
set -euo pipefail
# Install build dependencies for PostgreSQL
dnf groupinstall -y "Development Tools"
dnf install -y \
readline-devel zlib-devel libxml2-devel openssl-devel \
flex bison perl git rsync \
stress-ng
echo "=== Build dependencies installed ===" > /tmp/setup_done
USERDATA
tags = {
Name = "pgss-cpu-time-benchmark"
}
}

View file

@ -0,0 +1,26 @@
output "instance_id" {
value = aws_instance.bench.id
}
output "public_ip" {
value = aws_instance.bench.public_ip
}
output "ssh_key_file" {
description = "Path to the generated private key"
value = local_file.ssh_key.filename
}
output "ssh_command" {
value = "ssh -i ${abspath(local_file.ssh_key.filename)} ec2-user@${aws_instance.bench.public_ip}"
}
output "rsync_command" {
description = "Run this from the postgres repo root to push the source tree"
value = "rsync -avz -e 'ssh -i ${abspath(local_file.ssh_key.filename)}' --exclude='.git' --exclude='*.o' --exclude='*.dylib' . ec2-user@${aws_instance.bench.public_ip}:~/postgres/"
}
output "benchmark_command" {
description = "Run this after SSH'ing in to start the benchmark"
value = "cd ~/postgres && bash contrib/pg_stat_statements/benchmark/setup.sh && bash contrib/pg_stat_statements/benchmark/run_benchmark.sh"
}

View file

@ -0,0 +1,17 @@
variable "instance_type" {
description = "EC2 instance type"
type = string
default = "c5.2xlarge"
}
variable "region" {
description = "AWS region"
type = string
default = "us-east-1"
}
variable "allowed_ssh_cidr" {
description = "CIDR block allowed to SSH (e.g. your IP). Use 'auto' to detect."
type = string
default = "auto"
}

View file

@ -0,0 +1,62 @@
--
-- Tests for CPU time tracking
--
SET pg_stat_statements.track_utility = FALSE;
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
---
t
(1 row)
-- Run a CPU-intensive query
SELECT count(*) FROM generate_series(1, 100000);
count
--------
100000
(1 row)
-- Verify CPU time columns are populated and sensible
SELECT query,
total_exec_cpu_time > 0 AS has_cpu_time,
total_exec_cpu_time <= total_exec_time AS cpu_le_wall
FROM pg_stat_statements
WHERE query LIKE '%generate_series%'
ORDER BY query COLLATE "C";
query | has_cpu_time | cpu_le_wall
--------------------------------------------------+--------------+-------------
SELECT count(*) FROM generate_series($1, $2) | t | t
(1 row)
-- Test planning CPU time
SET pg_stat_statements.track_planning = TRUE;
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
---
t
(1 row)
SELECT count(*) FROM generate_series(1, 100000);
count
--------
100000
(1 row)
SELECT query,
total_exec_cpu_time >= 0 AS has_exec_cpu,
total_plan_cpu_time >= 0 AS has_plan_cpu
FROM pg_stat_statements
WHERE query LIKE '%generate_series%'
ORDER BY query COLLATE "C";
query | has_exec_cpu | has_plan_cpu
--------------------------------------------------+--------------+--------------
SELECT count(*) FROM generate_series($1, $2) | t | t
(1 row)
-- Cleanup
SET pg_stat_statements.track_planning = FALSE;
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
t
---
t
(1 row)

View file

@ -385,7 +385,6 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.12';
wal_records | bigint | | |
wal_fpi | bigint | | |
wal_bytes | numeric | | |
wal_buffers_full | bigint | | |
jit_functions | bigint | | |
jit_generation_time | double precision | | |
jit_inlining_count | bigint | | |
@ -402,7 +401,7 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.12';
minmax_stats_since | timestamp with time zone | | |
SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
has_data
has_data
----------
t
(1 row)
@ -411,65 +410,64 @@ SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
AlTER EXTENSION pg_stat_statements UPDATE TO '1.13';
\d pg_stat_statements
View "public.pg_stat_statements"
Column | Type | Collation | Nullable | Default
Column | Type | Collation | Nullable | Default
----------------------------+--------------------------+-----------+----------+---------
userid | oid | | |
dbid | oid | | |
toplevel | boolean | | |
queryid | bigint | | |
query | text | | |
plans | bigint | | |
total_plan_time | double precision | | |
min_plan_time | double precision | | |
max_plan_time | double precision | | |
mean_plan_time | double precision | | |
stddev_plan_time | double precision | | |
calls | bigint | | |
total_exec_time | double precision | | |
min_exec_time | double precision | | |
max_exec_time | double precision | | |
mean_exec_time | double precision | | |
stddev_exec_time | double precision | | |
rows | bigint | | |
shared_blks_hit | bigint | | |
shared_blks_read | bigint | | |
shared_blks_dirtied | bigint | | |
shared_blks_written | bigint | | |
local_blks_hit | bigint | | |
local_blks_read | bigint | | |
local_blks_dirtied | bigint | | |
local_blks_written | bigint | | |
temp_blks_read | bigint | | |
temp_blks_written | bigint | | |
shared_blk_read_time | double precision | | |
shared_blk_write_time | double precision | | |
local_blk_read_time | double precision | | |
local_blk_write_time | double precision | | |
temp_blk_read_time | double precision | | |
temp_blk_write_time | double precision | | |
wal_records | bigint | | |
wal_fpi | bigint | | |
wal_bytes | numeric | | |
wal_buffers_full | bigint | | |
jit_functions | bigint | | |
jit_generation_time | double precision | | |
jit_inlining_count | bigint | | |
jit_inlining_time | double precision | | |
jit_optimization_count | bigint | | |
jit_optimization_time | double precision | | |
jit_emission_count | bigint | | |
jit_emission_time | double precision | | |
jit_deform_count | bigint | | |
jit_deform_time | double precision | | |
parallel_workers_to_launch | bigint | | |
parallel_workers_launched | bigint | | |
generic_plan_calls | bigint | | |
custom_plan_calls | bigint | | |
stats_since | timestamp with time zone | | |
minmax_stats_since | timestamp with time zone | | |
userid | oid | | |
dbid | oid | | |
toplevel | boolean | | |
queryid | bigint | | |
query | text | | |
plans | bigint | | |
total_plan_time | double precision | | |
min_plan_time | double precision | | |
max_plan_time | double precision | | |
mean_plan_time | double precision | | |
stddev_plan_time | double precision | | |
calls | bigint | | |
total_exec_time | double precision | | |
min_exec_time | double precision | | |
max_exec_time | double precision | | |
mean_exec_time | double precision | | |
stddev_exec_time | double precision | | |
rows | bigint | | |
shared_blks_hit | bigint | | |
shared_blks_read | bigint | | |
shared_blks_dirtied | bigint | | |
shared_blks_written | bigint | | |
local_blks_hit | bigint | | |
local_blks_read | bigint | | |
local_blks_dirtied | bigint | | |
local_blks_written | bigint | | |
temp_blks_read | bigint | | |
temp_blks_written | bigint | | |
shared_blk_read_time | double precision | | |
shared_blk_write_time | double precision | | |
local_blk_read_time | double precision | | |
local_blk_write_time | double precision | | |
temp_blk_read_time | double precision | | |
temp_blk_write_time | double precision | | |
wal_records | bigint | | |
wal_fpi | bigint | | |
wal_bytes | numeric | | |
jit_functions | bigint | | |
jit_generation_time | double precision | | |
jit_inlining_count | bigint | | |
jit_inlining_time | double precision | | |
jit_optimization_count | bigint | | |
jit_optimization_time | double precision | | |
jit_emission_count | bigint | | |
jit_emission_time | double precision | | |
jit_deform_count | bigint | | |
jit_deform_time | double precision | | |
parallel_workers_to_launch | bigint | | |
parallel_workers_launched | bigint | | |
total_plan_cpu_time | double precision | | |
total_exec_cpu_time | double precision | | |
stats_since | timestamp with time zone | | |
minmax_stats_since | timestamp with time zone | | |
SELECT count(*) > 0 AS has_data FROM pg_stat_statements;
has_data
has_data
----------
t
(1 row)

View file

@ -1,4 +1,4 @@
# Copyright (c) 2022-2026, PostgreSQL Global Development Group
# Copyright (c) 2022-2024, PostgreSQL Global Development Group
pg_stat_statements_sources = files(
'pg_stat_statements.c',
@ -37,7 +37,6 @@ install_data(
kwargs: contrib_data_args,
)
# Note: Test "cleanup" is kept second to last, removing the extension.
tests += {
'name': 'pg_stat_statements',
'sd': meson.current_source_dir(),
@ -56,8 +55,7 @@ tests += {
'privileges',
'extended',
'parallel',
'plancache',
'squashing',
'cpu_time',
'cleanup',
'oldextversions',
],

View file

@ -50,7 +50,6 @@ CREATE FUNCTION pg_stat_statements(IN showtext boolean,
OUT wal_records int8,
OUT wal_fpi int8,
OUT wal_bytes numeric,
OUT wal_buffers_full int8,
OUT jit_functions int8,
OUT jit_generation_time float8,
OUT jit_inlining_count int8,
@ -63,8 +62,8 @@ CREATE FUNCTION pg_stat_statements(IN showtext boolean,
OUT jit_deform_time float8,
OUT parallel_workers_to_launch int8,
OUT parallel_workers_launched int8,
OUT generic_plan_calls int8,
OUT custom_plan_calls int8,
OUT total_plan_cpu_time float8,
OUT total_exec_cpu_time float8,
OUT stats_since timestamp with time zone,
OUT minmax_stats_since timestamp with time zone
)

View file

@ -34,7 +34,7 @@
* in the file to be read or written while holding only shared lock.
*
*
* Copyright (c) 2008-2026, PostgreSQL Global Development Group
* Copyright (c) 2008-2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/pg_stat_statements/pg_stat_statements.c
@ -47,7 +47,6 @@
#include <sys/stat.h>
#include <unistd.h>
#include "access/htup_details.h"
#include "access/parallel.h"
#include "catalog/pg_authid.h"
#include "common/int.h"
@ -72,10 +71,7 @@
#include "utils/memutils.h"
#include "utils/timestamp.h"
PG_MODULE_MAGIC_EXT(
.name = "pg_stat_statements",
.version = PG_VERSION
);
PG_MODULE_MAGIC;
/* Location of permanent stats file (valid when database is shut down) */
#define PGSS_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"
@ -86,7 +82,7 @@ PG_MODULE_MAGIC_EXT(
#define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat"
/* Magic number identifying the stats file format */
static const uint32 PGSS_FILE_HEADER = 0x20250731;
static const uint32 PGSS_FILE_HEADER = 0x20220408;
/* PostgreSQL major version number, changes in which invalidate all entries */
static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
@ -140,12 +136,13 @@ typedef enum pgssStoreKind
* If you add a new key to this struct, make sure to teach pgss_store() to
* zero the padding bytes. Otherwise, things will break, because pgss_hash is
* created using HASH_BLOBS, and thus tag_hash is used to hash this.
*/
typedef struct pgssHashKey
{
Oid userid; /* user OID */
Oid dbid; /* database OID */
int64 queryid; /* query identifier */
uint64 queryid; /* query identifier */
bool toplevel; /* query executed at top level */
} pgssHashKey;
@ -191,7 +188,6 @@ typedef struct Counters
int64 wal_records; /* # of WAL records generated */
int64 wal_fpi; /* # of WAL full page images generated */
uint64 wal_bytes; /* total amount of WAL generated in bytes */
int64 wal_buffers_full; /* # of times the WAL buffers became full */
int64 jit_functions; /* total number of JIT functions emitted */
double jit_generation_time; /* total time to generate jit code */
int64 jit_inlining_count; /* number of times inlining time has been
@ -211,8 +207,7 @@ typedef struct Counters
* to be launched */
int64 parallel_workers_launched; /* # of parallel workers actually
* launched */
int64 generic_plan_calls; /* number of calls using a generic plan */
int64 custom_plan_calls; /* number of calls using a custom plan */
double cpu_time[PGSS_NUMKIND]; /* total CPU (user+sys) time, in msec */
} Counters;
/*
@ -263,7 +258,7 @@ typedef struct pgssSharedState
/* Current nesting depth of planner/ExecutorRun/ProcessUtility calls */
static int nesting_level = 0;
/* Saved hook values */
/* Saved hook values in case of unload */
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
@ -302,6 +297,7 @@ static bool pgss_track_planning = false; /* whether to track planning
* duration */
static bool pgss_save = true; /* whether to save stats across shutdown */
#define pgss_enabled(level) \
(!IsParallelWorker() && \
(pgss_track == PGSS_TRACK_ALL || \
@ -338,12 +334,11 @@ static void pgss_post_parse_analyze(ParseState *pstate, Query *query,
static PlannedStmt *pgss_planner(Query *parse,
const char *query_string,
int cursorOptions,
ParamListInfo boundParams,
ExplainState *es);
ParamListInfo boundParams);
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
ScanDirection direction,
uint64 count);
uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
@ -351,17 +346,17 @@ static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
QueryEnvironment *queryEnv,
DestReceiver *dest, QueryCompletion *qc);
static void pgss_store(const char *query, int64 queryId,
static void pgss_store(const char *query, uint64 queryId,
int query_location, int query_len,
pgssStoreKind kind,
double total_time, uint64 rows,
double total_time, double cpu_time,
uint64 rows,
const BufferUsage *bufusage,
const WalUsage *walusage,
const struct JitInstrumentation *jitusage,
JumbleState *jstate,
int parallel_workers_to_launch,
int parallel_workers_launched,
PlannedStmtOrigin planOrigin);
int parallel_workers_launched);
static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
pgssVersion api_version,
bool showtext);
@ -376,7 +371,25 @@ static char *qtext_fetch(Size query_offset, int query_len,
char *buffer, Size buffer_size);
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
/*
* Compute total CPU time (user + system) in milliseconds from two
* getrusage snapshots. Returns the difference ru1 - ru0.
*/
static double
pgss_cpu_time_diff_ms(const struct rusage *ru1, const struct rusage *ru0)
{
double user_ms,
sys_ms;
user_ms = (double) (ru1->ru_utime.tv_sec - ru0->ru_utime.tv_sec) * 1000.0
+ (double) (ru1->ru_utime.tv_usec - ru0->ru_utime.tv_usec) / 1000.0;
sys_ms = (double) (ru1->ru_stime.tv_sec - ru0->ru_stime.tv_sec) * 1000.0
+ (double) (ru1->ru_stime.tv_usec - ru0->ru_stime.tv_usec) / 1000.0;
return user_ms + sys_ms;
}
static TimestampTz entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only);
static char *generate_normalized_query(JumbleState *jstate, const char *query,
int query_loc, int *query_len_p);
static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
@ -858,7 +871,7 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
{
if (pgss_track_utility && IsA(query->utilityStmt, ExecuteStmt))
{
query->queryId = INT64CONST(0);
query->queryId = UINT64CONST(0);
return;
}
}
@ -876,15 +889,14 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
query->stmt_location,
query->stmt_len,
PGSS_INVALID,
0,
0,
0, 0, /* total_time, cpu_time */
0, /* rows */
NULL,
NULL,
NULL,
jstate,
0,
0,
PLAN_STMT_UNKNOWN);
0);
}
/*
@ -895,8 +907,7 @@ static PlannedStmt *
pgss_planner(Query *parse,
const char *query_string,
int cursorOptions,
ParamListInfo boundParams,
ExplainState *es)
ParamListInfo boundParams)
{
PlannedStmt *result;
@ -907,7 +918,7 @@ pgss_planner(Query *parse,
*/
if (pgss_enabled(nesting_level)
&& pgss_track_planning && query_string
&& parse->queryId != INT64CONST(0))
&& parse->queryId != UINT64CONST(0))
{
instr_time start;
instr_time duration;
@ -915,6 +926,8 @@ pgss_planner(Query *parse,
bufusage;
WalUsage walusage_start,
walusage;
struct rusage rusage_start,
rusage_end;
/* We need to track buffer usage as the planner can access them. */
bufusage_start = pgBufferUsage;
@ -924,6 +937,7 @@ pgss_planner(Query *parse,
* (e.g. setting a hint bit with those being WAL-logged)
*/
walusage_start = pgWalUsage;
getrusage(RUSAGE_SELF, &rusage_start);
INSTR_TIME_SET_CURRENT(start);
nesting_level++;
@ -931,10 +945,10 @@ pgss_planner(Query *parse,
{
if (prev_planner_hook)
result = prev_planner_hook(parse, query_string, cursorOptions,
boundParams, es);
boundParams);
else
result = standard_planner(parse, query_string, cursorOptions,
boundParams, es);
boundParams);
}
PG_FINALLY();
{
@ -944,6 +958,7 @@ pgss_planner(Query *parse,
INSTR_TIME_SET_CURRENT(duration);
INSTR_TIME_SUBTRACT(duration, start);
getrusage(RUSAGE_SELF, &rusage_end);
/* calc differences of buffer counters. */
memset(&bufusage, 0, sizeof(BufferUsage));
@ -959,14 +974,14 @@ pgss_planner(Query *parse,
parse->stmt_len,
PGSS_PLAN,
INSTR_TIME_GET_MILLISEC(duration),
pgss_cpu_time_diff_ms(&rusage_end, &rusage_start),
0,
&bufusage,
&walusage,
NULL,
NULL,
0,
0,
result->planOrigin);
0);
}
else
{
@ -980,10 +995,10 @@ pgss_planner(Query *parse,
{
if (prev_planner_hook)
result = prev_planner_hook(parse, query_string, cursorOptions,
boundParams, es);
boundParams);
else
result = standard_planner(parse, query_string, cursorOptions,
boundParams, es);
boundParams);
}
PG_FINALLY();
{
@ -1011,7 +1026,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
* counting of optimizable statements that are directly contained in
* utility statements.
*/
if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0))
if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
{
/*
* Set up to track total elapsed time in ExecutorRun. Make sure the
@ -1033,15 +1048,16 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
* ExecutorRun hook: all we need do is track nesting depth
*/
static void
pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count)
pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
bool execute_once)
{
nesting_level++;
PG_TRY();
{
if (prev_ExecutorRun)
prev_ExecutorRun(queryDesc, direction, count);
prev_ExecutorRun(queryDesc, direction, count, execute_once);
else
standard_ExecutorRun(queryDesc, direction, count);
standard_ExecutorRun(queryDesc, direction, count, execute_once);
}
PG_FINALLY();
{
@ -1077,9 +1093,9 @@ pgss_ExecutorFinish(QueryDesc *queryDesc)
static void
pgss_ExecutorEnd(QueryDesc *queryDesc)
{
int64 queryId = queryDesc->plannedstmt->queryId;
uint64 queryId = queryDesc->plannedstmt->queryId;
if (queryId != INT64CONST(0) && queryDesc->totaltime &&
if (queryId != UINT64CONST(0) && queryDesc->totaltime &&
pgss_enabled(nesting_level))
{
/*
@ -1093,15 +1109,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
queryDesc->plannedstmt->stmt_location,
queryDesc->plannedstmt->stmt_len,
PGSS_EXEC,
INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total),
queryDesc->totaltime->total * 1000.0, /* convert to msec */
queryDesc->totaltime->cpu_time * 1000.0, /* convert to msec */
queryDesc->estate->es_total_processed,
&queryDesc->totaltime->bufusage,
&queryDesc->totaltime->walusage,
queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
NULL,
queryDesc->estate->es_parallel_workers_to_launch,
queryDesc->estate->es_parallel_workers_launched,
queryDesc->plannedstmt->planOrigin);
queryDesc->estate->es_parallel_workers_launched);
}
if (prev_ExecutorEnd)
@ -1121,7 +1137,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
DestReceiver *dest, QueryCompletion *qc)
{
Node *parsetree = pstmt->utilityStmt;
int64 saved_queryId = pstmt->queryId;
uint64 saved_queryId = pstmt->queryId;
int saved_stmt_location = pstmt->stmt_location;
int saved_stmt_len = pstmt->stmt_len;
bool enabled = pgss_track_utility && pgss_enabled(nesting_level);
@ -1141,7 +1157,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
* only.
*/
if (enabled)
pstmt->queryId = INT64CONST(0);
pstmt->queryId = UINT64CONST(0);
/*
* If it's an EXECUTE statement, we don't track it and don't increment the
@ -1167,9 +1183,12 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
bufusage;
WalUsage walusage_start,
walusage;
struct rusage rusage_start,
rusage_end;
bufusage_start = pgBufferUsage;
walusage_start = pgWalUsage;
getrusage(RUSAGE_SELF, &rusage_start);
INSTR_TIME_SET_CURRENT(start);
nesting_level++;
@ -1202,6 +1221,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
INSTR_TIME_SET_CURRENT(duration);
INSTR_TIME_SUBTRACT(duration, start);
getrusage(RUSAGE_SELF, &rusage_end);
/*
* Track the total number of rows retrieved or affected by the utility
@ -1228,14 +1248,14 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
saved_stmt_len,
PGSS_EXEC,
INSTR_TIME_GET_MILLISEC(duration),
pgss_cpu_time_diff_ms(&rusage_end, &rusage_start),
rows,
&bufusage,
&walusage,
NULL,
NULL,
0,
0,
pstmt->planOrigin);
0);
}
else
{
@ -1289,17 +1309,17 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
* for the arrays in the Counters field.
*/
static void
pgss_store(const char *query, int64 queryId,
pgss_store(const char *query, uint64 queryId,
int query_location, int query_len,
pgssStoreKind kind,
double total_time, uint64 rows,
double total_time, double cpu_time,
uint64 rows,
const BufferUsage *bufusage,
const WalUsage *walusage,
const struct JitInstrumentation *jitusage,
JumbleState *jstate,
int parallel_workers_to_launch,
int parallel_workers_launched,
PlannedStmtOrigin planOrigin)
int parallel_workers_launched)
{
pgssHashKey key;
pgssEntry *entry;
@ -1316,7 +1336,7 @@ pgss_store(const char *query, int64 queryId,
* Nothing to do if compute_query_id isn't enabled and no other module
* computed a query identifier.
*/
if (queryId == INT64CONST(0))
if (queryId == UINT64CONST(0))
return;
/*
@ -1421,6 +1441,7 @@ pgss_store(const char *query, int64 queryId,
entry->counters.calls[kind] += 1;
entry->counters.total_time[kind] += total_time;
entry->counters.cpu_time[kind] += cpu_time;
if (entry->counters.calls[kind] == 1)
{
@ -1480,7 +1501,6 @@ pgss_store(const char *query, int64 queryId,
entry->counters.wal_records += walusage->wal_records;
entry->counters.wal_fpi += walusage->wal_fpi;
entry->counters.wal_bytes += walusage->wal_bytes;
entry->counters.wal_buffers_full += walusage->wal_buffers_full;
if (jitusage)
{
entry->counters.jit_functions += jitusage->created_functions;
@ -1507,12 +1527,6 @@ pgss_store(const char *query, int64 queryId,
entry->counters.parallel_workers_to_launch += parallel_workers_to_launch;
entry->counters.parallel_workers_launched += parallel_workers_launched;
/* plan cache counters */
if (planOrigin == PLAN_STMT_CACHE_GENERIC)
entry->counters.generic_plan_calls++;
else if (planOrigin == PLAN_STMT_CACHE_CUSTOM)
entry->counters.custom_plan_calls++;
SpinLockRelease(&entry->mutex);
}
@ -1532,11 +1546,11 @@ pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
{
Oid userid;
Oid dbid;
int64 queryid;
uint64 queryid;
userid = PG_GETARG_OID(0);
dbid = PG_GETARG_OID(1);
queryid = PG_GETARG_INT64(2);
queryid = (uint64) PG_GETARG_INT64(2);
entry_reset(userid, dbid, queryid, false);
@ -1548,12 +1562,12 @@ pg_stat_statements_reset_1_11(PG_FUNCTION_ARGS)
{
Oid userid;
Oid dbid;
int64 queryid;
uint64 queryid;
bool minmax_only;
userid = PG_GETARG_OID(0);
dbid = PG_GETARG_OID(1);
queryid = PG_GETARG_INT64(2);
queryid = (uint64) PG_GETARG_INT64(2);
minmax_only = PG_GETARG_BOOL(3);
PG_RETURN_TIMESTAMPTZ(entry_reset(userid, dbid, queryid, minmax_only));
@ -1579,9 +1593,9 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS)
#define PG_STAT_STATEMENTS_COLS_V1_9 33
#define PG_STAT_STATEMENTS_COLS_V1_10 43
#define PG_STAT_STATEMENTS_COLS_V1_11 49
#define PG_STAT_STATEMENTS_COLS_V1_12 52
#define PG_STAT_STATEMENTS_COLS_V1_13 54
#define PG_STAT_STATEMENTS_COLS 54 /* maximum of above */
#define PG_STAT_STATEMENTS_COLS_V1_12 51
#define PG_STAT_STATEMENTS_COLS_V1_13 53
#define PG_STAT_STATEMENTS_COLS 53 /* maximum of above */
/*
* Retrieve statement statistics.
@ -1992,10 +2006,6 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
Int32GetDatum(-1));
values[i++] = wal_bytes;
}
if (api_version >= PGSS_V1_12)
{
values[i++] = Int64GetDatumFast(tmp.wal_buffers_full);
}
if (api_version >= PGSS_V1_10)
{
values[i++] = Int64GetDatumFast(tmp.jit_functions);
@ -2019,8 +2029,8 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo,
}
if (api_version >= PGSS_V1_13)
{
values[i++] = Int64GetDatumFast(tmp.generic_plan_calls);
values[i++] = Int64GetDatumFast(tmp.custom_plan_calls);
values[i++] = Float8GetDatumFast(tmp.cpu_time[PGSS_PLAN]);
values[i++] = Float8GetDatumFast(tmp.cpu_time[PGSS_EXEC]);
}
if (api_version >= PGSS_V1_11)
{
@ -2710,13 +2720,13 @@ if (e) { \
* Reset entries corresponding to parameters passed.
*/
static TimestampTz
entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only)
entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only)
{
HASH_SEQ_STATUS hash_seq;
pgssEntry *entry;
FILE *qfile;
int64 num_entries;
int64 num_remove = 0;
long num_entries;
long num_remove = 0;
pgssHashKey key;
TimestampTz stats_reset;
@ -2730,7 +2740,7 @@ entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only)
stats_reset = GetCurrentTimestamp();
if (userid != 0 && dbid != 0 && queryid != INT64CONST(0))
if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0))
{
/* If all the parameters are available, use the fast path. */
memset(&key, 0, sizeof(pgssHashKey));
@ -2753,7 +2763,7 @@ entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only)
SINGLE_ENTRY_RESET(entry);
}
else if (userid != 0 || dbid != 0 || queryid != INT64CONST(0))
else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0))
{
/* Reset entries corresponding to valid parameters. */
hash_seq_init(&hash_seq, pgss_hash);
@ -2849,13 +2859,13 @@ generate_normalized_query(JumbleState *jstate, const char *query,
{
char *norm_query;
int query_len = *query_len_p;
int norm_query_buflen, /* Space allowed for norm_query */
int i,
norm_query_buflen, /* Space allowed for norm_query */
len_to_wrt, /* Length (in bytes) to write */
quer_loc = 0, /* Source query byte location */
n_quer_loc = 0, /* Normalized query byte location */
last_off = 0, /* Offset from start for previous tok */
last_tok_len = 0; /* Length (in bytes) of that tok */
int num_constants_replaced = 0;
/*
* Get constants' lengths (core system only gives us locations). Note
@ -2875,23 +2885,12 @@ generate_normalized_query(JumbleState *jstate, const char *query,
/* Allocate result buffer */
norm_query = palloc(norm_query_buflen + 1);
for (int i = 0; i < jstate->clocations_count; i++)
for (i = 0; i < jstate->clocations_count; i++)
{
int off, /* Offset from start for cur tok */
tok_len; /* Length (in bytes) of that tok */
/*
* If we have an external param at this location, but no lists are
* being squashed across the query, then we skip here; this will make
* us print the characters found in the original query that represent
* the parameter in the next iteration (or after the loop is done),
* which is a bit odd but seems to work okay in most cases.
*/
if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
continue;
off = jstate->clocations[i].location;
/* Adjust recorded location if we're dealing with partial string */
off -= query_loc;
@ -2903,21 +2902,15 @@ generate_normalized_query(JumbleState *jstate, const char *query,
/* Copy next chunk (what precedes the next constant) */
len_to_wrt = off - last_off;
len_to_wrt -= last_tok_len;
Assert(len_to_wrt >= 0);
memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
n_quer_loc += len_to_wrt;
/*
* And insert a param symbol in place of the constant token; and, if
* we have a squashable list, insert a placeholder comment starting
* from the list's second value.
*/
n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
num_constants_replaced + 1 + jstate->highest_extern_param_id,
jstate->clocations[i].squashed ? " /*, ... */" : "");
num_constants_replaced++;
/* And insert a param symbol in place of the constant token */
n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
i + 1 + jstate->highest_extern_param_id);
/* move forward */
quer_loc = off + tok_len;
last_off = off;
last_tok_len = tok_len;
@ -2954,8 +2947,9 @@ generate_normalized_query(JumbleState *jstate, const char *query,
* have originated from within the authoritative parser, this should not be
* a problem.
*
* Multiple constants can have the same location. We reset lengths of those
* past the first to -1 so that they can later be ignored.
* Duplicate constant pointers are possible, and will have their lengths
* marked as '-1', so that they are later ignored. (Actually, we assume the
* lengths were initialized as -1 to start with, and don't change them here.)
*
* If query_loc > 0, then "query" has been advanced by that much compared to
* the original string start, so we need to translate the provided locations
@ -2975,6 +2969,8 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query,
core_yy_extra_type yyextra;
core_YYSTYPE yylval;
YYLTYPE yylloc;
int last_loc = -1;
int i;
/*
* Sort the records by location so that we can process them in order while
@ -2991,30 +2987,24 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query,
&ScanKeywords,
ScanKeywordTokens);
/* we don't want to re-emit any escape string warnings */
yyextra.escape_string_warning = false;
/* Search for each constant, in sequence */
for (int i = 0; i < jstate->clocations_count; i++)
for (i = 0; i < jstate->clocations_count; i++)
{
int loc;
int loc = locs[i].location;
int tok;
/* Ignore constants after the first one in the same location */
if (i > 0 && locs[i].location == locs[i - 1].location)
{
locs[i].length = -1;
continue;
}
if (locs[i].squashed)
continue; /* squashable list, ignore */
/* Adjust recorded location if we're dealing with partial string */
loc = locs[i].location - query_loc;
loc -= query_loc;
Assert(loc >= 0);
/*
* We have a valid location for a constant that's not a dupe. Lex
* tokens until we find the desired constant.
*/
if (loc <= last_loc)
continue; /* Duplicate constant, ignore */
/* Lex tokens until we find the desired constant */
for (;;)
{
tok = core_yylex(&yylval, &yylloc, yyscanner);
@ -3060,6 +3050,8 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query,
/* If we hit end-of-string, give up, leaving remaining lengths -1 */
if (tok == 0)
break;
last_loc = loc;
}
scanner_finish(yyscanner);

View file

@ -0,0 +1,33 @@
--
-- Tests for CPU time tracking
--
SET pg_stat_statements.track_utility = FALSE;
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
-- Run a CPU-intensive query
SELECT count(*) FROM generate_series(1, 100000);
-- Verify CPU time columns are populated and sensible
SELECT query,
total_exec_cpu_time > 0 AS has_cpu_time,
total_exec_cpu_time <= total_exec_time AS cpu_le_wall
FROM pg_stat_statements
WHERE query LIKE '%generate_series%'
ORDER BY query COLLATE "C";
-- Test planning CPU time
SET pg_stat_statements.track_planning = TRUE;
SELECT pg_stat_statements_reset() IS NOT NULL AS t;
SELECT count(*) FROM generate_series(1, 100000);
SELECT query,
total_exec_cpu_time >= 0 AS has_exec_cpu,
total_plan_cpu_time >= 0 AS has_plan_cpu
FROM pg_stat_statements
WHERE query LIKE '%generate_series%'
ORDER BY query COLLATE "C";
-- Cleanup
SET pg_stat_statements.track_planning = FALSE;
SELECT pg_stat_statements_reset() IS NOT NULL AS t;

View file

@ -4,7 +4,7 @@
* functions for instrumentation of plan execution
*
*
* Copyright (c) 2001-2026, PostgreSQL Global Development Group
* Copyright (c) 2001-2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/executor/instrument.c
@ -13,6 +13,7 @@
*/
#include "postgres.h"
#include <sys/resource.h>
#include <unistd.h>
#include "executor/instrument.h"
@ -25,6 +26,23 @@ static WalUsage save_pgWalUsage;
static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
/*
* Compute the CPU time delta (user + system) between two rusage snapshots,
* in seconds.
*/
static inline double
rusage_cpu_time_diff(const struct rusage *ru_end, const struct rusage *ru_start)
{
double user_sec,
sys_sec;
user_sec = (double) (ru_end->ru_utime.tv_sec - ru_start->ru_utime.tv_sec)
+ (double) (ru_end->ru_utime.tv_usec - ru_start->ru_utime.tv_usec) / 1000000.0;
sys_sec = (double) (ru_end->ru_stime.tv_sec - ru_start->ru_stime.tv_sec)
+ (double) (ru_end->ru_stime.tv_usec - ru_start->ru_stime.tv_usec) / 1000000.0;
return user_sec + sys_sec;
}
/* Allocate new instrumentation structure(s) */
Instrumentation *
@ -34,11 +52,12 @@ InstrAlloc(int n, int instrument_options, bool async_mode)
/* initialize all fields to zeroes, then modify as needed */
instr = palloc0(n * sizeof(Instrumentation));
if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL | INSTRUMENT_CPUUSAGE))
{
bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
bool need_cpuusage = (instrument_options & INSTRUMENT_CPUUSAGE) != 0;
int i;
for (i = 0; i < n; i++)
@ -46,6 +65,7 @@ InstrAlloc(int n, int instrument_options, bool async_mode)
instr[i].need_bufusage = need_buffers;
instr[i].need_walusage = need_wal;
instr[i].need_timer = need_timer;
instr[i].need_cpuusage = need_cpuusage;
instr[i].async_mode = async_mode;
}
}
@ -61,6 +81,7 @@ InstrInit(Instrumentation *instr, int instrument_options)
instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
instr->need_cpuusage = (instrument_options & INSTRUMENT_CPUUSAGE) != 0;
}
/* Entry to a plan node */
@ -77,6 +98,10 @@ InstrStartNode(Instrumentation *instr)
if (instr->need_walusage)
instr->walusage_start = pgWalUsage;
/* save CPU usage at node entry, if needed */
if (instr->need_cpuusage)
getrusage(RUSAGE_SELF, &instr->rusage_start);
}
/* Exit from a plan node */
@ -89,6 +114,22 @@ InstrStopNode(Instrumentation *instr, double nTuples)
/* count the returned tuples */
instr->tuplecount += nTuples;
/*
* Capture CPU usage before wall clock so the CPU measurement window is
* strictly contained within the wall clock window. InstrStartNode
* captures rusage after the wall clock start, so:
* [wall_start ... cpu_start ... cpu_end ... wall_end]
* This guarantees cpu_time <= total (wall) time.
*/
if (instr->need_cpuusage)
{
struct rusage rusage_end;
getrusage(RUSAGE_SELF, &rusage_end);
instr->counter_cpu += rusage_cpu_time_diff(&rusage_end,
&instr->rusage_start);
}
/* let's update the time only if the timer was requested */
if (instr->need_timer)
{
@ -114,7 +155,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
if (!instr->running)
{
instr->running = true;
instr->firsttuple = instr->counter;
instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
}
else
{
@ -123,7 +164,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
* this might be the first tuple
*/
if (instr->async_mode && save_tuplecount < 1.0)
instr->firsttuple = instr->counter;
instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
}
}
@ -139,6 +180,8 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
void
InstrEndLoop(Instrumentation *instr)
{
double totaltime;
/* Skip if nothing has happened, or already shut down */
if (!instr->running)
return;
@ -147,8 +190,11 @@ InstrEndLoop(Instrumentation *instr)
elog(ERROR, "InstrEndLoop called on running node");
/* Accumulate per-cycle statistics into totals */
INSTR_TIME_ADD(instr->startup, instr->firsttuple);
INSTR_TIME_ADD(instr->total, instr->counter);
totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
instr->startup += instr->firsttuple;
instr->total += totaltime;
instr->cpu_time += instr->counter_cpu;
instr->ntuples += instr->tuplecount;
instr->nloops += 1;
@ -156,7 +202,8 @@ InstrEndLoop(Instrumentation *instr)
instr->running = false;
INSTR_TIME_SET_ZERO(instr->starttime);
INSTR_TIME_SET_ZERO(instr->counter);
INSTR_TIME_SET_ZERO(instr->firsttuple);
instr->counter_cpu = 0;
instr->firsttuple = 0;
instr->tuplecount = 0;
}
@ -169,15 +216,16 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add)
dst->running = true;
dst->firsttuple = add->firsttuple;
}
else if (dst->running && add->running &&
INSTR_TIME_LT(dst->firsttuple, add->firsttuple))
else if (dst->running && add->running && dst->firsttuple > add->firsttuple)
dst->firsttuple = add->firsttuple;
INSTR_TIME_ADD(dst->counter, add->counter);
dst->tuplecount += add->tuplecount;
INSTR_TIME_ADD(dst->startup, add->startup);
INSTR_TIME_ADD(dst->total, add->total);
dst->counter_cpu += add->counter_cpu;
dst->startup += add->startup;
dst->total += add->total;
dst->cpu_time += add->cpu_time;
dst->ntuples += add->ntuples;
dst->ntuples2 += add->ntuples2;
dst->nloops += add->nloops;
@ -277,8 +325,6 @@ WalUsageAdd(WalUsage *dst, WalUsage *add)
dst->wal_bytes += add->wal_bytes;
dst->wal_records += add->wal_records;
dst->wal_fpi += add->wal_fpi;
dst->wal_fpi_bytes += add->wal_fpi_bytes;
dst->wal_buffers_full += add->wal_buffers_full;
}
void
@ -287,6 +333,4 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
dst->wal_bytes += add->wal_bytes - sub->wal_bytes;
dst->wal_records += add->wal_records - sub->wal_records;
dst->wal_fpi += add->wal_fpi - sub->wal_fpi;
dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes;
dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full;
}
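
The ordering comment in InstrStopNode above argues that, because the rusage snapshots are taken strictly inside the wall-clock window, the accumulated CPU time cannot exceed the node's total time (up to clock granularity). That nesting can be seen in isolation with the standalone sketch below; it is illustrative only, mirrors the InstrStartNode/InstrStopNode ordering rather than calling them, and the file name cpu_window_nesting.c is invented.

/* cpu_window_nesting.c (illustrative): cc -O2 -o cpu_window_nesting cpu_window_nesting.c -lm */
#include <math.h>
#include <stdio.h>
#include <sys/resource.h>
#include <time.h>

static double
cpu_seconds(const struct rusage *r)
{
    return r->ru_utime.tv_sec + r->ru_utime.tv_usec / 1e6
        + r->ru_stime.tv_sec + r->ru_stime.tv_usec / 1e6;
}

int
main(void)
{
    struct timespec w0, w1;
    struct rusage c0, c1;
    volatile double sink = 0.0;
    double wall, cpu;

    clock_gettime(CLOCK_MONOTONIC, &w0);    /* wall start first ... */
    getrusage(RUSAGE_SELF, &c0);            /* ... then CPU start */

    for (int i = 0; i < 5 * 1000 * 1000; i++)
        sink += sqrt((double) i);           /* the "node" being measured */

    getrusage(RUSAGE_SELF, &c1);            /* CPU end first ... */
    clock_gettime(CLOCK_MONOTONIC, &w1);    /* ... then wall end */

    wall = (w1.tv_sec - w0.tv_sec) + (w1.tv_nsec - w0.tv_nsec) / 1e9;
    cpu = cpu_seconds(&c1) - cpu_seconds(&c0);

    /* [c0, c1] is nested inside [w0, w1], so cpu <= wall (modulo clock granularity). */
    printf("wall=%.4f s cpu=%.4f s nested cpu <= wall: %s\n",
           wall, cpu, cpu <= wall ? "yes" : "no");
    return 0;
}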

View file

@ -4,7 +4,7 @@
* definitions for run-time statistics collection
*
*
* Copyright (c) 2001-2026, PostgreSQL Global Development Group
* Copyright (c) 2001-2024, PostgreSQL Global Development Group
*
* src/include/executor/instrument.h
*
@ -13,6 +13,8 @@
#ifndef INSTRUMENT_H
#define INSTRUMENT_H
#include <sys/resource.h>
#include "portability/instr_time.h"
@ -53,8 +55,6 @@ typedef struct WalUsage
int64 wal_records; /* # of WAL records produced */
int64 wal_fpi; /* # of WAL full page images produced */
uint64 wal_bytes; /* size of WAL records produced */
uint64 wal_fpi_bytes; /* size of WAL full page images produced */
int64 wal_buffers_full; /* # of times the WAL buffers became full */
} WalUsage;
/* Flag bits included in InstrAlloc's instrument_options bitmask */
@ -64,6 +64,7 @@ typedef enum InstrumentOption
INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */
INSTRUMENT_ROWS = 1 << 2, /* needs row count */
INSTRUMENT_WAL = 1 << 3, /* needs WAL usage */
INSTRUMENT_CPUUSAGE = 1 << 4, /* needs CPU usage via getrusage() */
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
@ -73,18 +74,22 @@ typedef struct Instrumentation
bool need_timer; /* true if we need timer data */
bool need_bufusage; /* true if we need buffer usage data */
bool need_walusage; /* true if we need WAL usage data */
bool need_cpuusage; /* true if we need CPU usage data */
bool async_mode; /* true if node is in async mode */
/* Info about current plan cycle: */
bool running; /* true if we've completed first tuple */
instr_time starttime; /* start time of current iteration of node */
instr_time counter; /* accumulated runtime for this node */
instr_time firsttuple; /* time for first tuple of this cycle */
double firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
BufferUsage bufusage_start; /* buffer usage at start */
WalUsage walusage_start; /* WAL usage at start */
struct rusage rusage_start; /* CPU usage at start */
double counter_cpu; /* per-cycle CPU time accumulator (seconds) */
/* Accumulated statistics across all completed cycles: */
instr_time startup; /* total startup time */
instr_time total; /* total time */
double startup; /* total startup time (in seconds) */
double total; /* total time (in seconds) */
double cpu_time; /* total CPU time: user + system (seconds) */
double ntuples; /* total tuples produced */
double ntuples2; /* secondary node-specific tuple counter */
double nloops; /* # of run cycles for this node */
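
For backend code, the new INSTRUMENT_CPUUSAGE flag and the counter_cpu/cpu_time fields are meant to be used alongside the existing timer and buffer options. A minimal sketch of a hypothetical caller follows; profile_one_cycle and run_node are invented names, and the snippet assumes it is compiled as part of the backend (so postgres.h, palloc and elog are available), with total and cpu_time holding seconds as defined above.

/* Hypothetical backend-side consumer of INSTRUMENT_CPUUSAGE (not part of the commit). */
#include "postgres.h"

#include "executor/instrument.h"

static void
profile_one_cycle(void (*run_node) (void))
{
    /* Request wall-clock and CPU accounting for a single node. */
    Instrumentation *instr = InstrAlloc(1, INSTRUMENT_TIMER | INSTRUMENT_CPUUSAGE, false);

    InstrStartNode(instr);      /* snapshots starttime and rusage_start */
    run_node();                 /* the work being measured */
    InstrStopNode(instr, 0);    /* accumulates counter and counter_cpu */
    InstrEndLoop(instr);        /* folds the cycle into total and cpu_time */

    elog(LOG, "wall=%.3f ms cpu=%.3f ms off-cpu=%.3f ms",
         instr->total * 1000.0,
         instr->cpu_time * 1000.0,
         (instr->total - instr->cpu_time) * 1000.0);
}

pgss_ExecutorEnd above does the equivalent by reading queryDesc->totaltime->total and queryDesc->totaltime->cpu_time and converting them to milliseconds before handing them to pgss_store.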