diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile index c27e9529bb6..035e021eff6 100644 --- a/contrib/pg_stat_statements/Makefile +++ b/contrib/pg_stat_statements/Makefile @@ -19,26 +19,9 @@ PGFILEDESC = "pg_stat_statements - execution statistics of SQL statements" LDFLAGS_SL += $(filter -lm, $(LIBS)) REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_stat_statements/pg_stat_statements.conf - -# Note: Test "cleanup" is kept second to last, removing the extension. -REGRESS = \ - select \ - dml \ - cursors \ - utility \ - level_tracking \ - planning \ - user_activity \ - wal \ - entry_timestamp \ - privileges \ - extended \ - parallel \ - plancache \ - squashing \ - cleanup \ - oldextversions - +REGRESS = select dml cursors utility level_tracking planning \ + user_activity wal entry_timestamp privileges extended \ + parallel cpu_time cleanup oldextversions # Disabled because these tests require "shared_preload_libraries=pg_stat_statements", # which typical installcheck users do not have (e.g. buildfarm clients). NO_INSTALLCHECK = 1 diff --git a/contrib/pg_stat_statements/bench_cpu_time.c b/contrib/pg_stat_statements/bench_cpu_time.c new file mode 100644 index 00000000000..dfa09bf263c --- /dev/null +++ b/contrib/pg_stat_statements/bench_cpu_time.c @@ -0,0 +1,163 @@ +/* + * bench_cpu_time.c + * + * Benchmark to demonstrate the difference between wall clock time + * and CPU time (via getrusage) under varying system load. + * + * Under no load: wall ~= cpu + * Under CPU saturation (e.g. stress-ng): wall >> cpu (scheduling delay) + * Under I/O saturation: wall >> cpu (I/O wait) + * + * Usage: + * cc -O2 -o bench_cpu_time bench_cpu_time.c -lm + * ./bench_cpu_time [iterations] + * + * Then compare with: + * stress-ng --cpu $(nproc) --timeout 30s & + * ./bench_cpu_time + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_ITERATIONS 10 +#define WORK_SIZE 5000000 /* number of math ops per iteration */ + +static double +wall_time_ms(struct timespec *start, struct timespec *end) +{ + return (end->tv_sec - start->tv_sec) * 1000.0 + + (end->tv_nsec - start->tv_nsec) / 1000000.0; +} + +static double +rusage_cpu_ms(struct rusage *start, struct rusage *end) +{ + double user_ms = (end->ru_utime.tv_sec - start->ru_utime.tv_sec) * 1000.0 + + (end->ru_utime.tv_usec - start->ru_utime.tv_usec) / 1000.0; + double sys_ms = (end->ru_stime.tv_sec - start->ru_stime.tv_sec) * 1000.0 + + (end->ru_stime.tv_usec - start->ru_stime.tv_usec) / 1000.0; + return user_ms + sys_ms; +} + +/* + * Pure CPU-bound work: compute a bunch of sqrt/sin to keep the CPU busy. + * volatile to prevent the compiler from optimizing it away. + */ +static volatile double sink; + +static void +cpu_bound_work(void) +{ + double acc = 0.0; + + for (int i = 0; i < WORK_SIZE; i++) + acc += sqrt((double)i) * sin((double)i); + + sink = acc; +} + +/* + * Mixed I/O + CPU work: write to a temp file between CPU bursts. 
+ */ +static void +io_bound_work(void) +{ + char buf[4096]; + char path[] = "/tmp/bench_cpu_time_XXXXXX"; + int fd; + double acc = 0.0; + + memset(buf, 'x', sizeof(buf)); + fd = mkstemp(path); + if (fd < 0) + { + perror("mkstemp"); + return; + } + unlink(path); + + for (int i = 0; i < WORK_SIZE / 10; i++) + { + acc += sqrt((double)i); + if (i % 1000 == 0) + { + write(fd, buf, sizeof(buf)); + fsync(fd); + } + } + + close(fd); + sink = acc; +} + +static void +run_benchmark(const char *label, void (*workfn)(void), int iterations) +{ + double total_wall = 0.0; + double total_cpu = 0.0; + + printf("\n=== %s (%d iterations) ===\n", label, iterations); + printf("%4s %10s %10s %10s %7s\n", + "#", "wall(ms)", "cpu(ms)", "off-cpu(ms)", "cpu%"); + + for (int i = 0; i < iterations; i++) + { + struct timespec ts_start, ts_end; + struct rusage ru_start, ru_end; + + getrusage(RUSAGE_SELF, &ru_start); + clock_gettime(CLOCK_MONOTONIC, &ts_start); + + workfn(); + + clock_gettime(CLOCK_MONOTONIC, &ts_end); + getrusage(RUSAGE_SELF, &ru_end); + + double wall = wall_time_ms(&ts_start, &ts_end); + double cpu = rusage_cpu_ms(&ru_start, &ru_end); + double off = wall - cpu; + double pct = (wall > 0) ? (cpu / wall) * 100.0 : 0.0; + + printf("%4d %10.2f %10.2f %10.2f %6.1f%%\n", + i + 1, wall, cpu, off, pct); + + total_wall += wall; + total_cpu += cpu; + } + + double avg_wall = total_wall / iterations; + double avg_cpu = total_cpu / iterations; + double avg_off = avg_wall - avg_cpu; + double avg_pct = (avg_wall > 0) ? (avg_cpu / avg_wall) * 100.0 : 0.0; + + printf("---- ---------- ---------- ---------- -------\n"); + printf(" avg %10.2f %10.2f %10.2f %6.1f%%\n", + avg_wall, avg_cpu, avg_off, avg_pct); +} + +int +main(int argc, char *argv[]) +{ + int iterations = DEFAULT_ITERATIONS; + + if (argc > 1) + iterations = atoi(argv[1]); + if (iterations < 1) + iterations = 1; + + printf("PID: %d\n", getpid()); + printf("Work size: %d ops per iteration\n", WORK_SIZE); + + run_benchmark("CPU-bound work", cpu_bound_work, iterations); + run_benchmark("I/O-bound work", io_bound_work, iterations); + + return 0; +} diff --git a/contrib/pg_stat_statements/benchmark/run_benchmark.sh b/contrib/pg_stat_statements/benchmark/run_benchmark.sh new file mode 100755 index 00000000000..f8ec8cb81e8 --- /dev/null +++ b/contrib/pg_stat_statements/benchmark/run_benchmark.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# +# run_benchmark.sh - Benchmark pg_stat_statements CPU time tracking +# +# Runs pgbench at various concurrency levels, with and without CPU saturation, +# then queries pg_stat_statements to compare wall time vs CPU time. 
+# +set -euo pipefail + +export PATH="$HOME/pginstall/bin:$PATH" +DB="bench" +NCPU=$(nproc) +DURATION=30 # seconds per pgbench run +PGBENCH_SCALE=10 # matches -s 10 from setup +RESULTS_FILE="$HOME/benchmark_results.txt" + +# Concurrency levels to test +CLIENTS=(1 4 8 $((NCPU * 2)) $((NCPU * 4))) + +# Stress-ng intensity: 0 = no load, N = N CPU-burning workers +STRESS_LEVELS=(0 $NCPU $((NCPU * 2))) + +header() { + echo "" + echo "================================================================" + echo " $1" + echo "================================================================" +} + +run_one() { + local clients=$1 + local stress=$2 + local label="clients=$clients, stress=$stress" + local stress_pid="" + + echo "" + echo "--- $label ---" + + # Reset stats + psql "$DB" -qc "SELECT pg_stat_statements_reset();" > /dev/null + + # Start CPU stress if requested + if [ "$stress" -gt 0 ]; then + stress-ng --cpu "$stress" --timeout $((DURATION + 10))s &>/dev/null & + stress_pid=$! + sleep 1 + fi + + # Run pgbench + pgbench -c "$clients" -j "$clients" -T "$DURATION" "$DB" 2>&1 | grep -E "^(tps|latency|number)" + + # Stop stress + if [ -n "$stress_pid" ]; then + kill "$stress_pid" 2>/dev/null + wait "$stress_pid" 2>/dev/null || true + fi + + # Query pg_stat_statements for the pgbench queries + psql "$DB" -P pager=off -c " + SELECT + left(query, 60) AS query, + calls, + round(total_exec_time::numeric, 2) AS wall_ms, + round(total_exec_cpu_time::numeric, 2) AS cpu_ms, + round((total_exec_time - total_exec_cpu_time)::numeric, 2) AS off_cpu_ms, + CASE WHEN total_exec_time > 0 + THEN round((total_exec_cpu_time / total_exec_time * 100)::numeric, 1) + ELSE 0 + END AS cpu_pct + FROM pg_stat_statements + WHERE query NOT LIKE '%pg_stat_statements%' + AND query NOT LIKE '%BEGIN%' + AND query NOT LIKE '%END%' + AND calls > 10 + ORDER BY total_exec_time DESC + LIMIT 8; + " + + # Summary line for the aggregate + psql "$DB" -t -A -c " + SELECT + '$clients' AS clients, + '$stress' AS stress_workers, + round(sum(total_exec_time)::numeric, 2) AS total_wall_ms, + round(sum(total_exec_cpu_time)::numeric, 2) AS total_cpu_ms, + round(sum(total_exec_time - total_exec_cpu_time)::numeric, 2) AS total_off_cpu_ms, + CASE WHEN sum(total_exec_time) > 0 + THEN round((sum(total_exec_cpu_time) / sum(total_exec_time) * 100)::numeric, 1) + ELSE 0 + END AS cpu_pct + FROM pg_stat_statements + WHERE query NOT LIKE '%pg_stat_statements%' + AND calls > 0; + " >> "$RESULTS_FILE" +} + +# ============================================================ +# Main +# ============================================================ + +echo "Benchmark: pg_stat_statements CPU time tracking" +echo "Instance: $(uname -n) ($(nproc) vCPUs)" +echo "Duration per run: ${DURATION}s" +echo "" + +# Initialize results file with CSV header +echo "clients,stress_workers,total_wall_ms,total_cpu_ms,total_off_cpu_ms,cpu_pct" > "$RESULTS_FILE" + +header "Verifying CPU time columns exist" +psql "$DB" -c " + SELECT total_plan_cpu_time, total_exec_cpu_time + FROM pg_stat_statements + LIMIT 1; +" + +for stress in "${STRESS_LEVELS[@]}"; do + if [ "$stress" -eq 0 ]; then + header "NO CPU STRESS" + else + header "CPU STRESS: $stress workers on $NCPU vCPUs" + fi + + for clients in "${CLIENTS[@]}"; do + run_one "$clients" "$stress" + done +done + +header "SUMMARY" +echo "" +echo "Instance: $(uname -n) ($(nproc) vCPUs)" +echo "" +echo "clients | stress | wall_ms | cpu_ms | off_cpu_ms | cpu%" +echo "--------|--------|---------------|---------------|---------------|------" +while 
IFS='|' read -r c s w cpu off pct; do + printf "%-8s| %-7s| %13s | %13s | %13s | %s%%\n" "$c" "$s" "$w" "$cpu" "$off" "$pct" +done < "$RESULTS_FILE" + +echo "" +echo "Full results saved to: $RESULTS_FILE" +echo "" +echo "Key insight: as stress workers increase, cpu% should decrease" +echo "(more off-CPU time = scheduling delay from CPU contention)" diff --git a/contrib/pg_stat_statements/benchmark/setup.sh b/contrib/pg_stat_statements/benchmark/setup.sh new file mode 100755 index 00000000000..55e4504ae4e --- /dev/null +++ b/contrib/pg_stat_statements/benchmark/setup.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# setup.sh - Build PostgreSQL from source and initialize a pgbench database +# with pg_stat_statements enabled. +# +# Run from the postgres source root: bash contrib/pg_stat_statements/benchmark/setup.sh +# +set -euo pipefail + +PGDATA="$HOME/pgdata" +PGINSTALL="$HOME/pginstall" +NCPU=$(nproc) + +echo "=== Building PostgreSQL (using $NCPU cores) ===" + +# Wait for user_data to finish installing deps +while [ ! -f /tmp/setup_done ]; do + echo "Waiting for build dependencies to be installed..." + sleep 5 +done + +./configure \ + --prefix="$PGINSTALL" \ + --without-icu \ + --with-openssl \ + 2>&1 | tail -3 + +make -j"$NCPU" 2>&1 | tail -5 +make install 2>&1 | tail -3 + +echo "=== Building pg_stat_statements ===" +make -C contrib/pg_stat_statements install 2>&1 | tail -3 + +# Also build pgbench +make -C contrib/pgbench install 2>&1 | tail -3 || \ + make -C src/bin/pgbench install 2>&1 | tail -3 + +export PATH="$PGINSTALL/bin:$PATH" + +echo "=== Initializing database ===" +# Stop any existing server before wiping data +pg_ctl -D "$PGDATA" stop 2>/dev/null || true +rm -rf "$PGDATA" +initdb -D "$PGDATA" 2>&1 | tail -3 + +# Configure pg_stat_statements +cat >> "$PGDATA/postgresql.conf" <&1 | tail -3 + +echo "" +echo "=== Setup complete ===" +echo "PostgreSQL is running with pg_stat_statements enabled." +echo "Run: bash contrib/pg_stat_statements/benchmark/run_benchmark.sh" diff --git a/contrib/pg_stat_statements/benchmark/terraform/.gitignore b/contrib/pg_stat_statements/benchmark/terraform/.gitignore new file mode 100644 index 00000000000..81736110ea7 --- /dev/null +++ b/contrib/pg_stat_statements/benchmark/terraform/.gitignore @@ -0,0 +1,5 @@ +*.pem +*.tfstate +*.tfstate.backup +.terraform/ +.terraform.lock.hcl diff --git a/contrib/pg_stat_statements/benchmark/terraform/main.tf b/contrib/pg_stat_statements/benchmark/terraform/main.tf new file mode 100644 index 00000000000..a702d7ce50f --- /dev/null +++ b/contrib/pg_stat_statements/benchmark/terraform/main.tf @@ -0,0 +1,128 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + http = { + source = "hashicorp/http" + version = "~> 3.0" + } + tls = { + source = "hashicorp/tls" + version = "~> 4.0" + } + local = { + source = "hashicorp/local" + version = "~> 2.0" + } + } +} + +provider "aws" { + region = var.region +} + +# Auto-detect the caller's public IP if allowed_ssh_cidr is "auto" +data "http" "my_ip" { + count = var.allowed_ssh_cidr == "auto" ? 1 : 0 + url = "https://checkip.amazonaws.com" +} + +locals { + ssh_cidr = var.allowed_ssh_cidr == "auto" ? 
"${trimspace(data.http.my_ip[0].response_body)}/32" : var.allowed_ssh_cidr +} + +# Generate an SSH key pair +resource "tls_private_key" "bench" { + algorithm = "ED25519" +} + +resource "aws_key_pair" "bench" { + key_name_prefix = "pgss-bench-" + public_key = tls_private_key.bench.public_key_openssh +} + +# Write the private key to a local file +resource "local_file" "ssh_key" { + content = tls_private_key.bench.private_key_openssh + filename = "${path.module}/pgss-bench-key.pem" + file_permission = "0600" +} + +# Use the default VPC +data "aws_vpc" "default" { + default = true +} + +# Find the latest Amazon Linux 2023 AMI +data "aws_ami" "al2023" { + most_recent = true + owners = ["amazon"] + + filter { + name = "name" + values = ["al2023-ami-*-x86_64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +resource "aws_security_group" "pgss_bench" { + name_prefix = "pgss-bench-" + description = "SSH access for pg_stat_statements benchmark" + vpc_id = data.aws_vpc.default.id + + ingress { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = [local.ssh_cidr] + description = "SSH" + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + description = "All outbound" + } + + tags = { + Name = "pgss-cpu-time-benchmark" + } +} + +resource "aws_instance" "bench" { + ami = data.aws_ami.al2023.id + instance_type = var.instance_type + key_name = aws_key_pair.bench.key_name + vpc_security_group_ids = [aws_security_group.pgss_bench.id] + + root_block_device { + volume_size = 30 + volume_type = "gp3" + } + + user_data = <<-USERDATA + #!/bin/bash + set -euo pipefail + + # Install build dependencies for PostgreSQL + dnf groupinstall -y "Development Tools" + dnf install -y \ + readline-devel zlib-devel libxml2-devel openssl-devel \ + flex bison perl git rsync \ + stress-ng + + echo "=== Build dependencies installed ===" > /tmp/setup_done + USERDATA + + tags = { + Name = "pgss-cpu-time-benchmark" + } +} diff --git a/contrib/pg_stat_statements/benchmark/terraform/outputs.tf b/contrib/pg_stat_statements/benchmark/terraform/outputs.tf new file mode 100644 index 00000000000..b34ed158726 --- /dev/null +++ b/contrib/pg_stat_statements/benchmark/terraform/outputs.tf @@ -0,0 +1,26 @@ +output "instance_id" { + value = aws_instance.bench.id +} + +output "public_ip" { + value = aws_instance.bench.public_ip +} + +output "ssh_key_file" { + description = "Path to the generated private key" + value = local_file.ssh_key.filename +} + +output "ssh_command" { + value = "ssh -i ${abspath(local_file.ssh_key.filename)} ec2-user@${aws_instance.bench.public_ip}" +} + +output "rsync_command" { + description = "Run this from the postgres repo root to push the source tree" + value = "rsync -avz -e 'ssh -i ${abspath(local_file.ssh_key.filename)}' --exclude='.git' --exclude='*.o' --exclude='*.dylib' . 
ec2-user@${aws_instance.bench.public_ip}:~/postgres/" +} + +output "benchmark_command" { + description = "Run this after SSH'ing in to start the benchmark" + value = "cd ~/postgres && bash contrib/pg_stat_statements/benchmark/setup.sh && bash contrib/pg_stat_statements/benchmark/run_benchmark.sh" +} diff --git a/contrib/pg_stat_statements/benchmark/terraform/variables.tf b/contrib/pg_stat_statements/benchmark/terraform/variables.tf new file mode 100644 index 00000000000..f6b0720847c --- /dev/null +++ b/contrib/pg_stat_statements/benchmark/terraform/variables.tf @@ -0,0 +1,17 @@ +variable "instance_type" { + description = "EC2 instance type" + type = string + default = "c5.2xlarge" +} + +variable "region" { + description = "AWS region" + type = string + default = "us-east-1" +} + +variable "allowed_ssh_cidr" { + description = "CIDR block allowed to SSH (e.g. your IP). Use 'auto' to detect." + type = string + default = "auto" +} diff --git a/contrib/pg_stat_statements/expected/cpu_time.out b/contrib/pg_stat_statements/expected/cpu_time.out new file mode 100644 index 00000000000..250ccacd323 --- /dev/null +++ b/contrib/pg_stat_statements/expected/cpu_time.out @@ -0,0 +1,62 @@ +-- +-- Tests for CPU time tracking +-- +SET pg_stat_statements.track_utility = FALSE; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +-- Run a CPU-intensive query +SELECT count(*) FROM generate_series(1, 100000); + count +-------- + 100000 +(1 row) + +-- Verify CPU time columns are populated and sensible +SELECT query, + total_exec_cpu_time > 0 AS has_cpu_time, + total_exec_cpu_time <= total_exec_time AS cpu_le_wall + FROM pg_stat_statements + WHERE query LIKE '%generate_series%' + ORDER BY query COLLATE "C"; + query | has_cpu_time | cpu_le_wall +--------------------------------------------------+--------------+------------- + SELECT count(*) FROM generate_series($1, $2) | t | t +(1 row) + +-- Test planning CPU time +SET pg_stat_statements.track_planning = TRUE; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + +SELECT count(*) FROM generate_series(1, 100000); + count +-------- + 100000 +(1 row) + +SELECT query, + total_exec_cpu_time >= 0 AS has_exec_cpu, + total_plan_cpu_time >= 0 AS has_plan_cpu + FROM pg_stat_statements + WHERE query LIKE '%generate_series%' + ORDER BY query COLLATE "C"; + query | has_exec_cpu | has_plan_cpu +--------------------------------------------------+--------------+-------------- + SELECT count(*) FROM generate_series($1, $2) | t | t +(1 row) + +-- Cleanup +SET pg_stat_statements.track_planning = FALSE; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + t +--- + t +(1 row) + diff --git a/contrib/pg_stat_statements/expected/oldextversions.out b/contrib/pg_stat_statements/expected/oldextversions.out index 726383a99d7..e250008b270 100644 --- a/contrib/pg_stat_statements/expected/oldextversions.out +++ b/contrib/pg_stat_statements/expected/oldextversions.out @@ -385,7 +385,6 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.12'; wal_records | bigint | | | wal_fpi | bigint | | | wal_bytes | numeric | | | - wal_buffers_full | bigint | | | jit_functions | bigint | | | jit_generation_time | double precision | | | jit_inlining_count | bigint | | | @@ -402,7 +401,7 @@ AlTER EXTENSION pg_stat_statements UPDATE TO '1.12'; minmax_stats_since | timestamp with time zone | | | SELECT count(*) > 0 AS has_data FROM pg_stat_statements; - has_data + has_data ---------- t (1 row) @@ -411,65 +410,64 @@ SELECT count(*) > 0 AS has_data FROM 
pg_stat_statements; AlTER EXTENSION pg_stat_statements UPDATE TO '1.13'; \d pg_stat_statements View "public.pg_stat_statements" - Column | Type | Collation | Nullable | Default + Column | Type | Collation | Nullable | Default ----------------------------+--------------------------+-----------+----------+--------- - userid | oid | | | - dbid | oid | | | - toplevel | boolean | | | - queryid | bigint | | | - query | text | | | - plans | bigint | | | - total_plan_time | double precision | | | - min_plan_time | double precision | | | - max_plan_time | double precision | | | - mean_plan_time | double precision | | | - stddev_plan_time | double precision | | | - calls | bigint | | | - total_exec_time | double precision | | | - min_exec_time | double precision | | | - max_exec_time | double precision | | | - mean_exec_time | double precision | | | - stddev_exec_time | double precision | | | - rows | bigint | | | - shared_blks_hit | bigint | | | - shared_blks_read | bigint | | | - shared_blks_dirtied | bigint | | | - shared_blks_written | bigint | | | - local_blks_hit | bigint | | | - local_blks_read | bigint | | | - local_blks_dirtied | bigint | | | - local_blks_written | bigint | | | - temp_blks_read | bigint | | | - temp_blks_written | bigint | | | - shared_blk_read_time | double precision | | | - shared_blk_write_time | double precision | | | - local_blk_read_time | double precision | | | - local_blk_write_time | double precision | | | - temp_blk_read_time | double precision | | | - temp_blk_write_time | double precision | | | - wal_records | bigint | | | - wal_fpi | bigint | | | - wal_bytes | numeric | | | - wal_buffers_full | bigint | | | - jit_functions | bigint | | | - jit_generation_time | double precision | | | - jit_inlining_count | bigint | | | - jit_inlining_time | double precision | | | - jit_optimization_count | bigint | | | - jit_optimization_time | double precision | | | - jit_emission_count | bigint | | | - jit_emission_time | double precision | | | - jit_deform_count | bigint | | | - jit_deform_time | double precision | | | - parallel_workers_to_launch | bigint | | | - parallel_workers_launched | bigint | | | - generic_plan_calls | bigint | | | - custom_plan_calls | bigint | | | - stats_since | timestamp with time zone | | | - minmax_stats_since | timestamp with time zone | | | + userid | oid | | | + dbid | oid | | | + toplevel | boolean | | | + queryid | bigint | | | + query | text | | | + plans | bigint | | | + total_plan_time | double precision | | | + min_plan_time | double precision | | | + max_plan_time | double precision | | | + mean_plan_time | double precision | | | + stddev_plan_time | double precision | | | + calls | bigint | | | + total_exec_time | double precision | | | + min_exec_time | double precision | | | + max_exec_time | double precision | | | + mean_exec_time | double precision | | | + stddev_exec_time | double precision | | | + rows | bigint | | | + shared_blks_hit | bigint | | | + shared_blks_read | bigint | | | + shared_blks_dirtied | bigint | | | + shared_blks_written | bigint | | | + local_blks_hit | bigint | | | + local_blks_read | bigint | | | + local_blks_dirtied | bigint | | | + local_blks_written | bigint | | | + temp_blks_read | bigint | | | + temp_blks_written | bigint | | | + shared_blk_read_time | double precision | | | + shared_blk_write_time | double precision | | | + local_blk_read_time | double precision | | | + local_blk_write_time | double precision | | | + temp_blk_read_time | double precision | | | + temp_blk_write_time | double 
precision | | | + wal_records | bigint | | | + wal_fpi | bigint | | | + wal_bytes | numeric | | | + jit_functions | bigint | | | + jit_generation_time | double precision | | | + jit_inlining_count | bigint | | | + jit_inlining_time | double precision | | | + jit_optimization_count | bigint | | | + jit_optimization_time | double precision | | | + jit_emission_count | bigint | | | + jit_emission_time | double precision | | | + jit_deform_count | bigint | | | + jit_deform_time | double precision | | | + parallel_workers_to_launch | bigint | | | + parallel_workers_launched | bigint | | | + total_plan_cpu_time | double precision | | | + total_exec_cpu_time | double precision | | | + stats_since | timestamp with time zone | | | + minmax_stats_since | timestamp with time zone | | | SELECT count(*) > 0 AS has_data FROM pg_stat_statements; - has_data + has_data ---------- t (1 row) diff --git a/contrib/pg_stat_statements/meson.build b/contrib/pg_stat_statements/meson.build index 9d78cb88b7d..4ae3ead3593 100644 --- a/contrib/pg_stat_statements/meson.build +++ b/contrib/pg_stat_statements/meson.build @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2026, PostgreSQL Global Development Group +# Copyright (c) 2022-2024, PostgreSQL Global Development Group pg_stat_statements_sources = files( 'pg_stat_statements.c', @@ -37,7 +37,6 @@ install_data( kwargs: contrib_data_args, ) -# Note: Test "cleanup" is kept second to last, removing the extension. tests += { 'name': 'pg_stat_statements', 'sd': meson.current_source_dir(), @@ -56,8 +55,7 @@ tests += { 'privileges', 'extended', 'parallel', - 'plancache', - 'squashing', + 'cpu_time', 'cleanup', 'oldextversions', ], diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql index 2f0eaf14ec3..13286f4f5b0 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.12--1.13.sql @@ -50,7 +50,6 @@ CREATE FUNCTION pg_stat_statements(IN showtext boolean, OUT wal_records int8, OUT wal_fpi int8, OUT wal_bytes numeric, - OUT wal_buffers_full int8, OUT jit_functions int8, OUT jit_generation_time float8, OUT jit_inlining_count int8, @@ -63,8 +62,8 @@ CREATE FUNCTION pg_stat_statements(IN showtext boolean, OUT jit_deform_time float8, OUT parallel_workers_to_launch int8, OUT parallel_workers_launched int8, - OUT generic_plan_calls int8, - OUT custom_plan_calls int8, + OUT total_plan_cpu_time float8, + OUT total_exec_cpu_time float8, OUT stats_since timestamp with time zone, OUT minmax_stats_since timestamp with time zone ) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 4a427533bd8..728922c697e 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -34,7 +34,7 @@ * in the file to be read or written while holding only shared lock. 
* * - * Copyright (c) 2008-2026, PostgreSQL Global Development Group + * Copyright (c) 2008-2024, PostgreSQL Global Development Group * * IDENTIFICATION * contrib/pg_stat_statements/pg_stat_statements.c @@ -47,7 +47,6 @@ #include #include -#include "access/htup_details.h" #include "access/parallel.h" #include "catalog/pg_authid.h" #include "common/int.h" @@ -72,10 +71,7 @@ #include "utils/memutils.h" #include "utils/timestamp.h" -PG_MODULE_MAGIC_EXT( - .name = "pg_stat_statements", - .version = PG_VERSION -); +PG_MODULE_MAGIC; /* Location of permanent stats file (valid when database is shut down) */ #define PGSS_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat" @@ -86,7 +82,7 @@ PG_MODULE_MAGIC_EXT( #define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat" /* Magic number identifying the stats file format */ -static const uint32 PGSS_FILE_HEADER = 0x20250731; +static const uint32 PGSS_FILE_HEADER = 0x20220408; /* PostgreSQL major version number, changes in which invalidate all entries */ static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100; @@ -140,12 +136,13 @@ typedef enum pgssStoreKind * If you add a new key to this struct, make sure to teach pgss_store() to * zero the padding bytes. Otherwise, things will break, because pgss_hash is * created using HASH_BLOBS, and thus tag_hash is used to hash this. + */ typedef struct pgssHashKey { Oid userid; /* user OID */ Oid dbid; /* database OID */ - int64 queryid; /* query identifier */ + uint64 queryid; /* query identifier */ bool toplevel; /* query executed at top level */ } pgssHashKey; @@ -191,7 +188,6 @@ typedef struct Counters int64 wal_records; /* # of WAL records generated */ int64 wal_fpi; /* # of WAL full page images generated */ uint64 wal_bytes; /* total amount of WAL generated in bytes */ - int64 wal_buffers_full; /* # of times the WAL buffers became full */ int64 jit_functions; /* total number of JIT functions emitted */ double jit_generation_time; /* total time to generate jit code */ int64 jit_inlining_count; /* number of times inlining time has been @@ -211,8 +207,7 @@ typedef struct Counters * to be launched */ int64 parallel_workers_launched; /* # of parallel workers actually * launched */ - int64 generic_plan_calls; /* number of calls using a generic plan */ - int64 custom_plan_calls; /* number of calls using a custom plan */ + double cpu_time[PGSS_NUMKIND]; /* total CPU (user+sys) time, in msec */ } Counters; /* @@ -263,7 +258,7 @@ typedef struct pgssSharedState /* Current nesting depth of planner/ExecutorRun/ProcessUtility calls */ static int nesting_level = 0; -/* Saved hook values */ +/* Saved hook values in case of unload */ static shmem_request_hook_type prev_shmem_request_hook = NULL; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL; @@ -302,6 +297,7 @@ static bool pgss_track_planning = false; /* whether to track planning * duration */ static bool pgss_save = true; /* whether to save stats across shutdown */ + #define pgss_enabled(level) \ (!IsParallelWorker() && \ (pgss_track == PGSS_TRACK_ALL || \ @@ -338,12 +334,11 @@ static void pgss_post_parse_analyze(ParseState *pstate, Query *query, static PlannedStmt *pgss_planner(Query *parse, const char *query_string, int cursorOptions, - ParamListInfo boundParams, - ExplainState *es); + ParamListInfo boundParams); static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags); static void pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, 
- uint64 count); + uint64 count, bool execute_once); static void pgss_ExecutorFinish(QueryDesc *queryDesc); static void pgss_ExecutorEnd(QueryDesc *queryDesc); static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, @@ -351,17 +346,17 @@ static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, ProcessUtilityContext context, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, QueryCompletion *qc); -static void pgss_store(const char *query, int64 queryId, +static void pgss_store(const char *query, uint64 queryId, int query_location, int query_len, pgssStoreKind kind, - double total_time, uint64 rows, + double total_time, double cpu_time, + uint64 rows, const BufferUsage *bufusage, const WalUsage *walusage, const struct JitInstrumentation *jitusage, JumbleState *jstate, int parallel_workers_to_launch, - int parallel_workers_launched, - PlannedStmtOrigin planOrigin); + int parallel_workers_launched); static void pg_stat_statements_internal(FunctionCallInfo fcinfo, pgssVersion api_version, bool showtext); @@ -376,7 +371,25 @@ static char *qtext_fetch(Size query_offset, int query_len, char *buffer, Size buffer_size); static bool need_gc_qtexts(void); static void gc_qtexts(void); -static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only); + +/* + * Compute total CPU time (user + system) in milliseconds from two + * getrusage snapshots. Returns the difference ru1 - ru0. + */ +static double +pgss_cpu_time_diff_ms(const struct rusage *ru1, const struct rusage *ru0) +{ + double user_ms, + sys_ms; + + user_ms = (double) (ru1->ru_utime.tv_sec - ru0->ru_utime.tv_sec) * 1000.0 + + (double) (ru1->ru_utime.tv_usec - ru0->ru_utime.tv_usec) / 1000.0; + sys_ms = (double) (ru1->ru_stime.tv_sec - ru0->ru_stime.tv_sec) * 1000.0 + + (double) (ru1->ru_stime.tv_usec - ru0->ru_stime.tv_usec) / 1000.0; + + return user_ms + sys_ms; +} +static TimestampTz entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only); static char *generate_normalized_query(JumbleState *jstate, const char *query, int query_loc, int *query_len_p); static void fill_in_constant_lengths(JumbleState *jstate, const char *query, @@ -858,7 +871,7 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) { if (pgss_track_utility && IsA(query->utilityStmt, ExecuteStmt)) { - query->queryId = INT64CONST(0); + query->queryId = UINT64CONST(0); return; } } @@ -876,15 +889,14 @@ pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate) query->stmt_location, query->stmt_len, PGSS_INVALID, - 0, - 0, + 0, 0, /* total_time, cpu_time */ + 0, /* rows */ NULL, NULL, NULL, jstate, 0, - 0, - PLAN_STMT_UNKNOWN); + 0); } /* @@ -895,8 +907,7 @@ static PlannedStmt * pgss_planner(Query *parse, const char *query_string, int cursorOptions, - ParamListInfo boundParams, - ExplainState *es) + ParamListInfo boundParams) { PlannedStmt *result; @@ -907,7 +918,7 @@ pgss_planner(Query *parse, */ if (pgss_enabled(nesting_level) && pgss_track_planning && query_string - && parse->queryId != INT64CONST(0)) + && parse->queryId != UINT64CONST(0)) { instr_time start; instr_time duration; @@ -915,6 +926,8 @@ pgss_planner(Query *parse, bufusage; WalUsage walusage_start, walusage; + struct rusage rusage_start, + rusage_end; /* We need to track buffer usage as the planner can access them. */ bufusage_start = pgBufferUsage; @@ -924,6 +937,7 @@ pgss_planner(Query *parse, * (e.g. 
setting a hint bit with those being WAL-logged) */ walusage_start = pgWalUsage; + getrusage(RUSAGE_SELF, &rusage_start); INSTR_TIME_SET_CURRENT(start); nesting_level++; @@ -931,10 +945,10 @@ pgss_planner(Query *parse, { if (prev_planner_hook) result = prev_planner_hook(parse, query_string, cursorOptions, - boundParams, es); + boundParams); else result = standard_planner(parse, query_string, cursorOptions, - boundParams, es); + boundParams); } PG_FINALLY(); { @@ -944,6 +958,7 @@ pgss_planner(Query *parse, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); + getrusage(RUSAGE_SELF, &rusage_end); /* calc differences of buffer counters. */ memset(&bufusage, 0, sizeof(BufferUsage)); @@ -959,14 +974,14 @@ pgss_planner(Query *parse, parse->stmt_len, PGSS_PLAN, INSTR_TIME_GET_MILLISEC(duration), + pgss_cpu_time_diff_ms(&rusage_end, &rusage_start), 0, &bufusage, &walusage, NULL, NULL, 0, - 0, - result->planOrigin); + 0); } else { @@ -980,10 +995,10 @@ pgss_planner(Query *parse, { if (prev_planner_hook) result = prev_planner_hook(parse, query_string, cursorOptions, - boundParams, es); + boundParams); else result = standard_planner(parse, query_string, cursorOptions, - boundParams, es); + boundParams); } PG_FINALLY(); { @@ -1011,7 +1026,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) * counting of optimizable statements that are directly contained in * utility statements. */ - if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != INT64CONST(0)) + if (pgss_enabled(nesting_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0)) { /* * Set up to track total elapsed time in ExecutorRun. Make sure the @@ -1033,15 +1048,16 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) * ExecutorRun hook: all we need do is track nesting depth */ static void -pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count) +pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count, + bool execute_once) { nesting_level++; PG_TRY(); { if (prev_ExecutorRun) - prev_ExecutorRun(queryDesc, direction, count); + prev_ExecutorRun(queryDesc, direction, count, execute_once); else - standard_ExecutorRun(queryDesc, direction, count); + standard_ExecutorRun(queryDesc, direction, count, execute_once); } PG_FINALLY(); { @@ -1077,9 +1093,9 @@ pgss_ExecutorFinish(QueryDesc *queryDesc) static void pgss_ExecutorEnd(QueryDesc *queryDesc) { - int64 queryId = queryDesc->plannedstmt->queryId; + uint64 queryId = queryDesc->plannedstmt->queryId; - if (queryId != INT64CONST(0) && queryDesc->totaltime && + if (queryId != UINT64CONST(0) && queryDesc->totaltime && pgss_enabled(nesting_level)) { /* @@ -1093,15 +1109,15 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) queryDesc->plannedstmt->stmt_location, queryDesc->plannedstmt->stmt_len, PGSS_EXEC, - INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total), + queryDesc->totaltime->total * 1000.0, /* convert to msec */ + queryDesc->totaltime->cpu_time * 1000.0, /* convert to msec */ queryDesc->estate->es_total_processed, &queryDesc->totaltime->bufusage, &queryDesc->totaltime->walusage, queryDesc->estate->es_jit ? 
&queryDesc->estate->es_jit->instr : NULL, NULL, queryDesc->estate->es_parallel_workers_to_launch, - queryDesc->estate->es_parallel_workers_launched, - queryDesc->plannedstmt->planOrigin); + queryDesc->estate->es_parallel_workers_launched); } if (prev_ExecutorEnd) @@ -1121,7 +1137,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, DestReceiver *dest, QueryCompletion *qc) { Node *parsetree = pstmt->utilityStmt; - int64 saved_queryId = pstmt->queryId; + uint64 saved_queryId = pstmt->queryId; int saved_stmt_location = pstmt->stmt_location; int saved_stmt_len = pstmt->stmt_len; bool enabled = pgss_track_utility && pgss_enabled(nesting_level); @@ -1141,7 +1157,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * only. */ if (enabled) - pstmt->queryId = INT64CONST(0); + pstmt->queryId = UINT64CONST(0); /* * If it's an EXECUTE statement, we don't track it and don't increment the @@ -1167,9 +1183,12 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, bufusage; WalUsage walusage_start, walusage; + struct rusage rusage_start, + rusage_end; bufusage_start = pgBufferUsage; walusage_start = pgWalUsage; + getrusage(RUSAGE_SELF, &rusage_start); INSTR_TIME_SET_CURRENT(start); nesting_level++; @@ -1202,6 +1221,7 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); + getrusage(RUSAGE_SELF, &rusage_end); /* * Track the total number of rows retrieved or affected by the utility @@ -1228,14 +1248,14 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, saved_stmt_len, PGSS_EXEC, INSTR_TIME_GET_MILLISEC(duration), + pgss_cpu_time_diff_ms(&rusage_end, &rusage_start), rows, &bufusage, &walusage, NULL, NULL, 0, - 0, - pstmt->planOrigin); + 0); } else { @@ -1289,17 +1309,17 @@ pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, * for the arrays in the Counters field. */ static void -pgss_store(const char *query, int64 queryId, +pgss_store(const char *query, uint64 queryId, int query_location, int query_len, pgssStoreKind kind, - double total_time, uint64 rows, + double total_time, double cpu_time, + uint64 rows, const BufferUsage *bufusage, const WalUsage *walusage, const struct JitInstrumentation *jitusage, JumbleState *jstate, int parallel_workers_to_launch, - int parallel_workers_launched, - PlannedStmtOrigin planOrigin) + int parallel_workers_launched) { pgssHashKey key; pgssEntry *entry; @@ -1316,7 +1336,7 @@ pgss_store(const char *query, int64 queryId, * Nothing to do if compute_query_id isn't enabled and no other module * computed a query identifier. 
*/ - if (queryId == INT64CONST(0)) + if (queryId == UINT64CONST(0)) return; /* @@ -1421,6 +1441,7 @@ pgss_store(const char *query, int64 queryId, entry->counters.calls[kind] += 1; entry->counters.total_time[kind] += total_time; + entry->counters.cpu_time[kind] += cpu_time; if (entry->counters.calls[kind] == 1) { @@ -1480,7 +1501,6 @@ pgss_store(const char *query, int64 queryId, entry->counters.wal_records += walusage->wal_records; entry->counters.wal_fpi += walusage->wal_fpi; entry->counters.wal_bytes += walusage->wal_bytes; - entry->counters.wal_buffers_full += walusage->wal_buffers_full; if (jitusage) { entry->counters.jit_functions += jitusage->created_functions; @@ -1507,12 +1527,6 @@ pgss_store(const char *query, int64 queryId, entry->counters.parallel_workers_to_launch += parallel_workers_to_launch; entry->counters.parallel_workers_launched += parallel_workers_launched; - /* plan cache counters */ - if (planOrigin == PLAN_STMT_CACHE_GENERIC) - entry->counters.generic_plan_calls++; - else if (planOrigin == PLAN_STMT_CACHE_CUSTOM) - entry->counters.custom_plan_calls++; - SpinLockRelease(&entry->mutex); } @@ -1532,11 +1546,11 @@ pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS) { Oid userid; Oid dbid; - int64 queryid; + uint64 queryid; userid = PG_GETARG_OID(0); dbid = PG_GETARG_OID(1); - queryid = PG_GETARG_INT64(2); + queryid = (uint64) PG_GETARG_INT64(2); entry_reset(userid, dbid, queryid, false); @@ -1548,12 +1562,12 @@ pg_stat_statements_reset_1_11(PG_FUNCTION_ARGS) { Oid userid; Oid dbid; - int64 queryid; + uint64 queryid; bool minmax_only; userid = PG_GETARG_OID(0); dbid = PG_GETARG_OID(1); - queryid = PG_GETARG_INT64(2); + queryid = (uint64) PG_GETARG_INT64(2); minmax_only = PG_GETARG_BOOL(3); PG_RETURN_TIMESTAMPTZ(entry_reset(userid, dbid, queryid, minmax_only)); @@ -1579,9 +1593,9 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) #define PG_STAT_STATEMENTS_COLS_V1_9 33 #define PG_STAT_STATEMENTS_COLS_V1_10 43 #define PG_STAT_STATEMENTS_COLS_V1_11 49 -#define PG_STAT_STATEMENTS_COLS_V1_12 52 -#define PG_STAT_STATEMENTS_COLS_V1_13 54 -#define PG_STAT_STATEMENTS_COLS 54 /* maximum of above */ +#define PG_STAT_STATEMENTS_COLS_V1_12 51 +#define PG_STAT_STATEMENTS_COLS_V1_13 53 +#define PG_STAT_STATEMENTS_COLS 53 /* maximum of above */ /* * Retrieve statement statistics. @@ -1992,10 +2006,6 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, Int32GetDatum(-1)); values[i++] = wal_bytes; } - if (api_version >= PGSS_V1_12) - { - values[i++] = Int64GetDatumFast(tmp.wal_buffers_full); - } if (api_version >= PGSS_V1_10) { values[i++] = Int64GetDatumFast(tmp.jit_functions); @@ -2019,8 +2029,8 @@ pg_stat_statements_internal(FunctionCallInfo fcinfo, } if (api_version >= PGSS_V1_13) { - values[i++] = Int64GetDatumFast(tmp.generic_plan_calls); - values[i++] = Int64GetDatumFast(tmp.custom_plan_calls); + values[i++] = Float8GetDatumFast(tmp.cpu_time[PGSS_PLAN]); + values[i++] = Float8GetDatumFast(tmp.cpu_time[PGSS_EXEC]); } if (api_version >= PGSS_V1_11) { @@ -2710,13 +2720,13 @@ if (e) { \ * Reset entries corresponding to parameters passed. 
*/ static TimestampTz -entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only) +entry_reset(Oid userid, Oid dbid, uint64 queryid, bool minmax_only) { HASH_SEQ_STATUS hash_seq; pgssEntry *entry; FILE *qfile; - int64 num_entries; - int64 num_remove = 0; + long num_entries; + long num_remove = 0; pgssHashKey key; TimestampTz stats_reset; @@ -2730,7 +2740,7 @@ entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only) stats_reset = GetCurrentTimestamp(); - if (userid != 0 && dbid != 0 && queryid != INT64CONST(0)) + if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0)) { /* If all the parameters are available, use the fast path. */ memset(&key, 0, sizeof(pgssHashKey)); @@ -2753,7 +2763,7 @@ entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only) SINGLE_ENTRY_RESET(entry); } - else if (userid != 0 || dbid != 0 || queryid != INT64CONST(0)) + else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0)) { /* Reset entries corresponding to valid parameters. */ hash_seq_init(&hash_seq, pgss_hash); @@ -2849,13 +2859,13 @@ generate_normalized_query(JumbleState *jstate, const char *query, { char *norm_query; int query_len = *query_len_p; - int norm_query_buflen, /* Space allowed for norm_query */ + int i, + norm_query_buflen, /* Space allowed for norm_query */ len_to_wrt, /* Length (in bytes) to write */ quer_loc = 0, /* Source query byte location */ n_quer_loc = 0, /* Normalized query byte location */ last_off = 0, /* Offset from start for previous tok */ last_tok_len = 0; /* Length (in bytes) of that tok */ - int num_constants_replaced = 0; /* * Get constants' lengths (core system only gives us locations). Note @@ -2875,23 +2885,12 @@ generate_normalized_query(JumbleState *jstate, const char *query, /* Allocate result buffer */ norm_query = palloc(norm_query_buflen + 1); - for (int i = 0; i < jstate->clocations_count; i++) + for (i = 0; i < jstate->clocations_count; i++) { int off, /* Offset from start for cur tok */ tok_len; /* Length (in bytes) of that tok */ - /* - * If we have an external param at this location, but no lists are - * being squashed across the query, then we skip here; this will make - * us print the characters found in the original query that represent - * the parameter in the next iteration (or after the loop is done), - * which is a bit odd but seems to work okay in most cases. - */ - if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) - continue; - off = jstate->clocations[i].location; - /* Adjust recorded location if we're dealing with partial string */ off -= query_loc; @@ -2903,21 +2902,15 @@ generate_normalized_query(JumbleState *jstate, const char *query, /* Copy next chunk (what precedes the next constant) */ len_to_wrt = off - last_off; len_to_wrt -= last_tok_len; + Assert(len_to_wrt >= 0); memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); n_quer_loc += len_to_wrt; - /* - * And insert a param symbol in place of the constant token; and, if - * we have a squashable list, insert a placeholder comment starting - * from the list's second value. - */ - n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", - num_constants_replaced + 1 + jstate->highest_extern_param_id, - jstate->clocations[i].squashed ? " /*, ... 
*/" : ""); - num_constants_replaced++; + /* And insert a param symbol in place of the constant token */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d", + i + 1 + jstate->highest_extern_param_id); - /* move forward */ quer_loc = off + tok_len; last_off = off; last_tok_len = tok_len; @@ -2954,8 +2947,9 @@ generate_normalized_query(JumbleState *jstate, const char *query, * have originated from within the authoritative parser, this should not be * a problem. * - * Multiple constants can have the same location. We reset lengths of those - * past the first to -1 so that they can later be ignored. + * Duplicate constant pointers are possible, and will have their lengths + * marked as '-1', so that they are later ignored. (Actually, we assume the + * lengths were initialized as -1 to start with, and don't change them here.) * * If query_loc > 0, then "query" has been advanced by that much compared to * the original string start, so we need to translate the provided locations @@ -2975,6 +2969,8 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query, core_yy_extra_type yyextra; core_YYSTYPE yylval; YYLTYPE yylloc; + int last_loc = -1; + int i; /* * Sort the records by location so that we can process them in order while @@ -2991,30 +2987,24 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query, &ScanKeywords, ScanKeywordTokens); + /* we don't want to re-emit any escape string warnings */ + yyextra.escape_string_warning = false; + /* Search for each constant, in sequence */ - for (int i = 0; i < jstate->clocations_count; i++) + for (i = 0; i < jstate->clocations_count; i++) { - int loc; + int loc = locs[i].location; int tok; - /* Ignore constants after the first one in the same location */ - if (i > 0 && locs[i].location == locs[i - 1].location) - { - locs[i].length = -1; - continue; - } - - if (locs[i].squashed) - continue; /* squashable list, ignore */ - /* Adjust recorded location if we're dealing with partial string */ - loc = locs[i].location - query_loc; + loc -= query_loc; + Assert(loc >= 0); - /* - * We have a valid location for a constant that's not a dupe. Lex - * tokens until we find the desired constant. 
- */ + if (loc <= last_loc) + continue; /* Duplicate constant, ignore */ + + /* Lex tokens until we find the desired constant */ for (;;) { tok = core_yylex(&yylval, &yylloc, yyscanner); @@ -3060,6 +3050,8 @@ fill_in_constant_lengths(JumbleState *jstate, const char *query, /* If we hit end-of-string, give up, leaving remaining lengths -1 */ if (tok == 0) break; + + last_loc = loc; } scanner_finish(yyscanner); diff --git a/contrib/pg_stat_statements/sql/cpu_time.sql b/contrib/pg_stat_statements/sql/cpu_time.sql new file mode 100644 index 00000000000..d5f3a51af6c --- /dev/null +++ b/contrib/pg_stat_statements/sql/cpu_time.sql @@ -0,0 +1,33 @@ +-- +-- Tests for CPU time tracking +-- +SET pg_stat_statements.track_utility = FALSE; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +-- Run a CPU-intensive query +SELECT count(*) FROM generate_series(1, 100000); + +-- Verify CPU time columns are populated and sensible +SELECT query, + total_exec_cpu_time > 0 AS has_cpu_time, + total_exec_cpu_time <= total_exec_time AS cpu_le_wall + FROM pg_stat_statements + WHERE query LIKE '%generate_series%' + ORDER BY query COLLATE "C"; + +-- Test planning CPU time +SET pg_stat_statements.track_planning = TRUE; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; + +SELECT count(*) FROM generate_series(1, 100000); + +SELECT query, + total_exec_cpu_time >= 0 AS has_exec_cpu, + total_plan_cpu_time >= 0 AS has_plan_cpu + FROM pg_stat_statements + WHERE query LIKE '%generate_series%' + ORDER BY query COLLATE "C"; + +-- Cleanup +SET pg_stat_statements.track_planning = FALSE; +SELECT pg_stat_statements_reset() IS NOT NULL AS t; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index edab92a0ebe..24fefb39e8a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -4,7 +4,7 @@ * functions for instrumentation of plan execution * * - * Copyright (c) 2001-2026, PostgreSQL Global Development Group + * Copyright (c) 2001-2024, PostgreSQL Global Development Group * * IDENTIFICATION * src/backend/executor/instrument.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include #include #include "executor/instrument.h" @@ -25,6 +26,23 @@ static WalUsage save_pgWalUsage; static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add); static void WalUsageAdd(WalUsage *dst, WalUsage *add); +/* + * Compute the CPU time delta (user + system) between two rusage snapshots, + * in seconds. 
+ */ +static inline double +rusage_cpu_time_diff(const struct rusage *ru_end, const struct rusage *ru_start) +{ + double user_sec, + sys_sec; + + user_sec = (double) (ru_end->ru_utime.tv_sec - ru_start->ru_utime.tv_sec) + + (double) (ru_end->ru_utime.tv_usec - ru_start->ru_utime.tv_usec) / 1000000.0; + sys_sec = (double) (ru_end->ru_stime.tv_sec - ru_start->ru_stime.tv_sec) + + (double) (ru_end->ru_stime.tv_usec - ru_start->ru_stime.tv_usec) / 1000000.0; + return user_sec + sys_sec; +} + /* Allocate new instrumentation structure(s) */ Instrumentation * @@ -34,11 +52,12 @@ InstrAlloc(int n, int instrument_options, bool async_mode) /* initialize all fields to zeroes, then modify as needed */ instr = palloc0(n * sizeof(Instrumentation)); - if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL)) + if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL | INSTRUMENT_CPUUSAGE)) { bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0; bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0; bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + bool need_cpuusage = (instrument_options & INSTRUMENT_CPUUSAGE) != 0; int i; for (i = 0; i < n; i++) @@ -46,6 +65,7 @@ InstrAlloc(int n, int instrument_options, bool async_mode) instr[i].need_bufusage = need_buffers; instr[i].need_walusage = need_wal; instr[i].need_timer = need_timer; + instr[i].need_cpuusage = need_cpuusage; instr[i].async_mode = async_mode; } } @@ -61,6 +81,7 @@ InstrInit(Instrumentation *instr, int instrument_options) instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0; instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0; instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; + instr->need_cpuusage = (instrument_options & INSTRUMENT_CPUUSAGE) != 0; } /* Entry to a plan node */ @@ -77,6 +98,10 @@ InstrStartNode(Instrumentation *instr) if (instr->need_walusage) instr->walusage_start = pgWalUsage; + + /* save CPU usage at node entry, if needed */ + if (instr->need_cpuusage) + getrusage(RUSAGE_SELF, &instr->rusage_start); } /* Exit from a plan node */ @@ -89,6 +114,22 @@ InstrStopNode(Instrumentation *instr, double nTuples) /* count the returned tuples */ instr->tuplecount += nTuples; + /* + * Capture CPU usage before wall clock so the CPU measurement window is + * strictly contained within the wall clock window. InstrStartNode + * captures rusage after the wall clock start, so: + * [wall_start ... cpu_start ... cpu_end ... wall_end] + * This guarantees cpu_time <= total (wall) time. 
+ */ + if (instr->need_cpuusage) + { + struct rusage rusage_end; + + getrusage(RUSAGE_SELF, &rusage_end); + instr->counter_cpu += rusage_cpu_time_diff(&rusage_end, + &instr->rusage_start); + } + /* let's update the time only if the timer was requested */ if (instr->need_timer) { @@ -114,7 +155,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (!instr->running) { instr->running = true; - instr->firsttuple = instr->counter; + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); } else { @@ -123,7 +164,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) * this might be the first tuple */ if (instr->async_mode && save_tuplecount < 1.0) - instr->firsttuple = instr->counter; + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); } } @@ -139,6 +180,8 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples) void InstrEndLoop(Instrumentation *instr) { + double totaltime; + /* Skip if nothing has happened, or already shut down */ if (!instr->running) return; @@ -147,8 +190,11 @@ InstrEndLoop(Instrumentation *instr) elog(ERROR, "InstrEndLoop called on running node"); /* Accumulate per-cycle statistics into totals */ - INSTR_TIME_ADD(instr->startup, instr->firsttuple); - INSTR_TIME_ADD(instr->total, instr->counter); + totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); + + instr->startup += instr->firsttuple; + instr->total += totaltime; + instr->cpu_time += instr->counter_cpu; instr->ntuples += instr->tuplecount; instr->nloops += 1; @@ -156,7 +202,8 @@ InstrEndLoop(Instrumentation *instr) instr->running = false; INSTR_TIME_SET_ZERO(instr->starttime); INSTR_TIME_SET_ZERO(instr->counter); - INSTR_TIME_SET_ZERO(instr->firsttuple); + instr->counter_cpu = 0; + instr->firsttuple = 0; instr->tuplecount = 0; } @@ -169,15 +216,16 @@ InstrAggNode(Instrumentation *dst, Instrumentation *add) dst->running = true; dst->firsttuple = add->firsttuple; } - else if (dst->running && add->running && - INSTR_TIME_LT(dst->firsttuple, add->firsttuple)) + else if (dst->running && add->running && dst->firsttuple > add->firsttuple) dst->firsttuple = add->firsttuple; INSTR_TIME_ADD(dst->counter, add->counter); dst->tuplecount += add->tuplecount; - INSTR_TIME_ADD(dst->startup, add->startup); - INSTR_TIME_ADD(dst->total, add->total); + dst->counter_cpu += add->counter_cpu; + dst->startup += add->startup; + dst->total += add->total; + dst->cpu_time += add->cpu_time; dst->ntuples += add->ntuples; dst->ntuples2 += add->ntuples2; dst->nloops += add->nloops; @@ -277,8 +325,6 @@ WalUsageAdd(WalUsage *dst, WalUsage *add) dst->wal_bytes += add->wal_bytes; dst->wal_records += add->wal_records; dst->wal_fpi += add->wal_fpi; - dst->wal_fpi_bytes += add->wal_fpi_bytes; - dst->wal_buffers_full += add->wal_buffers_full; } void @@ -287,6 +333,4 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_bytes += add->wal_bytes - sub->wal_bytes; dst->wal_records += add->wal_records - sub->wal_records; dst->wal_fpi += add->wal_fpi - sub->wal_fpi; - dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; - dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 9759f3ea5d8..50854609d70 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -4,7 +4,7 @@ * definitions for run-time statistics collection * * - * Copyright (c) 2001-2026, PostgreSQL Global Development Group + * Copyright (c) 2001-2024, PostgreSQL Global Development Group * * 
 * src/include/executor/instrument.h
 *
@@ -13,6 +13,8 @@
 #ifndef INSTRUMENT_H
 #define INSTRUMENT_H

+#include <sys/resource.h>
+
 #include "portability/instr_time.h"


@@ -53,8 +55,6 @@ typedef struct WalUsage
 	int64		wal_records;	/* # of WAL records produced */
 	int64		wal_fpi;		/* # of WAL full page images produced */
 	uint64		wal_bytes;		/* size of WAL records produced */
-	uint64		wal_fpi_bytes;	/* size of WAL full page images produced */
-	int64		wal_buffers_full;	/* # of times the WAL buffers became full */
 } WalUsage;

 /* Flag bits included in InstrAlloc's instrument_options bitmask */
@@ -64,6 +64,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
+	INSTRUMENT_CPUUSAGE = 1 << 4,	/* needs CPU usage via getrusage() */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;

@@ -73,18 +74,22 @@ typedef struct Instrumentation
 	bool		need_timer;		/* true if we need timer data */
 	bool		need_bufusage;	/* true if we need buffer usage data */
 	bool		need_walusage;	/* true if we need WAL usage data */
+	bool		need_cpuusage;	/* true if we need CPU usage data */
 	bool		async_mode;		/* true if node is in async mode */
 	/* Info about current plan cycle: */
 	bool		running;		/* true if we've completed first tuple */
 	instr_time	starttime;		/* start time of current iteration of node */
 	instr_time	counter;		/* accumulated runtime for this node */
-	instr_time	firsttuple;		/* time for first tuple of this cycle */
+	double		firsttuple;		/* time for first tuple of this cycle */
 	double		tuplecount;		/* # of tuples emitted so far this cycle */
 	BufferUsage	bufusage_start;	/* buffer usage at start */
 	WalUsage	walusage_start;	/* WAL usage at start */
+	struct rusage rusage_start;	/* CPU usage at start */
+	double		counter_cpu;	/* per-cycle CPU time accumulator (seconds) */
 	/* Accumulated statistics across all completed cycles: */
-	instr_time	startup;		/* total startup time */
-	instr_time	total;			/* total time */
+	double		startup;		/* total startup time (in seconds) */
+	double		total;			/* total time (in seconds) */
+	double		cpu_time;		/* total CPU time: user + system (seconds) */
 	double		ntuples;		/* total tuples produced */
 	double		ntuples2;		/* secondary node-specific tuple counter */
 	double		nloops;			/* # of run cycles for this node */
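
With the counters above in place, the per-statement report from run_benchmark.sh reduces to a minimal sketch for spotting statements that spend most of their wall-clock time off-CPU (scheduling delay, I/O waits, lock waits). It only assumes the total_exec_time and total_exec_cpu_time columns added by this patch:

    SELECT left(query, 40) AS query,
           calls,
           round(total_exec_time::numeric, 2) AS wall_ms,
           round(total_exec_cpu_time::numeric, 2) AS cpu_ms,
           round((total_exec_time - total_exec_cpu_time)::numeric, 2) AS off_cpu_ms
      FROM pg_stat_statements
     ORDER BY total_exec_time - total_exec_cpu_time DESC
     LIMIT 10;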
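
The new columns only become visible once the extension's SQL objects are updated to 1.13 via pg_stat_statements--1.12--1.13.sql; the loaded module accumulates CPU time regardless. A minimal upgrade-and-verify sketch, assuming an existing installation at an older extension version:

    ALTER EXTENSION pg_stat_statements UPDATE TO '1.13';
    SELECT total_plan_cpu_time, total_exec_cpu_time
      FROM pg_stat_statements
     LIMIT 1;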
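
Plan CPU time is only collected when pg_stat_statements.track_planning is enabled, mirroring total_plan_time. A quick manual check along the lines of sql/cpu_time.sql (illustrative only; exact numbers will vary):

    SET pg_stat_statements.track_planning = on;
    SELECT pg_stat_statements_reset();
    SELECT count(*) FROM generate_series(1, 100000);
    SELECT query, total_plan_time, total_plan_cpu_time,
           total_exec_time, total_exec_cpu_time
      FROM pg_stat_statements
     WHERE query LIKE '%generate_series%';

Note that the executor path takes its rusage snapshots strictly inside the wall-clock window (see the InstrStopNode comment), so total_exec_cpu_time <= total_exec_time is expected to hold; the planner path samples rusage just outside the timing window, so total_plan_cpu_time may exceed total_plan_time by timer granularity.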
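
The aggregate form used for the CSV summary in run_benchmark.sh is also useful interactively for gauging how CPU-starved a workload is as a whole; the ratio should drop as stress-ng workers are added. A sketch over the same columns:

    SELECT round(sum(total_exec_cpu_time)::numeric, 2) AS cpu_ms,
           round(sum(total_exec_time)::numeric, 2) AS wall_ms,
           round((100 * sum(total_exec_cpu_time)
                  / nullif(sum(total_exec_time), 0))::numeric, 1) AS cpu_pct
      FROM pg_stat_statements;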