From 07009121c235dc1b9338b57b6a85006a5e3b0bd8 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Mon, 6 Apr 2026 02:03:10 +0200 Subject: [PATCH] Test stabilization for online checksums Postcommit review and buildfarm/CI failures revealed a few issues in the test code which this commit attempts to resolve. These failures are verified using synthetic means. * Wait for launcher exit in enable/disable checksum tests When enabling or disabling data checksums in a test with waiting for an end state (on or off), the test typically want to perform more test against the cluster immediately. Make sure to wait for the launcher to exit in these cases before returning in order to know it can immediately be acted on. This is a more generic way of implementating 0036232ba8f. * Refactor injection point tests to use the injection_points test extension. Two injection points added for online checksums were better expressed using the injection_points extension with the test code embedded in datachecksum_state.c. * Make tests less timing dependent and allow transitions to "on" and not just "inprogress-on" in case a test manages to finish before it's checked for state. * When waiting on a blocking background psql keeping a temporary table open, the test first closed the background session abd then the server. This could cause data checksums to manage to get enabled in the brief window between dropping the temporary table and closing the server. Fix by closing the server first before the background session. * Remove a few superfluous duplicate checks and general cleanup of comments as well as making LSN logging consistent. These issues were reported by Andres as well as spotted in the buildfarm and on CI. Author: Daniel Gustafsson Reported-by: Andres Freund Discussion: https://postgr.es/m/92F25C14-801E-4198-994D-D83E31FEB0D8@yesql.se --- src/backend/postmaster/datachecksum_state.c | 26 +++++- .../modules/test_checksums/t/001_basic.pl | 6 +- .../test_checksums/t/003_standby_restarts.pl | 14 +++- .../modules/test_checksums/t/004_offline.pl | 8 +- .../modules/test_checksums/t/005_injection.pl | 12 ++- .../test_checksums/t/006_pgbench_single.pl | 16 +++- .../test_checksums/t/007_pgbench_standby.pl | 19 +++-- src/test/modules/test_checksums/t/008_pitr.pl | 8 +- .../test_checksums/t/DataChecksums/Utils.pm | 24 +++++- .../test_checksums/test_checksums--1.0.sql | 8 -- .../modules/test_checksums/test_checksums.c | 79 ------------------- 11 files changed, 101 insertions(+), 119 deletions(-) diff --git a/src/backend/postmaster/datachecksum_state.c b/src/backend/postmaster/datachecksum_state.c index 430e6e9a3c9..1243949eacb 100644 --- a/src/backend/postmaster/datachecksum_state.c +++ b/src/backend/postmaster/datachecksum_state.c @@ -1216,8 +1216,15 @@ ProcessAllDatabases(void) result = ProcessDatabase(db); +#ifdef USE_INJECTION_POINTS /* Allow a test process to alter the result of the operation */ - INJECTION_POINT("datachecksumsworker-modify-db-result", &result); + if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fail-db-result")) + { + result = DATACHECKSUMSWORKER_FAILED; + INJECTION_POINT_CACHED("datachecksumsworker-fail-db-result", + db->dbname); + } +#endif pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE, ++cumulative_total); @@ -1451,6 +1458,9 @@ DataChecksumsWorkerMain(Datum arg) BufferAccessStrategy strategy; bool aborted = false; int64 rels_done; +#ifdef USE_INJECTION_POINTS + bool retried = false; +#endif operation = ENABLE_DATACHECKSUMS; @@ -1566,7 +1576,19 @@ DataChecksumsWorkerMain(Datum arg) } list_free(CurrentTempTables); - INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft); +#ifdef USE_INJECTION_POINTS + if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fake-temptable-wait")) + { + /* Make sure to just cause one retry */ + if (!retried && numleft == 0) + { + numleft = 1; + retried = true; + + INJECTION_POINT_CACHED("datachecksumsworker-fake-temptable-wait", NULL); + } + } +#endif if (numleft == 0) break; diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl index 5933c730da1..a78118320d5 100644 --- a/src/test/modules/test_checksums/t/001_basic.pl +++ b/src/test/modules/test_checksums/t/001_basic.pl @@ -35,11 +35,7 @@ my $result = is($result, '9999', 'ensure checksummed pages can be read back'); # Enable data checksums again which should be a no-op so we explicitly don't -# wait for any state transition as none should happen here. Make sure to let -# any running launcher finish in case it's still wrapping up. -$result = $node->poll_query_until('postgres', - "SELECT count(*) = 0 FROM pg_catalog.pg_stat_activity WHERE backend_type = 'datachecksum launcher';" -); +# wait for any state transition as none should happen here. enable_data_checksums($node); test_checksum_state($node, 'on'); # ..and make sure we can still read/write data diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl index 6b016925651..7ad11417ca6 100644 --- a/src/test/modules/test_checksums/t/003_standby_restarts.pl +++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl @@ -41,7 +41,7 @@ $node_standby->start; $node_primary->safe_psql('postgres', "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); -# Wait for standbys to catch up +# Wait for standby to catch up $node_primary->wait_for_catchup($node_standby, 'replay', $node_primary->lsn('insert')); @@ -54,8 +54,14 @@ test_checksum_state($node_standby, 'off'); # standby change state. # -# Ensure that the primary switches to "inprogress-on" -enable_data_checksums($node_primary, wait => 'inprogress-on'); +# Initiate enabling of checksums and ensure that the primary switches to +# either "inprogress-on" or "on" +enable_data_checksums($node_primary); +my $result = $node_primary->poll_query_until( + 'postgres', + "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'f'); +is($result, 1, 'ensure primary has transitioned from off'); # Wait for checksum enable to be replayed $node_primary->wait_for_catchup($node_standby, 'replay'); @@ -63,7 +69,7 @@ $node_primary->wait_for_catchup($node_standby, 'replay'); # would be "inprogress-on", but it is theoretically possible for the primary to # complete the checksum enabling *and* have the standby replay that record # before we reach the check below. -my $result = $node_standby->poll_query_until( +$result = $node_standby->poll_query_until( 'postgres', "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", 'f'); diff --git a/src/test/modules/test_checksums/t/004_offline.pl b/src/test/modules/test_checksums/t/004_offline.pl index f1972bddff1..73c279e75e0 100644 --- a/src/test/modules/test_checksums/t/004_offline.pl +++ b/src/test/modules/test_checksums/t/004_offline.pl @@ -62,11 +62,15 @@ $result = $node->safe_psql('postgres', "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); is($result, 't', 'ensure we can see the temporary table'); +# Enable, but stop waiting at inprogress-on since it will sit there until the +# above temporary table is removed. enable_data_checksums($node, wait => 'inprogress-on'); -# Turn the cluster off and enable checksums offline, then start back up +# Turn the cluster off and enable checksums offline, then start back up. +# Stop the cluster before exiting the background session since otherwise +# checksums might have time to get enabled before shutting down the cluster. +$node->stop('fast'); $bsession->quit; -$node->stop; $node->checksum_enable_offline; $node->start; diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl index 897f282a1f2..a37a24dbbad 100644 --- a/src/test/modules/test_checksums/t/005_injection.pl +++ b/src/test/modules/test_checksums/t/005_injection.pl @@ -32,6 +32,7 @@ $node->start; # Set up test environment $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); # --------------------------------------------------------------------------- # Inducing failures and crashes in processing @@ -39,9 +40,12 @@ $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); # Force enabling checksums to fail by marking one of the databases as having # failed in processing. disable_data_checksums($node, wait => 1); -$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);'); +$node->safe_psql('postgres', + "SELECT injection_points_attach('datachecksumsworker-fail-db-result','notice');" +); enable_data_checksums($node, wait => 'off'); -$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);'); +$node->safe_psql('postgres', + "SELECT injection_points_detach('datachecksumsworker-fail-db-result');"); # Make sure that disabling after a failure works disable_data_checksums($node); @@ -66,7 +70,9 @@ SKIP: # will force the processing to wait and retry in order to wait for it to # disappear. disable_data_checksums($node, wait => 1); - $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);'); + $node->safe_psql('postgres', + "SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');" + ); enable_data_checksums($node, wait => 'on'); } diff --git a/src/test/modules/test_checksums/t/006_pgbench_single.pl b/src/test/modules/test_checksums/t/006_pgbench_single.pl index 0ab5b04b931..f5ccc1b5b08 100644 --- a/src/test/modules/test_checksums/t/006_pgbench_single.pl +++ b/src/test/modules/test_checksums/t/006_pgbench_single.pl @@ -89,14 +89,21 @@ sub background_rw_pgbench # before and after state. sub flip_data_checksums { + my $temptablewait = 0; + # First, make sure the cluster is in the state we expect it to be test_checksum_state($node, $data_checksum_state); if ($data_checksum_state eq 'off') { # Coin-toss to see if we are injecting a retry due to a temptable - $node->safe_psql('postgres', 'SELECT dcw_fake_temptable();') - if cointoss(); + if (cointoss()) + { + $node->safe_psql('postgres', + "SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');" + ); + $temptablewait = 1; + } # log LSN right before we start changing checksums my $result = @@ -117,7 +124,9 @@ sub flip_data_checksums random_sleep() if ($extended); - $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);'); + $node->safe_psql('postgres', + "SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');" + ) if ($temptablewait); $data_checksum_state = 'on'; } elsif ($data_checksum_state eq 'on') @@ -165,6 +174,7 @@ log_statement = none ]); $node->start; $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); # Create some content to have un-checksummed data in the cluster $node->safe_psql('postgres', "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); diff --git a/src/test/modules/test_checksums/t/007_pgbench_standby.pl b/src/test/modules/test_checksums/t/007_pgbench_standby.pl index b0d40d24005..f3611e7ce25 100644 --- a/src/test/modules/test_checksums/t/007_pgbench_standby.pl +++ b/src/test/modules/test_checksums/t/007_pgbench_standby.pl @@ -99,6 +99,8 @@ sub background_pgbench # before and after state. sub flip_data_checksums { + my $temptablewait = 0; + # First, make sure the cluster is in the state we expect it to be test_checksum_state($node_primary, $data_checksum_state); test_checksum_state($node_standby, $data_checksum_state); @@ -106,8 +108,13 @@ sub flip_data_checksums if ($data_checksum_state eq 'off') { # Coin-toss to see if we are injecting a retry due to a temptable - $node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();') - if cointoss(); + if (cointoss()) + { + $node_primary->safe_psql('postgres', + "SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');" + ); + $temptablewait = 1; + } # log LSN right before we start changing checksums my $result = @@ -154,7 +161,8 @@ sub flip_data_checksums wait_for_checksum_state($node_standby, 'on'); $node_primary->safe_psql('postgres', - 'SELECT dcw_fake_temptable(false);'); + "SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');" + ) if ($temptablewait); $data_checksum_state = 'on'; } elsif ($data_checksum_state eq 'on') @@ -170,6 +178,7 @@ sub flip_data_checksums $node_primary->wait_for_catchup($node_standby, 'replay'); # Wait for checksums disabled on the primary and standby + random_sleep() if ($extended); wait_for_checksum_state($node_primary, 'off'); wait_for_checksum_state($node_standby, 'off'); @@ -178,9 +187,6 @@ sub flip_data_checksums $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); note("LSN after disabling: " . $result . "\n"); - random_sleep() if ($extended); - wait_for_checksum_state($node_standby, 'off'); - $data_checksum_state = 'off'; } else @@ -207,6 +213,7 @@ log_statement = none ]); $node_primary->start; $node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); # Create some content to have un-checksummed data in the cluster $node_primary->safe_psql('postgres', "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); diff --git a/src/test/modules/test_checksums/t/008_pitr.pl b/src/test/modules/test_checksums/t/008_pitr.pl index b9b89f414ab..e8cb2b0ed96 100644 --- a/src/test/modules/test_checksums/t/008_pitr.pl +++ b/src/test/modules/test_checksums/t/008_pitr.pl @@ -67,15 +67,15 @@ sub flip_data_checksums # log LSN right before we start changing checksums $lsn_pre = $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN before disabling: " . $lsn_pre . "\n"); - disable_data_checksums($node_primary); - - # Wait for checksums disabled on the primary - wait_for_checksum_state($node_primary, 'off'); + # Disable checksums on the primary and wait for completion + disable_data_checksums($node_primary, wait => 1); # log LSN right after the primary flips checksums to "off" $lsn_post = $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); + note("LSN after disabling: " . $lsn_post . "\n"); $data_checksum_state = 'off'; } diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm index 9a2269e8a92..fb704623a60 100644 --- a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm +++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm @@ -175,8 +175,19 @@ EOQ $postgresnode->safe_psql('postgres', sprintf($query, $params{cost_delay}, $params{cost_limit})); - wait_for_checksum_state($postgresnode, $params{wait}) - if (defined($params{wait})); + if (defined($params{wait})) + { + wait_for_checksum_state($postgresnode, $params{wait}); + # If we are tasked with waiting for an end state, also wait for the + # launcher to exit. + if ($params{wait} eq 'on' || $params{wait} eq 'off') + { + $postgresnode->poll_query_until('postgres', + "SELECT count(*) = 0 " + . "FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksum launcher';"); + } + } } =item disable_data_checksums($node, %params) @@ -204,7 +215,14 @@ sub disable_data_checksums $postgresnode->safe_psql('postgres', 'SELECT pg_disable_data_checksums();'); - wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait})); + if (defined($params{wait})) + { + wait_for_checksum_state($postgresnode, 'off'); + $postgresnode->poll_query_until('postgres', + "SELECT count(*) = 0 " + . "FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksum launcher';"); + } } =item cointoss diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql index 90642d247fa..c674fee02bb 100644 --- a/src/test/modules/test_checksums/test_checksums--1.0.sql +++ b/src/test/modules/test_checksums/test_checksums--1.0.sql @@ -14,11 +14,3 @@ CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true) CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true) RETURNS pg_catalog.void AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true) - RETURNS pg_catalog.void - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true) - RETURNS pg_catalog.void - AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c index e2b91f9c78c..c2eabc2821c 100644 --- a/src/test/modules/test_checksums/test_checksums.c +++ b/src/test/modules/test_checksums/test_checksums.c @@ -25,8 +25,6 @@ extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_d extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg); extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg); -extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg); - /* * Test for delaying emission of procsignalbarriers. */ @@ -107,80 +105,3 @@ dcw_inject_startup_delay(PG_FUNCTION_ARGS) #endif PG_RETURN_VOID(); } - -#ifdef USE_INJECTION_POINTS -static uint32 db_fail = DATACHECKSUMSWORKER_FAILED; -#endif - -void -dc_modify_db_result(const char *name, const void *private_data, void *arg) -{ - DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg; - uint32 new_res = *(uint32 *) private_data; - - *res = new_res; -} - -PG_FUNCTION_INFO_V1(dcw_inject_fail_database); -Datum -dcw_inject_fail_database(PG_FUNCTION_ARGS) -{ -#ifdef USE_INJECTION_POINTS - bool attach = PG_GETARG_BOOL(0); - - if (attach) - InjectionPointAttach("datachecksumsworker-modify-db-result", - "test_checksums", - "dc_modify_db_result", - &db_fail, - sizeof(uint32)); - else - InjectionPointDetach("datachecksumsworker-modify-db-result"); -#else - elog(ERROR, - "test is not working as intended when injection points are disabled"); -#endif - PG_RETURN_VOID(); -} - -/* - * Test to force waiting for existing temptables. - */ -void -dc_fake_temptable(const char *name, const void *private_data, void *arg) -{ - static bool first_pass = true; - int *numleft = (int *) arg; - - if (first_pass) - *numleft = 1; - first_pass = false; -} - -PG_FUNCTION_INFO_V1(dcw_fake_temptable); -Datum -dcw_fake_temptable(PG_FUNCTION_ARGS) -{ -#ifdef USE_INJECTION_POINTS - bool attach = PG_GETARG_BOOL(0); - - if (attach) - InjectionPointAttach("datachecksumsworker-fake-temptable-wait", - "test_checksums", - "dc_fake_temptable", - NULL, - 0); - else - InjectionPointDetach("datachecksumsworker-fake-temptable-wait"); -#else - elog(ERROR, - "test is not working as intended when injection points are disabled"); -#endif - PG_RETURN_VOID(); -} - -void -crash(const char *name, const void *private_data, void *arg) -{ - abort(); -}