Test stabilization for online checksums

Postcommit review and buildfarm/CI failures revealed a few issues in
the test code which this commit attempts to resolve.  These failures
are verified using synthetic means.

  * Wait for launcher exit in enable/disable checksum tests

    When enabling or disabling data checksums in a test with waiting
    for an end state (on or off), the test typically want to perform
    more test against the cluster immediately. Make sure to wait for
    the launcher to exit in these cases before returning in order to
    know it can immediately be acted on.  This is a more generic way
    of implementating 0036232ba8.

  * Refactor injection point tests to use the injection_points test
    extension. Two injection points added for online checksums were
    better expressed using the injection_points extension with the
    test code embedded in datachecksum_state.c.

  * Make tests less timing dependent and allow transitions to "on"
    and not just "inprogress-on" in case a test manages to finish
    before it's checked for state.

  * When waiting on a blocking background psql keeping a temporary
    table open, the test first closed the background session abd
    then the server.  This could cause data checksums to manage to
    get enabled in the brief window between dropping the temporary
    table and closing the server.  Fix by closing the server first
    before the background session.

  * Remove a few superfluous duplicate checks and general cleanup
    of comments as well as making LSN logging consistent.

These issues were reported by Andres as well as spotted in the
buildfarm and on CI.

Author: Daniel Gustafsson <daniel@yesql.se>
Reported-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/92F25C14-801E-4198-994D-D83E31FEB0D8@yesql.se
This commit is contained in:
Daniel Gustafsson 2026-04-06 02:03:10 +02:00
parent d771b0a907
commit 07009121c2
11 changed files with 101 additions and 119 deletions

View file

@ -1216,8 +1216,15 @@ ProcessAllDatabases(void)
result = ProcessDatabase(db);
#ifdef USE_INJECTION_POINTS
/* Allow a test process to alter the result of the operation */
INJECTION_POINT("datachecksumsworker-modify-db-result", &result);
if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fail-db-result"))
{
result = DATACHECKSUMSWORKER_FAILED;
INJECTION_POINT_CACHED("datachecksumsworker-fail-db-result",
db->dbname);
}
#endif
pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
++cumulative_total);
@ -1451,6 +1458,9 @@ DataChecksumsWorkerMain(Datum arg)
BufferAccessStrategy strategy;
bool aborted = false;
int64 rels_done;
#ifdef USE_INJECTION_POINTS
bool retried = false;
#endif
operation = ENABLE_DATACHECKSUMS;
@ -1566,7 +1576,19 @@ DataChecksumsWorkerMain(Datum arg)
}
list_free(CurrentTempTables);
INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft);
#ifdef USE_INJECTION_POINTS
if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fake-temptable-wait"))
{
/* Make sure to just cause one retry */
if (!retried && numleft == 0)
{
numleft = 1;
retried = true;
INJECTION_POINT_CACHED("datachecksumsworker-fake-temptable-wait", NULL);
}
}
#endif
if (numleft == 0)
break;

View file

@ -35,11 +35,7 @@ my $result =
is($result, '9999', 'ensure checksummed pages can be read back');
# Enable data checksums again which should be a no-op so we explicitly don't
# wait for any state transition as none should happen here. Make sure to let
# any running launcher finish in case it's still wrapping up.
$result = $node->poll_query_until('postgres',
"SELECT count(*) = 0 FROM pg_catalog.pg_stat_activity WHERE backend_type = 'datachecksum launcher';"
);
# wait for any state transition as none should happen here.
enable_data_checksums($node);
test_checksum_state($node, 'on');
# ..and make sure we can still read/write data

View file

@ -41,7 +41,7 @@ $node_standby->start;
$node_primary->safe_psql('postgres',
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
# Wait for standbys to catch up
# Wait for standby to catch up
$node_primary->wait_for_catchup($node_standby, 'replay',
$node_primary->lsn('insert'));
@ -54,8 +54,14 @@ test_checksum_state($node_standby, 'off');
# standby change state.
#
# Ensure that the primary switches to "inprogress-on"
enable_data_checksums($node_primary, wait => 'inprogress-on');
# Initiate enabling of checksums and ensure that the primary switches to
# either "inprogress-on" or "on"
enable_data_checksums($node_primary);
my $result = $node_primary->poll_query_until(
'postgres',
"SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
'f');
is($result, 1, 'ensure primary has transitioned from off');
# Wait for checksum enable to be replayed
$node_primary->wait_for_catchup($node_standby, 'replay');
@ -63,7 +69,7 @@ $node_primary->wait_for_catchup($node_standby, 'replay');
# would be "inprogress-on", but it is theoretically possible for the primary to
# complete the checksum enabling *and* have the standby replay that record
# before we reach the check below.
my $result = $node_standby->poll_query_until(
$result = $node_standby->poll_query_until(
'postgres',
"SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
'f');

View file

@ -62,11 +62,15 @@ $result = $node->safe_psql('postgres',
"SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';");
is($result, 't', 'ensure we can see the temporary table');
# Enable, but stop waiting at inprogress-on since it will sit there until the
# above temporary table is removed.
enable_data_checksums($node, wait => 'inprogress-on');
# Turn the cluster off and enable checksums offline, then start back up
# Turn the cluster off and enable checksums offline, then start back up.
# Stop the cluster before exiting the background session since otherwise
# checksums might have time to get enabled before shutting down the cluster.
$node->stop('fast');
$bsession->quit;
$node->stop;
$node->checksum_enable_offline;
$node->start;

View file

@ -32,6 +32,7 @@ $node->start;
# Set up test environment
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
# ---------------------------------------------------------------------------
# Inducing failures and crashes in processing
@ -39,9 +40,12 @@ $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
# Force enabling checksums to fail by marking one of the databases as having
# failed in processing.
disable_data_checksums($node, wait => 1);
$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);');
$node->safe_psql('postgres',
"SELECT injection_points_attach('datachecksumsworker-fail-db-result','notice');"
);
enable_data_checksums($node, wait => 'off');
$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);');
$node->safe_psql('postgres',
"SELECT injection_points_detach('datachecksumsworker-fail-db-result');");
# Make sure that disabling after a failure works
disable_data_checksums($node);
@ -66,7 +70,9 @@ SKIP:
# will force the processing to wait and retry in order to wait for it to
# disappear.
disable_data_checksums($node, wait => 1);
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);');
$node->safe_psql('postgres',
"SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
);
enable_data_checksums($node, wait => 'on');
}

View file

@ -89,14 +89,21 @@ sub background_rw_pgbench
# before and after state.
sub flip_data_checksums
{
my $temptablewait = 0;
# First, make sure the cluster is in the state we expect it to be
test_checksum_state($node, $data_checksum_state);
if ($data_checksum_state eq 'off')
{
# Coin-toss to see if we are injecting a retry due to a temptable
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
if cointoss();
if (cointoss())
{
$node->safe_psql('postgres',
"SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
);
$temptablewait = 1;
}
# log LSN right before we start changing checksums
my $result =
@ -117,7 +124,9 @@ sub flip_data_checksums
random_sleep() if ($extended);
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);');
$node->safe_psql('postgres',
"SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');"
) if ($temptablewait);
$data_checksum_state = 'on';
}
elsif ($data_checksum_state eq 'on')
@ -165,6 +174,7 @@ log_statement = none
]);
$node->start;
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
# Create some content to have un-checksummed data in the cluster
$node->safe_psql('postgres',
"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");

View file

@ -99,6 +99,8 @@ sub background_pgbench
# before and after state.
sub flip_data_checksums
{
my $temptablewait = 0;
# First, make sure the cluster is in the state we expect it to be
test_checksum_state($node_primary, $data_checksum_state);
test_checksum_state($node_standby, $data_checksum_state);
@ -106,8 +108,13 @@ sub flip_data_checksums
if ($data_checksum_state eq 'off')
{
# Coin-toss to see if we are injecting a retry due to a temptable
$node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
if cointoss();
if (cointoss())
{
$node_primary->safe_psql('postgres',
"SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
);
$temptablewait = 1;
}
# log LSN right before we start changing checksums
my $result =
@ -154,7 +161,8 @@ sub flip_data_checksums
wait_for_checksum_state($node_standby, 'on');
$node_primary->safe_psql('postgres',
'SELECT dcw_fake_temptable(false);');
"SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');"
) if ($temptablewait);
$data_checksum_state = 'on';
}
elsif ($data_checksum_state eq 'on')
@ -170,6 +178,7 @@ sub flip_data_checksums
$node_primary->wait_for_catchup($node_standby, 'replay');
# Wait for checksums disabled on the primary and standby
random_sleep() if ($extended);
wait_for_checksum_state($node_primary, 'off');
wait_for_checksum_state($node_standby, 'off');
@ -178,9 +187,6 @@ sub flip_data_checksums
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
note("LSN after disabling: " . $result . "\n");
random_sleep() if ($extended);
wait_for_checksum_state($node_standby, 'off');
$data_checksum_state = 'off';
}
else
@ -207,6 +213,7 @@ log_statement = none
]);
$node_primary->start;
$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
# Create some content to have un-checksummed data in the cluster
$node_primary->safe_psql('postgres',
"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");

View file

@ -67,15 +67,15 @@ sub flip_data_checksums
# log LSN right before we start changing checksums
$lsn_pre =
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
note("LSN before disabling: " . $lsn_pre . "\n");
disable_data_checksums($node_primary);
# Wait for checksums disabled on the primary
wait_for_checksum_state($node_primary, 'off');
# Disable checksums on the primary and wait for completion
disable_data_checksums($node_primary, wait => 1);
# log LSN right after the primary flips checksums to "off"
$lsn_post =
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
note("LSN after disabling: " . $lsn_post . "\n");
$data_checksum_state = 'off';
}

View file

@ -175,8 +175,19 @@ EOQ
$postgresnode->safe_psql('postgres',
sprintf($query, $params{cost_delay}, $params{cost_limit}));
wait_for_checksum_state($postgresnode, $params{wait})
if (defined($params{wait}));
if (defined($params{wait}))
{
wait_for_checksum_state($postgresnode, $params{wait});
# If we are tasked with waiting for an end state, also wait for the
# launcher to exit.
if ($params{wait} eq 'on' || $params{wait} eq 'off')
{
$postgresnode->poll_query_until('postgres',
"SELECT count(*) = 0 "
. "FROM pg_catalog.pg_stat_activity "
. "WHERE backend_type = 'datachecksum launcher';");
}
}
}
=item disable_data_checksums($node, %params)
@ -204,7 +215,14 @@ sub disable_data_checksums
$postgresnode->safe_psql('postgres',
'SELECT pg_disable_data_checksums();');
wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait}));
if (defined($params{wait}))
{
wait_for_checksum_state($postgresnode, 'off');
$postgresnode->poll_query_until('postgres',
"SELECT count(*) = 0 "
. "FROM pg_catalog.pg_stat_activity "
. "WHERE backend_type = 'datachecksum launcher';");
}
}
=item cointoss

View file

@ -14,11 +14,3 @@ CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true)
CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true)
RETURNS pg_catalog.void
AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true)
RETURNS pg_catalog.void
AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true)
RETURNS pg_catalog.void
AS 'MODULE_PATHNAME' LANGUAGE C;

View file

@ -25,8 +25,6 @@ extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_d
extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg);
extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg);
extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg);
/*
* Test for delaying emission of procsignalbarriers.
*/
@ -107,80 +105,3 @@ dcw_inject_startup_delay(PG_FUNCTION_ARGS)
#endif
PG_RETURN_VOID();
}
#ifdef USE_INJECTION_POINTS
static uint32 db_fail = DATACHECKSUMSWORKER_FAILED;
#endif
void
dc_modify_db_result(const char *name, const void *private_data, void *arg)
{
DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg;
uint32 new_res = *(uint32 *) private_data;
*res = new_res;
}
PG_FUNCTION_INFO_V1(dcw_inject_fail_database);
Datum
dcw_inject_fail_database(PG_FUNCTION_ARGS)
{
#ifdef USE_INJECTION_POINTS
bool attach = PG_GETARG_BOOL(0);
if (attach)
InjectionPointAttach("datachecksumsworker-modify-db-result",
"test_checksums",
"dc_modify_db_result",
&db_fail,
sizeof(uint32));
else
InjectionPointDetach("datachecksumsworker-modify-db-result");
#else
elog(ERROR,
"test is not working as intended when injection points are disabled");
#endif
PG_RETURN_VOID();
}
/*
* Test to force waiting for existing temptables.
*/
void
dc_fake_temptable(const char *name, const void *private_data, void *arg)
{
static bool first_pass = true;
int *numleft = (int *) arg;
if (first_pass)
*numleft = 1;
first_pass = false;
}
PG_FUNCTION_INFO_V1(dcw_fake_temptable);
Datum
dcw_fake_temptable(PG_FUNCTION_ARGS)
{
#ifdef USE_INJECTION_POINTS
bool attach = PG_GETARG_BOOL(0);
if (attach)
InjectionPointAttach("datachecksumsworker-fake-temptable-wait",
"test_checksums",
"dc_fake_temptable",
NULL,
0);
else
InjectionPointDetach("datachecksumsworker-fake-temptable-wait");
#else
elog(ERROR,
"test is not working as intended when injection points are disabled");
#endif
PG_RETURN_VOID();
}
void
crash(const char *name, const void *private_data, void *arg)
{
abort();
}