mirror of
https://github.com/postgres/postgres.git
synced 2026-05-28 04:35:45 -04:00
Test stabilization for online checksums
Postcommit review and buildfarm/CI failures revealed a few issues in
the test code which this commit attempts to resolve. These failures
are verified using synthetic means.
* Wait for launcher exit in enable/disable checksum tests
When enabling or disabling data checksums in a test with waiting
for an end state (on or off), the test typically want to perform
more test against the cluster immediately. Make sure to wait for
the launcher to exit in these cases before returning in order to
know it can immediately be acted on. This is a more generic way
of implementating 0036232ba8.
* Refactor injection point tests to use the injection_points test
extension. Two injection points added for online checksums were
better expressed using the injection_points extension with the
test code embedded in datachecksum_state.c.
* Make tests less timing dependent and allow transitions to "on"
and not just "inprogress-on" in case a test manages to finish
before it's checked for state.
* When waiting on a blocking background psql keeping a temporary
table open, the test first closed the background session abd
then the server. This could cause data checksums to manage to
get enabled in the brief window between dropping the temporary
table and closing the server. Fix by closing the server first
before the background session.
* Remove a few superfluous duplicate checks and general cleanup
of comments as well as making LSN logging consistent.
These issues were reported by Andres as well as spotted in the
buildfarm and on CI.
Author: Daniel Gustafsson <daniel@yesql.se>
Reported-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/92F25C14-801E-4198-994D-D83E31FEB0D8@yesql.se
This commit is contained in:
parent
d771b0a907
commit
07009121c2
11 changed files with 101 additions and 119 deletions
|
|
@ -1216,8 +1216,15 @@ ProcessAllDatabases(void)
|
|||
|
||||
result = ProcessDatabase(db);
|
||||
|
||||
#ifdef USE_INJECTION_POINTS
|
||||
/* Allow a test process to alter the result of the operation */
|
||||
INJECTION_POINT("datachecksumsworker-modify-db-result", &result);
|
||||
if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fail-db-result"))
|
||||
{
|
||||
result = DATACHECKSUMSWORKER_FAILED;
|
||||
INJECTION_POINT_CACHED("datachecksumsworker-fail-db-result",
|
||||
db->dbname);
|
||||
}
|
||||
#endif
|
||||
|
||||
pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
|
||||
++cumulative_total);
|
||||
|
|
@ -1451,6 +1458,9 @@ DataChecksumsWorkerMain(Datum arg)
|
|||
BufferAccessStrategy strategy;
|
||||
bool aborted = false;
|
||||
int64 rels_done;
|
||||
#ifdef USE_INJECTION_POINTS
|
||||
bool retried = false;
|
||||
#endif
|
||||
|
||||
operation = ENABLE_DATACHECKSUMS;
|
||||
|
||||
|
|
@ -1566,7 +1576,19 @@ DataChecksumsWorkerMain(Datum arg)
|
|||
}
|
||||
list_free(CurrentTempTables);
|
||||
|
||||
INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft);
|
||||
#ifdef USE_INJECTION_POINTS
|
||||
if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fake-temptable-wait"))
|
||||
{
|
||||
/* Make sure to just cause one retry */
|
||||
if (!retried && numleft == 0)
|
||||
{
|
||||
numleft = 1;
|
||||
retried = true;
|
||||
|
||||
INJECTION_POINT_CACHED("datachecksumsworker-fake-temptable-wait", NULL);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (numleft == 0)
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -35,11 +35,7 @@ my $result =
|
|||
is($result, '9999', 'ensure checksummed pages can be read back');
|
||||
|
||||
# Enable data checksums again which should be a no-op so we explicitly don't
|
||||
# wait for any state transition as none should happen here. Make sure to let
|
||||
# any running launcher finish in case it's still wrapping up.
|
||||
$result = $node->poll_query_until('postgres',
|
||||
"SELECT count(*) = 0 FROM pg_catalog.pg_stat_activity WHERE backend_type = 'datachecksum launcher';"
|
||||
);
|
||||
# wait for any state transition as none should happen here.
|
||||
enable_data_checksums($node);
|
||||
test_checksum_state($node, 'on');
|
||||
# ..and make sure we can still read/write data
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ $node_standby->start;
|
|||
$node_primary->safe_psql('postgres',
|
||||
"CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
|
||||
|
||||
# Wait for standbys to catch up
|
||||
# Wait for standby to catch up
|
||||
$node_primary->wait_for_catchup($node_standby, 'replay',
|
||||
$node_primary->lsn('insert'));
|
||||
|
||||
|
|
@ -54,8 +54,14 @@ test_checksum_state($node_standby, 'off');
|
|||
# standby change state.
|
||||
#
|
||||
|
||||
# Ensure that the primary switches to "inprogress-on"
|
||||
enable_data_checksums($node_primary, wait => 'inprogress-on');
|
||||
# Initiate enabling of checksums and ensure that the primary switches to
|
||||
# either "inprogress-on" or "on"
|
||||
enable_data_checksums($node_primary);
|
||||
my $result = $node_primary->poll_query_until(
|
||||
'postgres',
|
||||
"SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
|
||||
'f');
|
||||
is($result, 1, 'ensure primary has transitioned from off');
|
||||
# Wait for checksum enable to be replayed
|
||||
$node_primary->wait_for_catchup($node_standby, 'replay');
|
||||
|
||||
|
|
@ -63,7 +69,7 @@ $node_primary->wait_for_catchup($node_standby, 'replay');
|
|||
# would be "inprogress-on", but it is theoretically possible for the primary to
|
||||
# complete the checksum enabling *and* have the standby replay that record
|
||||
# before we reach the check below.
|
||||
my $result = $node_standby->poll_query_until(
|
||||
$result = $node_standby->poll_query_until(
|
||||
'postgres',
|
||||
"SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
|
||||
'f');
|
||||
|
|
|
|||
|
|
@ -62,11 +62,15 @@ $result = $node->safe_psql('postgres',
|
|||
"SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';");
|
||||
is($result, 't', 'ensure we can see the temporary table');
|
||||
|
||||
# Enable, but stop waiting at inprogress-on since it will sit there until the
|
||||
# above temporary table is removed.
|
||||
enable_data_checksums($node, wait => 'inprogress-on');
|
||||
|
||||
# Turn the cluster off and enable checksums offline, then start back up
|
||||
# Turn the cluster off and enable checksums offline, then start back up.
|
||||
# Stop the cluster before exiting the background session since otherwise
|
||||
# checksums might have time to get enabled before shutting down the cluster.
|
||||
$node->stop('fast');
|
||||
$bsession->quit;
|
||||
$node->stop;
|
||||
$node->checksum_enable_offline;
|
||||
$node->start;
|
||||
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ $node->start;
|
|||
|
||||
# Set up test environment
|
||||
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
|
||||
$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Inducing failures and crashes in processing
|
||||
|
|
@ -39,9 +40,12 @@ $node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
|
|||
# Force enabling checksums to fail by marking one of the databases as having
|
||||
# failed in processing.
|
||||
disable_data_checksums($node, wait => 1);
|
||||
$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);');
|
||||
$node->safe_psql('postgres',
|
||||
"SELECT injection_points_attach('datachecksumsworker-fail-db-result','notice');"
|
||||
);
|
||||
enable_data_checksums($node, wait => 'off');
|
||||
$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);');
|
||||
$node->safe_psql('postgres',
|
||||
"SELECT injection_points_detach('datachecksumsworker-fail-db-result');");
|
||||
|
||||
# Make sure that disabling after a failure works
|
||||
disable_data_checksums($node);
|
||||
|
|
@ -66,7 +70,9 @@ SKIP:
|
|||
# will force the processing to wait and retry in order to wait for it to
|
||||
# disappear.
|
||||
disable_data_checksums($node, wait => 1);
|
||||
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);');
|
||||
$node->safe_psql('postgres',
|
||||
"SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
|
||||
);
|
||||
enable_data_checksums($node, wait => 'on');
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -89,14 +89,21 @@ sub background_rw_pgbench
|
|||
# before and after state.
|
||||
sub flip_data_checksums
|
||||
{
|
||||
my $temptablewait = 0;
|
||||
|
||||
# First, make sure the cluster is in the state we expect it to be
|
||||
test_checksum_state($node, $data_checksum_state);
|
||||
|
||||
if ($data_checksum_state eq 'off')
|
||||
{
|
||||
# Coin-toss to see if we are injecting a retry due to a temptable
|
||||
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
|
||||
if cointoss();
|
||||
if (cointoss())
|
||||
{
|
||||
$node->safe_psql('postgres',
|
||||
"SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
|
||||
);
|
||||
$temptablewait = 1;
|
||||
}
|
||||
|
||||
# log LSN right before we start changing checksums
|
||||
my $result =
|
||||
|
|
@ -117,7 +124,9 @@ sub flip_data_checksums
|
|||
|
||||
random_sleep() if ($extended);
|
||||
|
||||
$node->safe_psql('postgres', 'SELECT dcw_fake_temptable(false);');
|
||||
$node->safe_psql('postgres',
|
||||
"SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');"
|
||||
) if ($temptablewait);
|
||||
$data_checksum_state = 'on';
|
||||
}
|
||||
elsif ($data_checksum_state eq 'on')
|
||||
|
|
@ -165,6 +174,7 @@ log_statement = none
|
|||
]);
|
||||
$node->start;
|
||||
$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
|
||||
$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
|
||||
# Create some content to have un-checksummed data in the cluster
|
||||
$node->safe_psql('postgres',
|
||||
"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
|
||||
|
|
|
|||
|
|
@ -99,6 +99,8 @@ sub background_pgbench
|
|||
# before and after state.
|
||||
sub flip_data_checksums
|
||||
{
|
||||
my $temptablewait = 0;
|
||||
|
||||
# First, make sure the cluster is in the state we expect it to be
|
||||
test_checksum_state($node_primary, $data_checksum_state);
|
||||
test_checksum_state($node_standby, $data_checksum_state);
|
||||
|
|
@ -106,8 +108,13 @@ sub flip_data_checksums
|
|||
if ($data_checksum_state eq 'off')
|
||||
{
|
||||
# Coin-toss to see if we are injecting a retry due to a temptable
|
||||
$node_primary->safe_psql('postgres', 'SELECT dcw_fake_temptable();')
|
||||
if cointoss();
|
||||
if (cointoss())
|
||||
{
|
||||
$node_primary->safe_psql('postgres',
|
||||
"SELECT injection_points_attach('datachecksumsworker-fake-temptable-wait', 'notice');"
|
||||
);
|
||||
$temptablewait = 1;
|
||||
}
|
||||
|
||||
# log LSN right before we start changing checksums
|
||||
my $result =
|
||||
|
|
@ -154,7 +161,8 @@ sub flip_data_checksums
|
|||
wait_for_checksum_state($node_standby, 'on');
|
||||
|
||||
$node_primary->safe_psql('postgres',
|
||||
'SELECT dcw_fake_temptable(false);');
|
||||
"SELECT injection_points_detach('datachecksumsworker-fake-temptable-wait');"
|
||||
) if ($temptablewait);
|
||||
$data_checksum_state = 'on';
|
||||
}
|
||||
elsif ($data_checksum_state eq 'on')
|
||||
|
|
@ -170,6 +178,7 @@ sub flip_data_checksums
|
|||
$node_primary->wait_for_catchup($node_standby, 'replay');
|
||||
|
||||
# Wait for checksums disabled on the primary and standby
|
||||
random_sleep() if ($extended);
|
||||
wait_for_checksum_state($node_primary, 'off');
|
||||
wait_for_checksum_state($node_standby, 'off');
|
||||
|
||||
|
|
@ -178,9 +187,6 @@ sub flip_data_checksums
|
|||
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
|
||||
note("LSN after disabling: " . $result . "\n");
|
||||
|
||||
random_sleep() if ($extended);
|
||||
wait_for_checksum_state($node_standby, 'off');
|
||||
|
||||
$data_checksum_state = 'off';
|
||||
}
|
||||
else
|
||||
|
|
@ -207,6 +213,7 @@ log_statement = none
|
|||
]);
|
||||
$node_primary->start;
|
||||
$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
|
||||
$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
|
||||
# Create some content to have un-checksummed data in the cluster
|
||||
$node_primary->safe_psql('postgres',
|
||||
"CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
|
||||
|
|
|
|||
|
|
@ -67,15 +67,15 @@ sub flip_data_checksums
|
|||
# log LSN right before we start changing checksums
|
||||
$lsn_pre =
|
||||
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
|
||||
note("LSN before disabling: " . $lsn_pre . "\n");
|
||||
|
||||
disable_data_checksums($node_primary);
|
||||
|
||||
# Wait for checksums disabled on the primary
|
||||
wait_for_checksum_state($node_primary, 'off');
|
||||
# Disable checksums on the primary and wait for completion
|
||||
disable_data_checksums($node_primary, wait => 1);
|
||||
|
||||
# log LSN right after the primary flips checksums to "off"
|
||||
$lsn_post =
|
||||
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
|
||||
note("LSN after disabling: " . $lsn_post . "\n");
|
||||
|
||||
$data_checksum_state = 'off';
|
||||
}
|
||||
|
|
|
|||
|
|
@ -175,8 +175,19 @@ EOQ
|
|||
$postgresnode->safe_psql('postgres',
|
||||
sprintf($query, $params{cost_delay}, $params{cost_limit}));
|
||||
|
||||
wait_for_checksum_state($postgresnode, $params{wait})
|
||||
if (defined($params{wait}));
|
||||
if (defined($params{wait}))
|
||||
{
|
||||
wait_for_checksum_state($postgresnode, $params{wait});
|
||||
# If we are tasked with waiting for an end state, also wait for the
|
||||
# launcher to exit.
|
||||
if ($params{wait} eq 'on' || $params{wait} eq 'off')
|
||||
{
|
||||
$postgresnode->poll_query_until('postgres',
|
||||
"SELECT count(*) = 0 "
|
||||
. "FROM pg_catalog.pg_stat_activity "
|
||||
. "WHERE backend_type = 'datachecksum launcher';");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
=item disable_data_checksums($node, %params)
|
||||
|
|
@ -204,7 +215,14 @@ sub disable_data_checksums
|
|||
$postgresnode->safe_psql('postgres',
|
||||
'SELECT pg_disable_data_checksums();');
|
||||
|
||||
wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait}));
|
||||
if (defined($params{wait}))
|
||||
{
|
||||
wait_for_checksum_state($postgresnode, 'off');
|
||||
$postgresnode->poll_query_until('postgres',
|
||||
"SELECT count(*) = 0 "
|
||||
. "FROM pg_catalog.pg_stat_activity "
|
||||
. "WHERE backend_type = 'datachecksum launcher';");
|
||||
}
|
||||
}
|
||||
|
||||
=item cointoss
|
||||
|
|
|
|||
|
|
@ -14,11 +14,3 @@ CREATE FUNCTION dcw_inject_launcher_delay(attach boolean DEFAULT true)
|
|||
CREATE FUNCTION dcw_inject_startup_delay(attach boolean DEFAULT true)
|
||||
RETURNS pg_catalog.void
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true)
|
||||
RETURNS pg_catalog.void
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true)
|
||||
RETURNS pg_catalog.void
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
|
|
|||
|
|
@ -25,8 +25,6 @@ extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_d
|
|||
extern PGDLLEXPORT void dc_modify_db_result(const char *name, const void *private_data, void *arg);
|
||||
extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg);
|
||||
|
||||
extern PGDLLEXPORT void crash(const char *name, const void *private_data, void *arg);
|
||||
|
||||
/*
|
||||
* Test for delaying emission of procsignalbarriers.
|
||||
*/
|
||||
|
|
@ -107,80 +105,3 @@ dcw_inject_startup_delay(PG_FUNCTION_ARGS)
|
|||
#endif
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
#ifdef USE_INJECTION_POINTS
|
||||
static uint32 db_fail = DATACHECKSUMSWORKER_FAILED;
|
||||
#endif
|
||||
|
||||
void
|
||||
dc_modify_db_result(const char *name, const void *private_data, void *arg)
|
||||
{
|
||||
DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg;
|
||||
uint32 new_res = *(uint32 *) private_data;
|
||||
|
||||
*res = new_res;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_inject_fail_database);
|
||||
Datum
|
||||
dcw_inject_fail_database(PG_FUNCTION_ARGS)
|
||||
{
|
||||
#ifdef USE_INJECTION_POINTS
|
||||
bool attach = PG_GETARG_BOOL(0);
|
||||
|
||||
if (attach)
|
||||
InjectionPointAttach("datachecksumsworker-modify-db-result",
|
||||
"test_checksums",
|
||||
"dc_modify_db_result",
|
||||
&db_fail,
|
||||
sizeof(uint32));
|
||||
else
|
||||
InjectionPointDetach("datachecksumsworker-modify-db-result");
|
||||
#else
|
||||
elog(ERROR,
|
||||
"test is not working as intended when injection points are disabled");
|
||||
#endif
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
/*
|
||||
* Test to force waiting for existing temptables.
|
||||
*/
|
||||
void
|
||||
dc_fake_temptable(const char *name, const void *private_data, void *arg)
|
||||
{
|
||||
static bool first_pass = true;
|
||||
int *numleft = (int *) arg;
|
||||
|
||||
if (first_pass)
|
||||
*numleft = 1;
|
||||
first_pass = false;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(dcw_fake_temptable);
|
||||
Datum
|
||||
dcw_fake_temptable(PG_FUNCTION_ARGS)
|
||||
{
|
||||
#ifdef USE_INJECTION_POINTS
|
||||
bool attach = PG_GETARG_BOOL(0);
|
||||
|
||||
if (attach)
|
||||
InjectionPointAttach("datachecksumsworker-fake-temptable-wait",
|
||||
"test_checksums",
|
||||
"dc_fake_temptable",
|
||||
NULL,
|
||||
0);
|
||||
else
|
||||
InjectionPointDetach("datachecksumsworker-fake-temptable-wait");
|
||||
#else
|
||||
elog(ERROR,
|
||||
"test is not working as intended when injection points are disabled");
|
||||
#endif
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
void
|
||||
crash(const char *name, const void *private_data, void *arg)
|
||||
{
|
||||
abort();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue