mirror of
https://github.com/postgres/postgres.git
synced 2026-06-09 00:32:10 -04:00
Fix checksum state transition during promotion
When a standby is promoted to primary while checksums are being enabled and the state is inprogress-on, the standby shall revert the state to off since checksums weren't fully enabled at the time of the crash. Consider the following scenario: 1) primary/standby cluster has checksums off; 2) primary starts enabling checksums; 3) primary moves to inprogress-on; 4) standby receives that and moves to inprogress-on too; 5) primary crashes; 6) standby gets promoted, and does the StartupXLOG thing; 7) standby moves from inprogress-on back to off. Any processes in the standby need to be informed at step 6 to change state with a procsignalbarrier, else they will stay in inprogress-on while new backends will see the state as off. StartupXLOG failed to emit a procsignalbarrier, which caused inconsistent state in the node promoted to primary. Fixed by emitting a procsignalbarrier during promotion, and adding a new test for this scenario. Author: Daniel Gustafsson <daniel@yesql.se> Reported-by: Tomas Vondra <tomas@vondra.me> Discussion: https://postgr.es/m/f1281cf3-89a3-4936-9bc5-2a5a6291229f@vondra.me
This commit is contained in:
parent
38470c2c1e
commit
5fee7cab1b
2 changed files with 62 additions and 1 deletions
|
|
@ -6610,6 +6610,7 @@ StartupXLOG(void)
|
|||
SetLocalDataChecksumState(XLogCtl->data_checksum_version);
|
||||
SpinLockRelease(&XLogCtl->info_lck);
|
||||
|
||||
EmitAndWaitDataChecksumsBarrier(PG_DATA_CHECKSUM_OFF);
|
||||
ereport(WARNING,
|
||||
errmsg("enabling data checksums was interrupted"),
|
||||
errhint("Data checksum processing must be manually restarted for checksums to be enabled."));
|
||||
|
|
@ -6621,7 +6622,7 @@ StartupXLOG(void)
|
|||
* checksums and we can move to off instead of prompting the user to
|
||||
* perform any action.
|
||||
*/
|
||||
if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF)
|
||||
else if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF)
|
||||
{
|
||||
XLogChecksums(PG_DATA_CHECKSUM_OFF);
|
||||
|
||||
|
|
@ -6629,6 +6630,8 @@ StartupXLOG(void)
|
|||
XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_OFF;
|
||||
SetLocalDataChecksumState(XLogCtl->data_checksum_version);
|
||||
SpinLockRelease(&XLogCtl->info_lck);
|
||||
|
||||
EmitAndWaitDataChecksumsBarrier(PG_DATA_CHECKSUM_OFF);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -226,4 +226,62 @@ unlike(
|
|||
qr/page verification failed,.+\d$/m,
|
||||
"no checksum validation errors in standby log");
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test that enforced state transitions during promotion (via StartupXLOG) are
|
||||
# performed as expected. When the primary crashes during inprogress-on the
|
||||
# standby should revert to off at promotion. In order to check the transition
|
||||
# the test will keep an open psql session with the standby during promotion.
|
||||
|
||||
# The cluster is currently broken down from the previous test. Start up the
|
||||
# primary as primary, disable checksums and create a new standby from that
|
||||
# state.
|
||||
$node_standby->clean_node();
|
||||
$node_primary->start();
|
||||
disable_data_checksums($node_primary, wait => 'off');
|
||||
|
||||
# Re-create a new streaming standby linking to primary. The replication slot
|
||||
# name is reused from earlier but a fresh backup is taken
|
||||
$backup_name = 'my_new_backup';
|
||||
$node_primary->backup($backup_name);
|
||||
$node_standby = PostgreSQL::Test::Cluster->new('standby_restarts_standby');
|
||||
$node_standby->init_from_backup($node_primary, $backup_name,
|
||||
has_streaming => 1);
|
||||
$node_standby->append_conf(
|
||||
'postgresql.conf', qq[
|
||||
primary_slot_name = '$slotname'
|
||||
]);
|
||||
$node_standby->start;
|
||||
$node_primary->wait_for_catchup($node_standby, 'replay');
|
||||
|
||||
# Open a background psql connection on the primary and inject a barrier to
|
||||
# block progress, keeping the state from advancing past inprogress-on.
|
||||
my $node_primary_bpsql = $node_primary->background_psql('postgres');
|
||||
$node_primary_bpsql->query_safe('CREATE TEMPORARY TABLE tt (a integer);');
|
||||
# Also open a background psql connection to the standby to make sure we have
|
||||
# an active backend during promotion.
|
||||
my $node_standby_bpsql = $node_standby->background_psql('postgres');
|
||||
|
||||
# Start to enable checksums and wait until both primary and standby have moved
|
||||
# to the inprogress-on state. Processing will block here as the temporary rel
|
||||
# barrier will block the primary from finishing.
|
||||
enable_data_checksums($node_primary, wait => 'inprogress-on');
|
||||
$node_primary->wait_for_catchup($node_standby, 'replay');
|
||||
test_checksum_state($node_standby, 'inprogress-on');
|
||||
|
||||
# Crash the primary before checksums are enabled and promote the standby. The
|
||||
# new primary node will now revert the state to 'off' since checksums weren't
|
||||
# fully enabled at the time of the crash.
|
||||
$node_primary->teardown_node();
|
||||
$node_standby->promote;
|
||||
wait_for_checksum_state($node_standby, 'off');
|
||||
|
||||
# Ensure that any backend which was active before, and during, promotion
|
||||
# sees the new state.
|
||||
$result = $node_standby_bpsql->query_safe("SHOW data_checksums;");
|
||||
is($result, 'off',
|
||||
'ensure checksums are set to off after promotion during inprogress-on');
|
||||
|
||||
$node_standby_bpsql->quit;
|
||||
$node_standby->stop;
|
||||
|
||||
done_testing();
|
||||
|
|
|
|||
Loading…
Reference in a new issue