diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index f6f5f0792d2..4cf6e41e2f5 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -99,6 +99,7 @@ InitXLogReaderState(XLogRecPtr lsn) XLogReaderState *xlogreader; ReadLocalXLogPageNoWaitPrivate *private_data; XLogRecPtr first_valid_record; + char *errormsg; /* * Reading WAL below the first page of the first segments isn't allowed. @@ -126,12 +127,19 @@ InitXLogReaderState(XLogRecPtr lsn) errdetail("Failed while allocating a WAL reading processor."))); /* first find a valid recptr to start from */ - first_valid_record = XLogFindNextRecord(xlogreader, lsn); + first_valid_record = XLogFindNextRecord(xlogreader, lsn, &errormsg); if (!XLogRecPtrIsValid(first_valid_record)) - ereport(ERROR, - errmsg("could not find a valid record after %X/%08X", - LSN_FORMAT_ARGS(lsn))); + { + if (errormsg) + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X: %s", + LSN_FORMAT_ARGS(lsn), errormsg)); + else + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(lsn))); + } return xlogreader; } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 8cb2110cb99..8849610db00 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1390,14 +1390,21 @@ XLogReaderResetError(XLogReaderState *state) * * This positions the reader, like XLogBeginRead(), so that the next call to * XLogReadRecord() will read the next valid record. + * + * On failure, InvalidXLogRecPtr is returned, and *errormsg is set to a string + * with details of the failure. + * + * When set, *errormsg points to an internal buffer that's valid until the next + * call to XLogReadRecord. */ XLogRecPtr -XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) +XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) { XLogRecPtr tmpRecPtr; XLogRecPtr found = InvalidXLogRecPtr; XLogPageHeader header; - char *errormsg; + + *errormsg = NULL; Assert(XLogRecPtrIsValid(RecPtr)); @@ -1482,7 +1489,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) * or we just jumped over the remaining data of a continuation. */ XLogBeginRead(state, tmpRecPtr); - while (XLogReadRecord(state, &errormsg) != NULL) + while (XLogReadRecord(state, errormsg) != NULL) { /* past the record we've found, break out */ if (RecPtr <= state->ReadRecPtr) @@ -1497,6 +1504,17 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) err: XLogReaderInvalReadState(state); + /* + * We may have reported errors due to invalid WAL header, propagate the + * error message to the caller. + */ + if (state->errormsg_deferred) + { + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + state->errormsg_deferred = false; + } + return InvalidXLogRecPtr; } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index e1aa102f41d..0c0670f7da9 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -915,6 +915,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, WalSummaryIO io; BlockRefTable *brtab = CreateEmptyBlockRefTable(); bool fast_forward = true; + char *errormsg; /* Initialize private data for xlogreader. */ private_data = palloc0_object(SummarizerReadLocalXLogPrivate); @@ -966,7 +967,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, } else { - summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn); + summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn, &errormsg); if (!XLogRecPtrIsValid(summary_start_lsn)) { /* @@ -995,9 +996,16 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, switch_lsn = xlogreader->EndRecPtr; } else - ereport(ERROR, - errmsg("could not find a valid record after %X/%08X", - LSN_FORMAT_ARGS(start_lsn))); + { + if (errormsg) + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X: %s", + LSN_FORMAT_ARGS(start_lsn), errormsg)); + else + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(start_lsn))); + } } /* We shouldn't go backward. */ @@ -1010,7 +1018,6 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, while (1) { int block_id; - char *errormsg; XLogRecord *record; uint8 rmid; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index f82507ef696..630d9859882 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -1384,11 +1384,17 @@ main(int argc, char **argv) pg_fatal("out of memory while allocating a WAL reading processor"); /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + first_record = XLogFindNextRecord(xlogreader_state, private.startptr, &errormsg); if (!XLogRecPtrIsValid(first_record)) - pg_fatal("could not find a valid record after %X/%08X", - LSN_FORMAT_ARGS(private.startptr)); + { + if (errormsg) + pg_fatal("could not find a valid record after %X/%08X: %s", + LSN_FORMAT_ARGS(private.startptr), errormsg); + else + pg_fatal("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index 11df7e092bf..8bb8fa225f6 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -4,6 +4,7 @@ use strict; use warnings FATAL => 'all'; use Cwd; +use File::Copy; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; @@ -246,6 +247,33 @@ command_like( qr/^$/, 'no output with --quiet option'); +# Test that pg_waldump reports a detailed error message when dumping +# a WAL file with an invalid magic number (0000). +# +# The broken WAL file is created by copying a valid WAL file and +# overwriting its magic number with 0000. +my $broken_wal_dir = PostgreSQL::Test::Utils::tempdir_short(); +my $broken_wal = "$broken_wal_dir/$start_walfile"; +copy($node->data_dir . '/pg_wal/' . $start_walfile, $broken_wal) + || die "copying $start_walfile $!"; + +my $fh; +open($fh, '+<', $broken_wal) + or BAIL_OUT("open failed: $!"); +binmode $fh; + +sysseek($fh, 0, 0) + or BAIL_OUT("sysseek failed: $!"); +syswrite($fh, pack("S", 0)) + or BAIL_OUT("syswrite failed: $!"); +close($fh) + or BAIL_OUT("close failed: $!"); + +command_fails_like( + [ 'pg_waldump', $broken_wal ], + qr/invalid magic number 0000/i, + 'detailed error message shown for invalid WAL page magic'); + # Test for: Display a message that we're skipping data if `from` # wasn't a pointer to the start of a record. sub test_pg_waldump_skip_bytes diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 80f1a548e08..97eae2c1dab 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -342,7 +342,8 @@ extern void XLogReaderSetDecodeBuffer(XLogReaderState *state, /* Position the XLogReader to given record */ extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr); -extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); +extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, + char **errormsg); /* Return values from XLogPageReadCB. */ typedef enum XLogPageReadResult