Allow logical replication snapshots to be database-specific

By default, logical decoding assumes access to shared catalogs, so
the snapshot builder needs to consider cluster-wide XIDs during startup.
That in turn means that, if any transaction is already running (and has
an XID assigned), the snapshot builder needs to wait for its completion, as
it does not know if that transaction performed catalog changes earlier.

A possible problem with this concept is that if REPACK (CONCURRENTLY) is
running in some database, backends running the same command in other
databases get stuck until the first one has committed. Thus only a
single backend in the cluster can run REPACK (CONCURRENTLY) at any time.
Likewise, REPACK (CONCURRENTLY) can block walsenders starting on behalf
of subscriptions throughout the cluster.

This patch adds a new option to logical replication output plugins, to
declare that the plugin does not use shared catalogs (i.e. catalogs that
can be changed by transactions running in other databases in the cluster).
In that case, none of the snapshots the backend uses during decoding needs
to contain information about transactions running in other databases. Thus
the snapshot builder only needs to wait for completion of transactions
in the current database.

Currently we only use this option in the REPACK background worker. It
could possibly be used in the plugin for logical replication too,
however that would need thorough analysis of that plugin.

Bump WAL version number, due to a new field in xl_running_xacts.

Author: Antonin Houska <ah@cybertec.at>
Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de>
Discussion: https://postgr.es/m/90475.1775218118@localhost
This commit is contained in:
Álvaro Herrera 2026-04-07 12:31:18 +02:00
parent a3b069ef90
commit 0d3dba38c7
No known key found for this signature in database
GPG key ID: 1C20ACB9D5C564AE
21 changed files with 166 additions and 18 deletions

View file

@ -621,7 +621,7 @@ GetStrictOldestNonRemovableTransactionId(Relation rel)
else if (rel == NULL || rel->rd_rel->relisshared)
{
/* Shared relation: take into account all running xids */
runningTransactions = GetRunningTransactionData();
runningTransactions = GetRunningTransactionData(InvalidOid);
LWLockRelease(ProcArrayLock);
LWLockRelease(XidGenLock);
return runningTransactions->oldestRunningXid;
@ -632,7 +632,7 @@ GetStrictOldestNonRemovableTransactionId(Relation rel)
* Normal relation: take into account xids running within the current
* database
*/
runningTransactions = GetRunningTransactionData();
runningTransactions = GetRunningTransactionData(InvalidOid);
LWLockRelease(ProcArrayLock);
LWLockRelease(XidGenLock);
return runningTransactions->oldestDatabaseRunningXid;

View file

@ -959,6 +959,7 @@ typedef struct OutputPluginOptions
{
OutputPluginOutputType output_type;
bool receive_rewrites;
bool need_shared_catalogs;
} OutputPluginOptions;
</programlisting>
<literal>output_type</literal> has to either be set to
@ -969,6 +970,9 @@ typedef struct OutputPluginOptions
also be called for changes made by heap rewrites during certain DDL
operations. These are of interest to plugins that handle DDL
replication, but they require special handling.
<literal>need_shared_catalogs</literal> can be set to false if you are
certain the plugin functions do not access shared system catalogs.
Doing so can speed up creation of replication slots that use this plugin.
</para>
<para>

View file

@ -394,6 +394,14 @@ systable_beginscan(Relation heapRelation,
SysScanDesc sysscan;
Relation irel;
/*
* If this backend promised that it won't access shared catalogs during
* logical decoding, this is the right place to verify that.
*/
Assert(!HistoricSnapshotActive() ||
accessSharedCatalogsInDecoding ||
!heapRelation->rd_rel->relisshared);
if (indexOK &&
!IgnoreSystemIndexes &&
!ReindexIsProcessingIndex(indexId))

View file

@ -41,6 +41,8 @@ standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec)
for (i = 0; i < xlrec->subxcnt; i++)
appendStringInfo(buf, " %u", xlrec->xids[xlrec->xcnt + i]);
}
appendStringInfo(buf, "; dbid: %u", xlrec->dbid);
}
void

View file

@ -7735,7 +7735,7 @@ CreateCheckPoint(int flags)
* recovery we don't need to write running xact data.
*/
if (!shutdown && XLogStandbyInfoActive())
LogStandbySnapshot();
LogStandbySnapshot(InvalidOid);
START_CRIT_SECTION();

View file

@ -245,7 +245,7 @@ pg_log_standby_snapshot(PG_FUNCTION_ARGS)
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("pg_log_standby_snapshot() can only be used if \"wal_level\" >= \"replica\"")));
recptr = LogStandbySnapshot();
recptr = LogStandbySnapshot(InvalidOid);
/*
* As a convenience, return the WAL location of the last inserted record

View file

@ -289,7 +289,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
if (now >= timeout &&
last_snapshot_lsn <= GetLastImportantRecPtr())
{
last_snapshot_lsn = LogStandbySnapshot();
last_snapshot_lsn = LogStandbySnapshot(InvalidOid);
last_snapshot_ts = now;
}
}

View file

@ -382,7 +382,16 @@ standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
xl_running_xacts *running = (xl_running_xacts *) XLogRecGetData(r);
SnapBuildProcessRunningXacts(builder, buf->origptr, running);
/*
* Update this decoder's idea of transactions currently
* running. In doing so we will determine whether we have
* reached consistent status.
*
* If the output plugin doesn't need access to shared
* catalogs, we can ignore transactions in other databases.
*/
SnapBuildProcessRunningXacts(builder, buf->origptr, running,
!ctx->options.need_shared_catalogs);
/*
* Abort all transactions that we keep track of, that are
@ -392,8 +401,12 @@ standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
* all running transactions which includes prepared ones,
* while shutdown checkpoints just know that no non-prepared
* transactions are in progress.
*
* The database-specific records might work here too, but it's
* not their purpose.
*/
ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid);
if (!OidIsValid(running->dbid))
ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid);
}
break;
case XLOG_STANDBY_LOCK:

View file

@ -285,6 +285,9 @@ StartupDecodingContext(List *output_plugin_options,
ctx->write = do_write;
ctx->update_progress = update_progress;
/* Assume shared catalog access. The startup callback can change it. */
ctx->options.need_shared_catalogs = true;
ctx->output_plugin_options = output_plugin_options;
ctx->fast_forward = fast_forward;

View file

@ -154,6 +154,14 @@
static ResourceOwner SavedResourceOwnerDuringExport = NULL;
static bool ExportInProgress = false;
/*
* If a backend is going to do logical decoding and the output plugin does
* not need to access shared catalogs, setting this variable to false can make
* the decoding startup faster. In particular, the backend will not need to
* wait for completion of already running transactions in other databases.
*/
bool accessSharedCatalogsInDecoding = true;
/* ->committed and ->catchange manipulation */
static void SnapBuildPurgeOlderTxn(SnapBuild *builder);
@ -170,7 +178,8 @@ static inline bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, Transaction
uint32 xinfo);
/* xlog reading helper functions for SnapBuildProcessRunningXacts */
static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn,
xl_running_xacts *running);
static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff);
/* serialization functions */
@ -226,6 +235,9 @@ AllocateSnapshotBuilder(ReorderBuffer *reorder,
MemoryContextSwitchTo(oldcontext);
/* The default is that shared catalogs are used. */
accessSharedCatalogsInDecoding = true;
return builder;
}
@ -244,6 +256,9 @@ FreeSnapshotBuilder(SnapBuild *builder)
builder->snapshot = NULL;
}
/* The default is that shared catalogs are used. */
accessSharedCatalogsInDecoding = true;
/* other resources are deallocated via memory context reset */
MemoryContextDelete(context);
}
@ -1136,7 +1151,8 @@ SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid,
* anymore.
*/
void
SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running,
bool db_specific)
{
ReorderBufferTXN *txn;
TransactionId xmin;
@ -1148,6 +1164,33 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
*/
if (builder->state < SNAPBUILD_CONSISTENT)
{
/*
* To reduce the potential for unnecessarily waiting for completion of
* unrelated transactions, the caller can declare that only
* transactions of the current database are relevant at this stage.
*/
if (db_specific)
{
/*
* If we must only keep track of transactions running in the
* current database, we need transaction info from exactly that
* database.
*/
if (running->dbid != MyDatabaseId)
{
LogStandbySnapshot(MyDatabaseId);
return;
}
/*
* We'd better be able to check during scan if the plugin does not
* lie.
*/
if (accessSharedCatalogsInDecoding)
accessSharedCatalogsInDecoding = false;
}
/* returns false if there's no point in performing cleanup just yet */
if (!SnapBuildFindSnapshot(builder, lsn, running))
return;
@ -1155,6 +1198,16 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
else
SnapBuildSerialize(builder, lsn);
/*
* Database-specific transaction info may exist to reach the CONSISTENT
* state faster, but the code below makes no use of it. Moreover, such a
* record might cause problems because the following normal (cluster-wide)
* record can have a lower value of oldestRunningXid. In that case, let's
* wait with the cleanup for the next regular cluster-wide record.
*/
if (OidIsValid(running->dbid))
return;
/*
* Update range of interesting xids based on the running xacts
* information. We don't increase ->xmax using it, because once we are in
@ -1465,7 +1518,11 @@ SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff)
*/
if (!RecoveryInProgress())
{
LogStandbySnapshot();
/*
* If the last transaction info was about a specific database, the next
* one needs to be too - at least until we're in the CONSISTENT state.
*/
LogStandbySnapshot(running->dbid);
}
}

View file

@ -52,6 +52,13 @@ repack_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
/* Probably unnecessary, as we don't use the SQL interface ... */
opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;
/*
* REPACK doesn't need access to shared catalogs, so we can speed up the
* historic snapshot creation by setting this flag. We'll only have to
* wait for transactions in our database.
*/
opt->need_shared_catalogs = false;
if (ctx->output_plugin_options != NIL)
{
ereport(ERROR,

View file

@ -1756,7 +1756,7 @@ ReplicationSlotReserveWal(void)
XLogRecPtr flushptr;
/* make sure we have enough information to start */
flushptr = LogStandbySnapshot();
flushptr = LogStandbySnapshot(InvalidOid);
/* and make sure it's fsynced to disk */
XLogFlush(flushptr);

View file

@ -2623,9 +2623,11 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
*
* Note that if any transaction has overflowed its cached subtransactions
* then there is no real need to include any subtransactions.
*
* If 'dbid' is valid, only gather transactions running in that database.
*/
RunningTransactions
GetRunningTransactionData(void)
GetRunningTransactionData(Oid dbid)
{
/* result workspace */
static RunningTransactionsData CurrentRunningXactsData;
@ -2700,6 +2702,18 @@ GetRunningTransactionData(void)
if (!TransactionIdIsValid(xid))
continue;
/*
* Filter by database OID if requested.
*/
if (OidIsValid(dbid))
{
int pgprocno = arrayP->pgprocnos[index];
PGPROC *proc = &allProcs[pgprocno];
if (proc->databaseId != dbid)
continue;
}
/*
* Be careful not to exclude any xids before calculating the values of
* oldestRunningXid and suboverflowed, since these are used to clean
@ -2750,6 +2764,12 @@ GetRunningTransactionData(void)
PGPROC *proc = &allProcs[pgprocno];
int nsubxids;
/*
* Filter by database OID if requested.
*/
if (OidIsValid(dbid) && proc->databaseId != dbid)
continue;
/*
* Save subtransaction XIDs. Other backends can't add or remove
* entries while we're holding XidGenLock.
@ -2783,6 +2803,7 @@ GetRunningTransactionData(void)
* increases if slots do.
*/
CurrentRunningXacts->dbid = dbid;
CurrentRunningXacts->xcnt = count - subcount;
CurrentRunningXacts->subxcnt = subcount;
CurrentRunningXacts->subxid_status = suboverflowed ? SUBXIDS_IN_SUBTRANS : SUBXIDS_IN_ARRAY;

View file

@ -1188,6 +1188,14 @@ standby_redo(XLogReaderState *record)
xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
RunningTransactionsData running;
/*
* Records issued for a specific database are not suitable for physical
* replication, because physical replication affects the whole cluster. In
* particular, the list of XIDs is probably incomplete here.
*/
if (OidIsValid(xlrec->dbid))
return;
running.xcnt = xlrec->xcnt;
running.subxcnt = xlrec->subxcnt;
running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
@ -1277,11 +1285,22 @@ standby_redo(XLogReaderState *record)
* as there's no independent knob to just enable logical decoding. For
* details of how this is used, check snapbuild.c's introductory comment.
*
* If 'dbid' is valid, only gather transactions running in that
* database. snapbuild.c can use such running xacts information for faster
* startup, but it still needs normal (cluster-wide) records during the actual
* decoding - see standby_decode() and SnapBuildProcessRunningXacts() for
* details. Other processes (e.g. checkpointer) issue the cluster-wide records
* whether logical decoding is active or not.
*
* Please be careful about using this argument for other purposes. In
* particular, physical replication *must* ignore the database-specific
* records, exactly because they do not cover the whole cluster - see
* standby_redo().
*
* Returns the RecPtr of the last inserted record.
*/
XLogRecPtr
LogStandbySnapshot(void)
LogStandbySnapshot(Oid dbid)
{
XLogRecPtr recptr;
RunningTransactions running;
@ -1314,7 +1333,7 @@ LogStandbySnapshot(void)
* Log details of all in-progress transactions. This should be the last
* record we write, because standby will open up when it sees this.
*/
running = GetRunningTransactionData();
running = GetRunningTransactionData(dbid);
/*
* GetRunningTransactionData() acquired ProcArrayLock, we must release it.
@ -1358,6 +1377,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
xl_running_xacts xlrec;
XLogRecPtr recptr;
xlrec.dbid = CurrRunningXacts->dbid;
xlrec.xcnt = CurrRunningXacts->xcnt;
xlrec.subxcnt = CurrRunningXacts->subxcnt;
xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);

View file

@ -32,7 +32,7 @@
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD11E /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD11F /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{

View file

@ -310,6 +310,15 @@ extern void PreventCommandIfReadOnly(const char *cmdname);
extern void PreventCommandIfParallelMode(const char *cmdname);
extern void PreventCommandDuringRecovery(const char *cmdname);
/* in replication/snapbuild.c */
/*
* Keep track of whether logical decoding in this backend promised not to
* access shared catalogs, as a safety check. This is checked by genam.c when
* a catalog scan takes place to verify that no shared catalogs are accessed.
*/
extern bool accessSharedCatalogsInDecoding;
/*****************************************************************************
* pdir.h -- *
* POSTGRES directory path definitions. *

View file

@ -27,6 +27,7 @@ typedef struct OutputPluginOptions
{
OutputPluginOutputType output_type;
bool receive_rewrites;
bool need_shared_catalogs;
} OutputPluginOptions;
/*

View file

@ -92,7 +92,8 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
XLogRecPtr lsn,
xl_heap_new_cid *xlrec);
extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
xl_running_xacts *running);
xl_running_xacts *running,
bool db_specific);
extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
extern bool SnapBuildSnapshotExists(XLogRecPtr lsn);

View file

@ -47,7 +47,7 @@ extern bool ProcArrayInstallImportedXmin(TransactionId xmin,
VirtualTransactionId *sourcevxid);
extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc);
extern RunningTransactions GetRunningTransactionData(void);
extern RunningTransactions GetRunningTransactionData(Oid dbid);
extern bool TransactionIdIsInProgress(TransactionId xid);
extern TransactionId GetOldestNonRemovableTransactionId(Relation rel);

View file

@ -126,6 +126,7 @@ typedef enum
typedef struct RunningTransactionsData
{
Oid dbid; /* only track xacts in this database */
int xcnt; /* # of xact ids in xids[] */
int subxcnt; /* # of subxact ids in xids[] */
subxids_array_status subxid_status;
@ -143,7 +144,7 @@ typedef RunningTransactionsData *RunningTransactions;
extern void LogAccessExclusiveLock(Oid dbOid, Oid relOid);
extern void LogAccessExclusiveLockPrepare(void);
extern XLogRecPtr LogStandbySnapshot(void);
extern XLogRecPtr LogStandbySnapshot(Oid dbid);
extern void LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
bool relcacheInitFileInval);

View file

@ -46,6 +46,7 @@ typedef struct xl_standby_locks
*/
typedef struct xl_running_xacts
{
Oid dbid; /* only track xacts in this database */
int xcnt; /* # of xact ids in xids[] */
int subxcnt; /* # of subxact ids in xids[] */
bool subxid_overflow; /* snapshot overflowed, subxids missing */