Fix REPACK decoding worker not cleaned up on FATAL exit

When the launching backend of REPACK (CONCURRENTLY) is terminated via pg_terminate_backend(), ProcDiePending causes ereport(FATAL) which bypasses PG_FINALLY blocks. As a result, stop_repack_decoding_worker() is never called, leaving the decoding worker running indefinitely and holding its temporary replication slot. Fix by using PG_ENSURE_ERROR_CLEANUP, which handles both ERROR and FATAL exits. Author: Baji Shaik <baji.pgdev@gmail.com> Reviewed-by: Sami Imseih <samimseih@gmail.com> Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de> Discussion: https://postgr.es/m/CA+fm-RNoPxL2N7db_A0anMXV_aDu6jWj4PNOPtMtBUAPDPvSXQ@mail.gmail.com
2026-06-10 17:20:31 -04:00 · 2026-05-19 11:37:46 -07:00 · 2026-05-19 11:37:46 -07:00 · 0160143ad9
commit 0160143ad9
parent 83df16f1fa
1 changed files with 25 additions and 17 deletions
--- a/src/backend/commands/repack.c
+++ b/src/backend/commands/repack.c
@ -64,6 +64,7 @@
 #include "pgstat.h"
 #include "replication/logicalrelation.h"
 #include "storage/bufmgr.h"
+#include "storage/ipc.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@ -211,6 +212,7 @@ static Oid	determine_clustered_index(Relation rel, bool usingindex,

 static void start_repack_decoding_worker(Oid relid);
 static void stop_repack_decoding_worker(void);
+static void stop_repack_decoding_worker_cb(int code, Datum arg);
 static Snapshot get_initial_snapshot(DecodingWorker *worker);

 static void ProcessRepackMessage(StringInfo msg);
@ -666,27 +668,26 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
 	if (!concurrent)
 		TransferPredicateLocksToHeapRelation(OldHeap);

-	/* rebuild_relation does all the dirty work */
-	PG_TRY();
+	/*
+	 * rebuild_relation does all the dirty work, and closes OldHeap and index,
+	 * if valid.
+	 *
+	 * In concurrent mode, make sure the worker terminates; normally it does
+	 * so by itself, but a PG_ENSURE_ERROR_CLEANUP callback ensures that this
+	 * happens even in case this backend dies early on a FATAL exit.  Normal
+	 * mode doesn't need that overhead.
+	 */
+	if (concurrent)
 	{
-		rebuild_relation(OldHeap, index, verbose, ident_idx);
-	}
-	PG_FINALLY();
-	{
-		if (concurrent)
+		PG_ENSURE_ERROR_CLEANUP(stop_repack_decoding_worker_cb, 0);
 		{
-			/*
-			 * Since during normal operation the worker was already asked to
-			 * exit, stopping it explicitly is especially important on ERROR.
-			 * However it still seems a good practice to make sure that the
-			 * worker never survives the REPACK command.
-			 */
-			stop_repack_decoding_worker();
+			rebuild_relation(OldHeap, index, verbose, ident_idx);
 		}
+		PG_END_ENSURE_ERROR_CLEANUP(stop_repack_decoding_worker_cb, 0);
+		stop_repack_decoding_worker();
 	}
-	PG_END_TRY();
-
-	/* rebuild_relation closes OldHeap, and index if valid */
+	else
+		rebuild_relation(OldHeap, index, verbose, ident_idx);

 out:
 	/* Roll back any GUC changes executed by index functions */
@ -3534,6 +3535,13 @@ stop_repack_decoding_worker(void)
 	decoding_worker = NULL;
 }

+/* stop_repack_decoding_worker, wrapped as a before_shmem_exit callback */
+static void
+stop_repack_decoding_worker_cb(int code, Datum arg)
+{
+	stop_repack_decoding_worker();
+}
+
 /*
 * Get the initial snapshot from the decoding worker.
 */