From e41d954da6aa04f0a4453e566d3bcc064512d457 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Wed, 24 Sep 2025 04:11:53 +0000 Subject: Fix LOCK_TIMEOUT handling during parallel apply. Previously, the parallel apply worker used SIGINT to receive a graceful shutdown signal from the leader apply worker. However, SIGINT is also used by the LOCK_TIMEOUT handler to trigger a query-cancel interrupt. This overlap caused the parallel apply worker to miss LOCK_TIMEOUT signals, leading to incorrect behavior during lock wait/contention. This patch resolves the conflict by switching the graceful shutdown signal from SIGINT to SIGUSR2. Reported-by: Zane Duffield Diagnosed-by: Zhijie Hou Author: Hayato Kuroda Reviewed-by: Amit Kapila Backpatch-through: 16, where it was introduced Discussion: https://postgr.es/m/CACMiCkXyC4au74kvE2g6Y=mCEF8X6r-Ne_ty4r7qWkUjRE4+oQ@mail.gmail.com --- src/backend/postmaster/interrupt.c | 5 ++--- src/backend/replication/logical/applyparallelworker.c | 17 ++++++++++++----- src/backend/replication/logical/launcher.c | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c index 0ae9bf906ec..ba63b84dfc5 100644 --- a/src/backend/postmaster/interrupt.c +++ b/src/backend/postmaster/interrupt.c @@ -94,9 +94,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS) * shut down and exit. * * Typically, this handler would be used for SIGTERM, but some processes use - * other signals. In particular, the checkpointer exits on SIGUSR2, and the WAL - * writer and the logical replication parallel apply worker exits on either - * SIGINT or SIGTERM. + * other signals. In particular, the checkpointer and parallel apply worker + * exit on SIGUSR2, and the WAL writer exits on either SIGINT or SIGTERM. * * ShutdownRequestPending should be checked at a convenient place within the * main loop, or else the main loop should call ProcessMainLoopInterrupts. diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index 31a92d1a24a..33b7ec7f029 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -870,10 +870,17 @@ ParallelApplyWorkerMain(Datum main_arg) InitializingApplyWorker = true; - /* Setup signal handling. */ + /* + * Setup signal handling. + * + * Note: We intentionally used SIGUSR2 to trigger a graceful shutdown + * initiated by the leader apply worker. This helps to differentiate it + * from the case where we abort the current transaction and exit on + * receiving SIGTERM. + */ pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGINT, SignalHandlerForShutdownRequest); pqsignal(SIGTERM, die); + pqsignal(SIGUSR2, SignalHandlerForShutdownRequest); BackgroundWorkerUnblockSignals(); /* @@ -972,9 +979,9 @@ ParallelApplyWorkerMain(Datum main_arg) /* * The parallel apply worker must not get here because the parallel apply - * worker will only stop when it receives a SIGTERM or SIGINT from the - * leader, or when there is an error. None of these cases will allow the - * code to reach here. + * worker will only stop when it receives a SIGTERM or SIGUSR2 from the + * leader, or SIGINT from itself, or when there is an error. None of these + * cases will allow the code to reach here. */ Assert(false); } diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index c900b6cf3b1..218cefe86e2 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -650,7 +650,7 @@ logicalrep_worker_stop(Oid subid, Oid relid) /* * Stop the given logical replication parallel apply worker. * - * Node that the function sends SIGINT instead of SIGTERM to the parallel apply + * Node that the function sends SIGUSR2 instead of SIGTERM to the parallel apply * worker so that the worker exits cleanly. */ void @@ -688,7 +688,7 @@ logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo) * Only stop the worker if the generation matches and the worker is alive. */ if (worker->generation == generation && worker->proc) - logicalrep_worker_stop_internal(worker, SIGINT); + logicalrep_worker_stop_internal(worker, SIGUSR2); LWLockRelease(LogicalRepWorkerLock); } -- cgit v1.2.3