summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Amit Kapila <akapila@postgresql.org> 2025-08-19 05:33:17 +0000
committer Amit Kapila <akapila@postgresql.org> 2025-08-19 05:33:17 +0000
commit aa21e49225a1b4f8465dee5a9410e52b5a889f90 (patch)
tree 1ae4cdeb8e8556fd6d8403a87bdaac652b30195c /src
parent a977e419ee6ee15cb7bd45d7c9b7540cf183d1e2 (diff)
Fix self-deadlock during DROP SUBSCRIPTION.
The DROP SUBSCRIPTION command performs several operations: it stops the subscription workers, removes subscription-related entries from system catalogs, and deletes the replication slot on the publisher server. Previously, this command acquired an AccessExclusiveLock on pg_subscription before initiating these steps.

However, while holding this lock, the command attempts to connect to the publisher to remove the replication slot. In cases where the connection is made to a newly created database on the same server as the subscriber, the cache-building process during connection tries to acquire an AccessShareLock on pg_subscription, resulting in a self-deadlock.

To resolve this issue, we reduce the lock level on pg_subscription during DROP SUBSCRIPTION from AccessExclusiveLock to RowExclusiveLock. Earlier, the higher lock level was used to prevent the launcher from starting a new worker during the drop operation, as a restarted worker could become orphaned.

Now, instead of relying on a strict lock, we acquire an AccessShareLock on the specific subscription being dropped and re-validate its existence after acquiring the lock. If the subscription is no longer valid, the worker exits gracefully. This approach avoids the deadlock while still ensuring that orphan workers are not created.

Reported-by: Alexander Lakhin <exclusion@gmail.com>
Author: Dilip Kumar <dilipbalaut@gmail.com>
Reviewed-by: vignesh C <vignesh21@gmail.com>
Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
Backpatch-through: 13
Discussion: https://postgr.es/m/18988-7312c868be2d467f@postgresql.org
Diffstat (limited to 'src')
-rw-r--r-- src/backend/commands/subscriptioncmds.c 8
-rw-r--r-- src/backend/replication/logical/worker.c 7
-rw-r--r-- src/test/subscription/t/100_bugs.pl 30
3 files changed, 42 insertions, 3 deletions
diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c
index faa3650d287..4c01d21b2f3 100644
--- a/src/backend/commands/subscriptioncmds.c
+++ b/src/backend/commands/subscriptioncmds.c
@@ -1803,10 +1803,12 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel)
bool must_use_password;
/*
- * Lock pg_subscription with AccessExclusiveLock to ensure that the
- * launcher doesn't restart new worker during dropping the subscription
+ * The launcher may concurrently start a new worker for this subscription.
+ * During initialization, the worker checks for subscription validity and
+ * exits if the subscription has already been dropped. See
+ * InitializeLogRepWorker.
*/
- rel = table_open(SubscriptionRelationId, AccessExclusiveLock);
+ rel = table_open(SubscriptionRelationId, RowExclusiveLock);
tup = SearchSysCache2(SUBSCRIPTIONNAME, ObjectIdGetDatum(MyDatabaseId),
CStringGetDatum(stmt->subname));
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index 8e343873454..22ad9051db3 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -5415,6 +5415,13 @@ InitializeLogRepWorker(void)
StartTransactionCommand();
oldctx = MemoryContextSwitchTo(ApplyContext);
+ /*
+ * Lock the subscription to prevent it from being concurrently dropped,
+ * then re-verify its existence. After the initialization, the worker will
+ * be terminated gracefully if the subscription is dropped.
+ */
+ LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, 0,
+ AccessShareLock);
MySubscription = GetSubscription(MyLogicalRepWorker->subid, true);
if (!MySubscription)
{
diff --git a/src/test/subscription/t/100_bugs.pl b/src/test/subscription/t/100_bugs.pl
index 5e357701183..50223054918 100644
--- a/src/test/subscription/t/100_bugs.pl
+++ b/src/test/subscription/t/100_bugs.pl
@@ -575,4 +575,34 @@ is($result, 't',
$node_publisher->stop('fast');
$node_subscriber->stop('fast');
+# BUG #18988
+# The bug happened due to a self-deadlock between the DROP SUBSCRIPTION
+# command and the walsender process for accessing pg_subscription. This
+# occurred when DROP SUBSCRIPTION attempted to remove a replication slot by
+# connecting to a newly created database whose caches are not yet
+# initialized.
+#
+# The bug is fixed by reducing the lock-level during DROP SUBSCRIPTION.
+$node_publisher->start();
+
+$publisher_connstr = $node_publisher->connstr . ' dbname=regress_db';
+$node_publisher->safe_psql(
+ 'postgres', qq(
+ CREATE DATABASE regress_db;
+ CREATE SUBSCRIPTION regress_sub1 CONNECTION '$publisher_connstr' PUBLICATION regress_pub WITH (connect=false);
+));
+
+my ($ret, $stdout, $stderr) =
+ $node_publisher->psql('postgres', q{DROP SUBSCRIPTION regress_sub1});
+
+isnt($ret, 0, "replication slot does not exist: exit code not 0");
+like(
+ $stderr,
+ qr/ERROR: could not drop replication slot "regress_sub1" on publisher/,
+ "could not drop replication slot: error message");
+
+$node_publisher->safe_psql('postgres', "DROP DATABASE regress_db");
+
+$node_publisher->stop('fast');
+
done_testing();