summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Eisentraut <peter@eisentraut.org>2024-06-17 09:42:51 +0200
committerPeter Eisentraut <peter@eisentraut.org>2024-06-17 09:42:51 +0200
commit04c8634c0c4d636540c9283efdd695558403dc4e (patch)
tree244e8d12bb0393825590f359f3c4e455de556ff3
parent8f1888eb6d6023b80525fbf7a8a1053daa6eb31e (diff)
pg_createsubscriber: Only --recovery-timeout controls the end of recovery process
It used to check if the target server is connected to the primary server (send required WAL) to rapidly react when the process won't succeed. This code is not enough to guarantee that the recovery process will complete. There is a window between the walreceiver shutdown and the pg_is_in_recovery() returns false that can reach NUM_CONN_ATTEMPTS attempts and fails. Instead, rely only on the --recovery-timeout option to give up the process after the specified number of seconds. This should help with buildfarm failures on slow machines. Author: Euler Taveira <euler.taveira@enterprisedb.com> Reviewed-by: Hayato Kuroda <kuroda.hayato@fujitsu.com> Discussion: https://www.postgresql.org/message-id/776c5cac-5ef5-4001-b1bc-5b698bc0c62a%40app.fastmail.com
-rw-r--r--doc/src/sgml/ref/pg_createsubscriber.sgml7
-rw-r--r--src/bin/pg_basebackup/pg_createsubscriber.c29
-rw-r--r--src/bin/pg_basebackup/t/040_pg_createsubscriber.pl2
3 files changed, 5 insertions, 33 deletions
diff --git a/doc/src/sgml/ref/pg_createsubscriber.sgml b/doc/src/sgml/ref/pg_createsubscriber.sgml
index 142bffff023..a700697f887 100644
--- a/doc/src/sgml/ref/pg_createsubscriber.sgml
+++ b/doc/src/sgml/ref/pg_createsubscriber.sgml
@@ -326,13 +326,6 @@ PostgreSQL documentation
</para>
<para>
- During the recovery process, if the target server disconnects from the
- source server, <application>pg_createsubscriber</application> will check a
- few times if the connection has been reestablished to stream the required
- WAL. After a few attempts, it terminates with an error.
- </para>
-
- <para>
Since DDL commands are not replicated by logical replication, avoid
executing DDL commands that change the database schema while running
<application>pg_createsubscriber</application>. If the target server has
diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c
index 90cc580811d..f62f34b1a76 100644
--- a/src/bin/pg_basebackup/pg_createsubscriber.c
+++ b/src/bin/pg_basebackup/pg_createsubscriber.c
@@ -1360,6 +1360,9 @@ stop_standby_server(const char *datadir)
*
* If recovery_timeout option is set, terminate abnormally without finishing
* the recovery process. By default, it waits forever.
+ *
+ * XXX Is the recovery process still in progress? When recovery process has a
+ * better progress reporting mechanism, it should be added here.
*/
static void
wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions *opt)
@@ -1367,9 +1370,6 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions
PGconn *conn;
int status = POSTMASTER_STILL_STARTING;
int timer = 0;
- int count = 0; /* number of consecutive connection attempts */
-
-#define NUM_CONN_ATTEMPTS 10
pg_log_info("waiting for the target server to reach the consistent state");
@@ -1377,7 +1377,6 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions
for (;;)
{
- PGresult *res;
bool in_recovery = server_is_in_recovery(conn);
/*
@@ -1391,28 +1390,6 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions
break;
}
- /*
- * If it is still in recovery, make sure the target server is
- * connected to the primary so it can receive the required WAL to
- * finish the recovery process. If it is disconnected try
- * NUM_CONN_ATTEMPTS in a row and bail out if not succeed.
- */
- res = PQexec(conn,
- "SELECT 1 FROM pg_catalog.pg_stat_wal_receiver");
- if (PQntuples(res) == 0)
- {
- if (++count > NUM_CONN_ATTEMPTS)
- {
- stop_standby_server(subscriber_dir);
- pg_log_error("standby server disconnected from the primary");
- break;
- }
- }
- else
- count = 0; /* reset counter if it connects again */
-
- PQclear(res);
-
/* Bail out after recovery_timeout seconds if this option is set */
if (opt->recovery_timeout > 0 && timer >= opt->recovery_timeout)
{
diff --git a/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl b/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl
index 2b883e69104..a677fefa94f 100644
--- a/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl
+++ b/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl
@@ -264,6 +264,7 @@ $node_p->restart;
command_ok(
[
'pg_createsubscriber', '--verbose',
+ '--recovery-timeout', "$PostgreSQL::Test::Utils::timeout_default",
'--dry-run', '--pgdata',
$node_s->data_dir, '--publisher-server',
$node_p->connstr('pg1'), '--socket-directory',
@@ -301,6 +302,7 @@ command_ok(
command_ok(
[
'pg_createsubscriber', '--verbose',
+ '--recovery-timeout', "$PostgreSQL::Test::Utils::timeout_default",
'--verbose', '--pgdata',
$node_s->data_dir, '--publisher-server',
$node_p->connstr('pg1'), '--socket-directory',