In immediate shutdown, postmaster should not exit till children are gone.

This adjusts commit 82233ce7ea42d6ba519aaec63008aff49da6c7af so that the postmaster does not exit until all its child processes have exited, even if the 5-second timeout elapses and we have to send SIGKILL. There is no great value in having the postmaster process quit sooner, and doing so can mislead onlookers into thinking that the cluster is fully terminated when actually some child processes still survive. This effect might explain recent test failures on buildfarm member hamster, wherein we failed to restart a cluster just after shutting it down with "pg_ctl stop -m immediate". I also did a bit of code review/beautification, including fixing a faulty use of the Max() macro on a volatile expression. Back-patch to 9.4. In older branches, the postmaster never waited for children to exit during immediate shutdowns, and changing that would be too much of a behavioral change.
author: Tom Lane <tgl@sss.pgh.pa.us> 2015-06-19 14:23:39 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2015-06-19 14:23:39 -0400
commit: 48913db887e6a41fa3f1b6cdf80ee89e38f21d9d (patch)
tree: 9961e708c2972a39ae1274d96fa2d8e2de408ed9 /src
parent: da1a9d0f5bed1f93908be9233a4fef39b988e505 (diff)
1 files changed, 12 insertions, 15 deletions
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 324bf7aad14..1757b4df37e 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -324,8 +324,10 @@ typedef enum
 
 static PMState pmState = PM_INIT;
 
-/* Start time of abort processing at immediate shutdown or child crash */
-static time_t AbortStartTime;
+/* Start time of SIGKILL timeout during immediate shutdown or child crash */
+/* Zero means timeout is not running */
+static time_t AbortStartTime = 0;
+/* Length of said timeout */
 #define SIGKILL_CHILDREN_AFTER_SECS		5
 
 static bool ReachedNormalRunning = false;		/* T if we've reached PM_RUN */
@@ -1419,7 +1421,8 @@ checkDataDir(void)
  * In normal conditions we wait at most one minute, to ensure that the other
  * background tasks handled by ServerLoop get done even when no requests are
  * arriving.  However, if there are background workers waiting to be started,
- * we don't actually sleep so that they are quickly serviced.
+ * we don't actually sleep so that they are quickly serviced.  Other exception
+ * cases are as shown in the code.
  */
 static void
 DetermineSleepTime(struct timeval * timeout)
@@ -1433,11 +1436,12 @@ DetermineSleepTime(struct timeval * timeout)
 	if (Shutdown > NoShutdown ||
 		(!StartWorkerNeeded && !HaveCrashedWorker))
 	{
-		if (AbortStartTime > 0)
+		if (AbortStartTime != 0)
 		{
 			/* time left to abort; clamp to 0 in case it already expired */
-			timeout->tv_sec = Max(SIGKILL_CHILDREN_AFTER_SECS -
-								  (time(NULL) - AbortStartTime), 0);
+			timeout->tv_sec = SIGKILL_CHILDREN_AFTER_SECS -
+				(time(NULL) - AbortStartTime);
+			timeout->tv_sec = Max(timeout->tv_sec, 0);
 			timeout->tv_usec = 0;
 		}
 		else
@@ -1707,20 +1711,13 @@ ServerLoop(void)
 		 * Note we also do this during recovery from a process crash.
 		 */
 		if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) &&
-			AbortStartTime > 0 &&
-			now - AbortStartTime >= SIGKILL_CHILDREN_AFTER_SECS)
+			AbortStartTime != 0 &&
+			(now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS)
 		{
 			/* We were gentle with them before. Not anymore */
 			TerminateChildren(SIGKILL);
 			/* reset flag so we don't SIGKILL again */
 			AbortStartTime = 0;
-
-			/*
-			 * Additionally, unless we're recovering from a process crash,
-			 * it's now the time for postmaster to abandon ship.
-			 */
-			if (!FatalError)
-				ExitPostmaster(1);
 		}
 	}
 }
author	Tom Lane <tgl@sss.pgh.pa.us>	2015-06-19 14:23:39 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2015-06-19 14:23:39 -0400
commit	48913db887e6a41fa3f1b6cdf80ee89e38f21d9d (patch)
tree	9961e708c2972a39ae1274d96fa2d8e2de408ed9 /src
parent	da1a9d0f5bed1f93908be9233a4fef39b988e505 (diff)