diff options
| author | Tom Lane <tgl@sss.pgh.pa.us> | 2008-01-11 00:54:09 +0000 | 
|---|---|---|
| committer | Tom Lane <tgl@sss.pgh.pa.us> | 2008-01-11 00:54:09 +0000 | 
| commit | e6a442c71b30f62e7b5eee6058afc961b1c7f29b (patch) | |
| tree | 70303a4a11ef5ce38272e57cefa33c5abe8c90ec /src/backend/postmaster/pgarch.c | |
| parent | 21a00dc6fd8ce606dee278106988db6edb0a4ccb (diff) | |
Restructure the shutdown procedure for the archiver process to allow it to
finish archiving everything (when there's no error), and to eliminate various
hazards as best we can.  This fixes a previous 8.3 patch that caused the
postmaster to kill and then restart the archiver during shutdown (!?).
The new behavior is that the archiver is allowed to run unmolested until
the bgwriter has exited; then it is sent SIGUSR2 to tell it to do a final
archiving cycle and quit.  We only SIGQUIT the archiver if we want a panic
stop; this is important since SIGQUIT will also be sent to any active
archive_command.  The postmaster also now doesn't SIGQUIT the stats collector
until the bgwriter is done, since the bgwriter can send stats messages in 8.3.
The postmaster will not exit until both the archiver and stats collector are
gone; this provides some defense (not too bulletproof) against conflicting
archiver or stats collector processes being started by a new postmaster
instance.  We continue the prior practice that the archiver will check
for postmaster death immediately before issuing any archive_command; that
gives some additional protection against conflicting archivers.
Also, modify the archiver process to notice SIGTERM and refuse to issue any
more archive commands once it has received one.  The postmaster doesn't ever send it
SIGTERM; we assume that any such signal came from init and is a notice of
impending whole-system shutdown.  In this situation it seems imprudent to try
to start new archive commands --- if they aren't extremely quick they're
likely to get SIGKILL'd by init.
All per discussion.
Diffstat (limited to 'src/backend/postmaster/pgarch.c')
| -rw-r--r-- | src/backend/postmaster/pgarch.c | 91 | 
1 file changed, 75 insertions, 16 deletions
| diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 6cb32fcb601..e181950c0fe 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -19,7 +19,7 @@   *   *   * IDENTIFICATION - *	  $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.37 2008/01/01 19:45:51 momjian Exp $ + *	  $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.38 2008/01/11 00:54:08 tgl Exp $   *   *-------------------------------------------------------------------------   */ @@ -77,12 +77,15 @@   * ----------   */  static time_t last_pgarch_start_time; +static time_t last_sigterm_time = 0;  /*   * Flags set by interrupt handlers for later service in the main loop.   */  static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t got_SIGTERM = false;  static volatile sig_atomic_t wakened = false; +static volatile sig_atomic_t ready_to_stop = false;  /* ----------   * Local function forward declarations @@ -95,7 +98,9 @@ static pid_t pgarch_forkexec(void);  NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]);  static void pgarch_exit(SIGNAL_ARGS);  static void ArchSigHupHandler(SIGNAL_ARGS); +static void ArchSigTermHandler(SIGNAL_ARGS);  static void pgarch_waken(SIGNAL_ARGS); +static void pgarch_waken_stop(SIGNAL_ARGS);  static void pgarch_MainLoop(void);  static void pgarch_ArchiverCopyLoop(void);  static bool pgarch_archiveXlog(char *xlog); @@ -236,16 +241,16 @@ PgArchiverMain(int argc, char *argv[])  	/*  	 * Ignore all signals usually bound to some action in the postmaster, -	 * except for SIGHUP, SIGUSR1 and SIGQUIT. +	 * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.  	 
*/  	pqsignal(SIGHUP, ArchSigHupHandler);  	pqsignal(SIGINT, SIG_IGN); -	pqsignal(SIGTERM, SIG_IGN); +	pqsignal(SIGTERM, ArchSigTermHandler);  	pqsignal(SIGQUIT, pgarch_exit);  	pqsignal(SIGALRM, SIG_IGN);  	pqsignal(SIGPIPE, SIG_IGN);  	pqsignal(SIGUSR1, pgarch_waken); -	pqsignal(SIGUSR2, SIG_IGN); +	pqsignal(SIGUSR2, pgarch_waken_stop);  	pqsignal(SIGCHLD, SIG_DFL);  	pqsignal(SIGTTIN, SIG_DFL);  	pqsignal(SIGTTOU, SIG_DFL); @@ -267,28 +272,47 @@ PgArchiverMain(int argc, char *argv[])  static void  pgarch_exit(SIGNAL_ARGS)  { -	/* -	 * For now, we just nail the doors shut and get out of town.  It might -	 * seem cleaner to finish up any pending archive copies, but there's a -	 * nontrivial risk that init will kill us partway through. -	 */ -	exit(0); +	/* SIGQUIT means curl up and die ... */ +	exit(1);  } -/* SIGHUP: set flag to re-read config file at next convenient time */ +/* SIGHUP signal handler for archiver process */  static void  ArchSigHupHandler(SIGNAL_ARGS)  { +	/* set flag to re-read config file at next convenient time */  	got_SIGHUP = true;  } +/* SIGTERM signal handler for archiver process */ +static void +ArchSigTermHandler(SIGNAL_ARGS) +{ +	/* +	 * The postmaster never sends us SIGTERM, so we assume that this means +	 * that init is trying to shut down the whole system.  If we hang around +	 * too long we'll get SIGKILL'd.  Set flag to prevent starting any more +	 * archive commands. 
+	 */ +	got_SIGTERM = true; +} +  /* SIGUSR1 signal handler for archiver process */  static void  pgarch_waken(SIGNAL_ARGS)  { +	/* set flag that there is work to be done */  	wakened = true;  } +/* SIGUSR2 signal handler for archiver process */ +static void +pgarch_waken_stop(SIGNAL_ARGS) +{ +	/* set flag to do a final cycle and shut down afterwards */ +	ready_to_stop = true; +} +  /*   * pgarch_MainLoop   * @@ -298,6 +322,7 @@ static void  pgarch_MainLoop(void)  {  	time_t		last_copy_time = 0; +	bool		time_to_stop;  	/*  	 * We run the copy loop immediately upon entry, in case there are @@ -309,6 +334,9 @@ pgarch_MainLoop(void)  	do  	{ +		/* When we get SIGUSR2, we do one more archive cycle, then exit */ +		time_to_stop = ready_to_stop; +  		/* Check for config update */  		if (got_SIGHUP)  		{ @@ -316,8 +344,26 @@ pgarch_MainLoop(void)  			ProcessConfigFile(PGC_SIGHUP);  		} +		/* +		 * If we've gotten SIGTERM, we normally just sit and do nothing until +		 * SIGUSR2 arrives.  However, that means a random SIGTERM would +		 * disable archiving indefinitely, which doesn't seem like a good +		 * idea.  If more than 60 seconds pass since SIGTERM, exit anyway, +		 * so that the postmaster can start a new archiver if needed. +		 */ +		if (got_SIGTERM) +		{ +			time_t		curtime = time(NULL); + +			if (last_sigterm_time == 0) +				last_sigterm_time = curtime; +			else if ((unsigned int) (curtime - last_sigterm_time) >= +					 (unsigned int) 60) +				break; +		} +  		/* Do what we're here for */ -		if (wakened) +		if (wakened || time_to_stop)  		{  			wakened = false;  			pgarch_ArchiverCopyLoop(); @@ -334,7 +380,8 @@ pgarch_MainLoop(void)  		 * sleep into 1-second increments, and check for interrupts after each  		 * nap.  		 
*/ -		while (!(wakened || got_SIGHUP)) +		while (!(wakened || ready_to_stop || got_SIGHUP || +				 !PostmasterIsAlive(true)))  		{  			time_t		curtime; @@ -344,7 +391,13 @@ pgarch_MainLoop(void)  				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)  				wakened = true;  		} -	} while (PostmasterIsAlive(true)); + +		/* +		 * The archiver quits either when the postmaster dies (not expected) +		 * or after completing one more archiving cycle after receiving +		 * SIGUSR2. +		 */ +	} while (PostmasterIsAlive(true) && !time_to_stop);  }  /* @@ -377,8 +430,14 @@ pgarch_ArchiverCopyLoop(void)  		for (;;)  		{ -			/* Abandon processing if we notice our postmaster has died */ -			if (!PostmasterIsAlive(true)) +			/* +			 * Do not initiate any more archive commands after receiving +			 * SIGTERM, nor after the postmaster has died unexpectedly. +			 * The first condition is to try to keep from having init +			 * SIGKILL the command, and the second is to avoid conflicts +			 * with another archiver spawned by a newer postmaster. +			 */ +			if (got_SIGTERM || !PostmasterIsAlive(true))  				return;  			if (pgarch_archiveXlog(xlog)) | 
