4 files changed, 209 insertions, 15 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0e4a8ac800a..8e051fc9b46 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -65,7 +65,7 @@
 #define BUF_WRITTEN				0x01
 #define BUF_REUSABLE			0x02
 
-#define DROP_RELS_BSEARCH_THRESHOLD		20
+#define RELS_BSEARCH_THRESHOLD		20
 
 typedef struct PrivateRefCountEntry
 {
@@ -104,6 +104,19 @@ typedef struct CkptTsStatus
 	int			index;
 } CkptTsStatus;
 
+/*
+ * Type for array used to sort SMgrRelations
+ *
+ * FlushRelationsAllBuffers shares the same comparator function with
+ * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
+ * compatible.
+ */
+typedef struct SMgrSortArray
+{
+	RelFileNode rnode;			/* This must be the first member */
+	SMgrRelation srel;
+} SMgrSortArray;
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -2977,7 +2990,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 	 * an exactly determined value, as it depends on many factors (CPU and RAM
 	 * speeds, amount of shared buffers etc.).
 	 */
-	use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
+	use_bsearch = n > RELS_BSEARCH_THRESHOLD;
 
 	/* sort the list of rnodes if necessary */
 	if (use_bsearch)
@@ -3228,6 +3241,104 @@ FlushRelationBuffers(Relation rel)
 }
 
 /* ---------------------------------------------------------------------
+ *		FlushRelationsAllBuffers
+ *
+ *		This function flushes out of the buffer pool all the pages of all
+ *		forks of the specified smgr relations.  It's equivalent to calling
+ *		FlushRelationBuffers once per fork per relation.  The relations are
+ *		assumed not to use local buffers.
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
+{
+	int			i;
+	SMgrSortArray *srels;
+	bool		use_bsearch;
+
+	if (nrels == 0)
+		return;
+
+	/* fill-in array for qsort */
+	srels = palloc(sizeof(SMgrSortArray) * nrels);
+
+	for (i = 0; i < nrels; i++)
+	{
+		Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
+
+		srels[i].rnode = smgrs[i]->smgr_rnode.node;
+		srels[i].srel = smgrs[i];
+	}
+
+	/*
+	 * Save the bsearch overhead for low number of relations to sync. See
+	 * DropRelFileNodesAllBuffers for details.
+	 */
+	use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
+
+	/* sort the list of SMgrRelations if necessary */
+	if (use_bsearch)
+		pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
+
+	/* Make sure we can handle the pin inside the loop */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		SMgrSortArray *srelent = NULL;
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		/*
+		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+		 * and saves some cycles.
+		 */
+
+		if (!use_bsearch)
+		{
+			int			j;
+
+			for (j = 0; j < nrels; j++)
+			{
+				if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
+				{
+					srelent = &srels[j];
+					break;
+				}
+			}
+
+		}
+		else
+		{
+			srelent = bsearch((const void *) &(bufHdr->tag.rnode),
+							  srels, nrels, sizeof(SMgrSortArray),
+							  rnode_comparator);
+		}
+
+		/* buffer doesn't belong to any of the given relfilenodes; skip it */
+		if (srelent == NULL)
+			continue;
+
+		ReservePrivateRefCountEntry();
+
+		buf_state = LockBufHdr(bufHdr);
+		if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
+			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+		{
+			PinBuffer_Locked(bufHdr);
+			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+			FlushBuffer(bufHdr, srelent->srel);
+			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+			UnpinBuffer(bufHdr, true);
+		}
+		else
+			UnlockBufHdr(bufHdr, buf_state);
+	}
+
+	pfree(srels);
+}
+
+/* ---------------------------------------------------------------------
  *		FlushDatabaseBuffers
  *
  *		This function writes all dirty pages of a database out to disk
@@ -3428,13 +3539,15 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 			(pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
 		{
 			/*
-			 * If we're in recovery we cannot dirty a page because of a hint.
-			 * We can set the hint, just not dirty the page as a result so the
-			 * hint is lost when we evict the page or shutdown.
+			 * If we must not write WAL, due to a relfilenode-specific
+			 * condition or being in recovery, don't dirty the page.  We can
+			 * set the hint, just not dirty the page as a result so the hint
+			 * is lost when we evict the page or shutdown.
 			 *
 			 * See src/backend/storage/page/README for longer discussion.
 			 */
-			if (RecoveryInProgress())
+			if (RecoveryInProgress() ||
+				RelFileNodeSkippingWAL(bufHdr->tag.rnode))
 				return;
 
 			/*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 98a2f366b4d..66ed9f44d41 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -563,6 +563,18 @@ DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
 	return false;
 }
 
+#ifdef USE_ASSERT_CHECKING
+/*
+ * GetLockMethodLocalHash -- return the hash of local locks, for modules that
+ *		evaluate assertions based on all locks held.
+ */
+HTAB *
+GetLockMethodLocalHash(void)
+{
+	return LockMethodLocalHash;
+}
+#endif
+
 /*
  * LockHasWaiters -- look up 'locktag' and check if releasing this
  *		lock would wake up other processes waiting for it.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 301a24bd66c..a25d07a6f76 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -359,11 +359,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
  * During replay, we would delete the file and then recreate it, which is fine
  * if the contents of the file were repopulated by subsequent WAL entries.
  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
- * file after populating it (as for instance CLUSTER and CREATE INDEX do),
- * the contents of the file would be lost forever.  By leaving the empty file
- * until after the next checkpoint, we prevent reassignment of the relfilenode
- * number until it's safe, because relfilenode assignment skips over any
- * existing file.
+ * file after populating it (as we do at wal_level=minimal), the contents of
+ * the file would be lost forever.  By leaving the empty file until after the
+ * next checkpoint, we prevent reassignment of the relfilenode number until
+ * it's safe, because relfilenode assignment skips over any existing file.
  *
  * We do not need to go through this dance for temp relations, though, because
  * we never make WAL entries for temp rels, and so a temp rel poses no threat
@@ -1019,12 +1018,19 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *	mdimmedsync() -- Immediately sync a relation to stable storage.
  *
  * Note that only writes already issued are synced; this routine knows
- * nothing of dirty buffers that may exist inside the buffer manager.
+ * nothing of dirty buffers that may exist inside the buffer manager.  We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL.  Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive.  If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
  */
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
 	MdfdVec    *v;
+	BlockNumber segno = 0;
+	bool		active = true;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1034,14 +1040,42 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 
 	v = mdopen(reln, forknum, EXTENSION_FAIL);
 
+	/*
+	 * Temporarily open inactive segments, then close them after sync.  There
+	 * may be some inactive segments left opened after fsync() error, but that
+	 * is harmless.  We don't bother to clean them up and take a risk of
+	 * further trouble.  The next mdclose() will soon close them.
+	 */
 	while (v != NULL)
 	{
-		if (FileSync(v->mdfd_vfd) < 0)
+		File		vfd = v->mdfd_vfd;
+
+		if (active)
+			v = v->mdfd_chain;
+		else
+		{
+			Assert(v->mdfd_chain == NULL);
+			pfree(v);
+			v = NULL;
+		}
+
+		if (FileSync(vfd) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
-							FilePathName(v->mdfd_vfd))));
-		v = v->mdfd_chain;
+							FilePathName(vfd))));
+
+		/* Close inactive segments immediately */
+		if (!active)
+			FileClose(vfd);
+
+		segno++;
+
+		if (v == NULL)
+		{
+			v = _mdfd_openseg(reln, forknum, segno, 0);
+			active = false;
+		}
 	}
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 97119f2063f..e5383722a65 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -409,6 +409,41 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
 }
 
 /*
+ *	smgrdosyncall() -- Immediately sync all forks of all given relations
+ *
+ *		All forks of all given relations are synced out to the store.
+ *
+ *		This is equivalent to FlushRelationBuffers() for each smgr relation,
+ *		then calling smgrimmedsync() for all forks of each relation, but it's
+ *		significantly quicker so should be preferred when possible.
+ */
+void
+smgrdosyncall(SMgrRelation *rels, int nrels)
+{
+	int			i = 0;
+	ForkNumber	forknum;
+
+	if (nrels == 0)
+		return;
+
+	FlushRelationsAllBuffers(rels, nrels);
+
+	/*
+	 * Sync the physical file(s).
+	 */
+	for (i = 0; i < nrels; i++)
+	{
+		int			which = rels[i]->smgr_which;
+
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			if (smgrsw[which].smgr_exists(rels[i], forknum))
+				smgrsw[which].smgr_immedsync(rels[i], forknum);
+		}
+	}
+}
+
+/*
  *	smgrdounlinkall() -- Immediately unlink all forks of all given relations
  *
  *		All forks of all given relations are removed from the store.  This