1 files changed, 43 insertions, 9 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 301a24bd66c..a25d07a6f76 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -359,11 +359,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
  * During replay, we would delete the file and then recreate it, which is fine
  * if the contents of the file were repopulated by subsequent WAL entries.
  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
- * file after populating it (as for instance CLUSTER and CREATE INDEX do),
- * the contents of the file would be lost forever.  By leaving the empty file
- * until after the next checkpoint, we prevent reassignment of the relfilenode
- * number until it's safe, because relfilenode assignment skips over any
- * existing file.
+ * file after populating it (as we do at wal_level=minimal), the contents of
+ * the file would be lost forever.  By leaving the empty file until after the
+ * next checkpoint, we prevent reassignment of the relfilenode number until
+ * it's safe, because relfilenode assignment skips over any existing file.
  *
  * We do not need to go through this dance for temp relations, though, because
  * we never make WAL entries for temp rels, and so a temp rel poses no threat
@@ -1019,12 +1018,19 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *	mdimmedsync() -- Immediately sync a relation to stable storage.
  *
  * Note that only writes already issued are synced; this routine knows
- * nothing of dirty buffers that may exist inside the buffer manager.
+ * nothing of dirty buffers that may exist inside the buffer manager.  We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL.  Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive.  If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
  */
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
 	MdfdVec    *v;
+	BlockNumber segno = 0;
+	bool		active = true;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1034,14 +1040,42 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 
 	v = mdopen(reln, forknum, EXTENSION_FAIL);
 
+	/*
+	 * Temporarily open inactive segments, then close them after sync.  There
+	 * may be some inactive segments left open after an fsync() error, but that
+	 * is harmless.  We don't bother to clean them up, since doing so would
+	 * risk further trouble.  The next mdclose() will soon close them.
+	 */
 	while (v != NULL)
 	{
-		if (FileSync(v->mdfd_vfd) < 0)
+		File		vfd = v->mdfd_vfd;
+
+		if (active)
+			v = v->mdfd_chain;
+		else
+		{
+			Assert(v->mdfd_chain == NULL);
+			pfree(v);
+			v = NULL;
+		}
+
+		if (FileSync(vfd) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
-							FilePathName(v->mdfd_vfd))));
-		v = v->mdfd_chain;
+							FilePathName(vfd))));
+
+		/* Close inactive segments immediately */
+		if (!active)
+			FileClose(vfd);
+
+		segno++;
+
+		if (v == NULL)
+		{
+			v = _mdfd_openseg(reln, forknum, segno, 0);
+			active = false;
+		}
 	}
 }