diff options
Diffstat (limited to 'src/backend/storage')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 125 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lock.c | 12 | ||||
-rw-r--r-- | src/backend/storage/smgr/md.c | 52 | ||||
-rw-r--r-- | src/backend/storage/smgr/smgr.c | 35 |
4 files changed, 209 insertions, 15 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0e4a8ac800a..8e051fc9b46 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -65,7 +65,7 @@ #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 -#define DROP_RELS_BSEARCH_THRESHOLD 20 +#define RELS_BSEARCH_THRESHOLD 20 typedef struct PrivateRefCountEntry { @@ -104,6 +104,19 @@ typedef struct CkptTsStatus int index; } CkptTsStatus; +/* + * Type for array used to sort SMgrRelations + * + * FlushRelationsAllBuffers shares the same comparator function with + * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be + * compatible. + */ +typedef struct SMgrSortArray +{ + RelFileNode rnode; /* This must be the first member */ + SMgrRelation srel; +} SMgrSortArray; + /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; @@ -2977,7 +2990,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes) * an exactly determined value, as it depends on many factors (CPU and RAM * speeds, amount of shared buffers etc.). */ - use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD; + use_bsearch = n > RELS_BSEARCH_THRESHOLD; /* sort the list of rnodes if necessary */ if (use_bsearch) @@ -3228,6 +3241,104 @@ FlushRelationBuffers(Relation rel) } /* --------------------------------------------------------------------- + * FlushRelationsAllBuffers + * + * This function flushes out of the buffer pool all the pages of all + * forks of the specified smgr relations. It's equivalent to calling + * FlushRelationBuffers once per fork per relation. The relations are + * assumed not to use local buffers. + * -------------------------------------------------------------------- + */ +void +FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) +{ + int i; + SMgrSortArray *srels; + bool use_bsearch; + + if (nrels == 0) + return; + + /* fill-in array for qsort */ + srels = palloc(sizeof(SMgrSortArray) * nrels); + + for (i = 0; i < nrels; i++) + { + Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode)); + + srels[i].rnode = smgrs[i]->smgr_rnode.node; + srels[i].srel = smgrs[i]; + } + + /* + * Save the bsearch overhead for low number of relations to sync. See + * DropRelFileNodesAllBuffers for details. + */ + use_bsearch = nrels > RELS_BSEARCH_THRESHOLD; + + /* sort the list of SMgrRelations if necessary */ + if (use_bsearch) + pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator); + + /* Make sure we can handle the pin inside the loop */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + SMgrSortArray *srelent = NULL; + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * As in DropRelFileNodeBuffers, an unlocked precheck should be safe + * and saves some cycles. + */ + + if (!use_bsearch) + { + int j; + + for (j = 0; j < nrels; j++) + { + if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode)) + { + srelent = &srels[j]; + break; + } + } + + } + else + { + srelent = bsearch((const void *) &(bufHdr->tag.rnode), + srels, nrels, sizeof(SMgrSortArray), + rnode_comparator); + } + + /* buffer doesn't belong to any of the given relfilenodes; skip it */ + if (srelent == NULL) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) && + (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + FlushBuffer(bufHdr, srelent->srel); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr, true); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + + pfree(srels); +} + +/* --------------------------------------------------------------------- * FlushDatabaseBuffers * * This function writes all dirty pages of a database out to disk @@ -3428,13 +3539,15 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) { /* - * If we're in recovery we cannot dirty a page because of a hint. - * We can set the hint, just not dirty the page as a result so the - * hint is lost when we evict the page or shutdown. + * If we must not write WAL, due to a relfilenode-specific + * condition or being in recovery, don't dirty the page. We can + * set the hint, just not dirty the page as a result so the hint + * is lost when we evict the page or shutdown. * * See src/backend/storage/page/README for longer discussion. */ - if (RecoveryInProgress()) + if (RecoveryInProgress() || + RelFileNodeSkippingWAL(bufHdr->tag.rnode)) return; /* diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 98a2f366b4d..66ed9f44d41 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -563,6 +563,18 @@ DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) return false; } +#ifdef USE_ASSERT_CHECKING +/* + * GetLockMethodLocalHash -- return the hash of local locks, for modules that + * evaluate assertions based on all locks held. + */ +HTAB * +GetLockMethodLocalHash(void) +{ + return LockMethodLocalHash; +} +#endif + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 301a24bd66c..a25d07a6f76 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -359,11 +359,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * During replay, we would delete the file and then recreate it, which is fine * if the contents of the file were repopulated by subsequent WAL entries. * But if we didn't WAL-log insertions, but instead relied on fsyncing the - * file after populating it (as for instance CLUSTER and CREATE INDEX do), - * the contents of the file would be lost forever. By leaving the empty file - * until after the next checkpoint, we prevent reassignment of the relfilenode - * number until it's safe, because relfilenode assignment skips over any - * existing file. + * file after populating it (as we do at wal_level=minimal), the contents of + * the file would be lost forever. By leaving the empty file until after the + * next checkpoint, we prevent reassignment of the relfilenode number until + * it's safe, because relfilenode assignment skips over any existing file. * * We do not need to go through this dance for temp relations, though, because * we never make WAL entries for temp rels, and so a temp rel poses no threat @@ -1019,12 +1018,19 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * mdimmedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows - * nothing of dirty buffers that may exist inside the buffer manager. + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. */ void mdimmedsync(SMgrRelation reln, ForkNumber forknum) { MdfdVec *v; + BlockNumber segno = 0; + bool active = true; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1034,14 +1040,42 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) v = mdopen(reln, forknum, EXTENSION_FAIL); + /* + * Temporarily open inactive segments, then close them after sync. There + * may be some inactive segments left opened after fsync() error, but that + * is harmless. We don't bother to clean them up and take a risk of + * further trouble. The next mdclose() will soon close them. + */ while (v != NULL) { - if (FileSync(v->mdfd_vfd) < 0) + File vfd = v->mdfd_vfd; + + if (active) + v = v->mdfd_chain; + else + { + Assert(v->mdfd_chain == NULL); + pfree(v); + v = NULL; + } + + if (FileSync(vfd) < 0) ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", - FilePathName(v->mdfd_vfd)))); - v = v->mdfd_chain; + FilePathName(vfd)))); + + /* Close inactive segments immediately */ + if (!active) + FileClose(vfd); + + segno++; + + if (v == NULL) + { + v = _mdfd_openseg(reln, forknum, segno, 0); + active = false; + } } } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 97119f2063f..e5383722a65 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -409,6 +409,41 @@ smgrdounlink(SMgrRelation reln, bool isRedo) } /* + * smgrdosyncall() -- Immediately sync all forks of all given relations + * + * All forks of all given relations are synced out to the store. + * + * This is equivalent to FlushRelationBuffers() for each smgr relation, + * then calling smgrimmedsync() for all forks of each relation, but it's + * significantly quicker so should be preferred when possible. + */ +void +smgrdosyncall(SMgrRelation *rels, int nrels) +{ + int i = 0; + ForkNumber forknum; + + if (nrels == 0) + return; + + FlushRelationsAllBuffers(rels, nrels); + + /* + * Sync the physical file(s). + */ + for (i = 0; i < nrels; i++) + { + int which = rels[i]->smgr_which; + + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + if (smgrsw[which].smgr_exists(rels[i], forknum)) + smgrsw[which].smgr_immedsync(rels[i], forknum); + } + } +} + +/* * smgrdounlinkall() -- Immediately unlink all forks of all given relations * * All forks of all given relations are removed from the store. This |