summaryrefslogtreecommitdiff
path: root/src/backend/storage/buffer/bufmgr.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r--src/backend/storage/buffer/bufmgr.c2205
1 files changed, 0 insertions, 2205 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
deleted file mode 100644
index 1a86afb286e..00000000000
--- a/src/backend/storage/buffer/bufmgr.c
+++ /dev/null
@@ -1,2205 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * bufmgr.c
- * buffer manager interface routines
- *
- * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.126 2002/06/20 20:29:34 momjian Exp $
- *
- *-------------------------------------------------------------------------
- */
-/*
- *
- * BufferAlloc() -- lookup a buffer in the buffer table. If
- * it isn't there add it, but do not read data into memory.
- * This is used when we are about to reinitialize the
- * buffer so don't care what the current disk contents are.
- * BufferAlloc() also pins the new buffer in memory.
- *
- * ReadBuffer() -- like BufferAlloc() but reads the data
- * on a buffer cache miss.
- *
- * ReleaseBuffer() -- unpin the buffer
- *
- * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
- * but don't unpin. The disk IO is delayed until buffer
- * replacement.
- *
- * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
- *
- * BufferSync() -- flush all dirty buffers in the buffer pool.
- *
- * InitBufferPool() -- Init the buffer module.
- *
- * See other files:
- * freelist.c -- chooses victim for buffer replacement
- * buf_table.c -- manages the buffer lookup table
- */
-#include "postgres.h"
-
-#include <sys/types.h>
-#include <sys/file.h>
-#include <math.h>
-#include <signal.h>
-
-#include "lib/stringinfo.h"
-#include "miscadmin.h"
-#include "storage/buf_internals.h"
-#include "storage/bufmgr.h"
-#include "storage/proc.h"
-#include "storage/smgr.h"
-#include "utils/relcache.h"
-
-#include "pgstat.h"
-
-#define BufferGetLSN(bufHdr) \
- (*((XLogRecPtr*)MAKE_PTR((bufHdr)->data))) /* XLogRecPtr read from the start of
- * the buffer's data area; BufferSync and BufferReplace hand it to XLogFlush
- * before writing the buffer out */
-
-
-extern long int ReadBufferCount; /* bumped per shared-buffer request in ReadBufferInternal */
-extern long int ReadLocalBufferCount; /* bumped per local-buffer request in ReadBufferInternal */
-extern long int BufferHitCount; /* shared requests that BufferAlloc reported as found */
-extern long int LocalBufferHitCount; /* local requests that LocalBufferAlloc reported as found */
-extern long int BufferFlushCount; /* bumped by BufferSync and BufferReplace after a write */
-extern long int LocalBufferFlushCount; /* reported by ShowBufferUsage; not incremented in this part of the file */
-
-static void WaitIO(BufferDesc *buf); /* block until BM_IO_IN_PROGRESS is cleared */
-static void StartBufferIO(BufferDesc *buf, bool forInput);
-static void TerminateBufferIO(BufferDesc *buf);
-static void ContinueBufferIO(BufferDesc *buf, bool forInput);
-extern void AbortBufferIO(void);
-
-/*
- * Macro : BUFFER_IS_BROKEN
- *
- * True when the buffer has BM_IO_ERROR set but is not BM_DIRTY.
- * Note that a write error does not mean the buffer is broken: a buffer
- * whose write failed is still dirty, so it is excluded here.
- *
- * The argument is parenthesized so that any pointer-valued expression
- * can be passed safely.
- */
-#define BUFFER_IS_BROKEN(buf) (((buf)->flags & BM_IO_ERROR) && !((buf)->flags & BM_DIRTY))
-
-static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
- bool bufferLockHeld);
-static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
- bool *foundPtr);
-static int ReleaseBufferWithBufferLock(Buffer buffer);
-static int BufferReplace(BufferDesc *bufHdr);
-void PrintBufferDescs(void);
-
-/* parameter name matches the definition: true means "also drop the pin" */
-static void write_buffer(Buffer buffer, bool release);
-
-/*
- * ReadBuffer -- returns a buffer containing the requested
- * block of the requested relation. If the blknum
- * requested is P_NEW, extend the relation file and
- * allocate a new block. (Caller is responsible for
- * ensuring that only one backend tries to extend a
- * relation at the same time!)
- *
- * Returns: the buffer number for the buffer containing
- * the block read, or InvalidBuffer on an error. If successful,
- * the returned buffer has been pinned.
- *
- * Assume when this function is called, that reln has been
- * opened already.
- *
- * Note: a side effect of a P_NEW call is to update reln->rd_nblocks.
- */
-
-#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG
- * defined */
-
-/*
- * ReadBuffer
- *
- * Public entry point: forwards to ReadBufferInternal with
- * bufferLockHeld = false, i.e. the caller does not hold the bufmgr lock.
- */
-Buffer
-ReadBuffer(Relation reln, BlockNumber blockNum)
-{
- return ReadBufferInternal(reln, blockNum, false);
-}
-
-/*
- * ReadBufferInternal -- internal version of ReadBuffer with more options
- *
- * reln: relation to read from (assumed to be open already).
- * blockNum: block wanted, or P_NEW to extend the relation by one
- * zero-filled block.
- * bufferLockHeld: if true, caller already acquired the bufmgr lock.
- * (This is assumed never to be true if dealing with a local buffer!)
- *
- * Returns the buffer holding the block, or InvalidBuffer when no buffer
- * descriptor could be obtained or when the smgr read/extend reported
- * SM_FAIL.
- */
-static Buffer
-ReadBufferInternal(Relation reln, BlockNumber blockNum,
- bool bufferLockHeld)
-{
- BufferDesc *bufHdr;
- int status;
- bool found;
- bool isExtend;
- bool isLocalBuf;
-
- isExtend = (blockNum == P_NEW);
- isLocalBuf = reln->rd_myxactonly;
-
- if (isLocalBuf)
- {
- ReadLocalBufferCount++;
- pgstat_count_buffer_read(&reln->pgstat_info, reln);
- /* Substitute proper block number if caller asked for P_NEW */
- if (isExtend)
- {
- blockNum = reln->rd_nblocks;
- reln->rd_nblocks++;
- }
- bufHdr = LocalBufferAlloc(reln, blockNum, &found);
- if (found)
- {
- LocalBufferHitCount++;
- pgstat_count_buffer_hit(&reln->pgstat_info, reln);
- }
- }
- else
- {
- ReadBufferCount++;
- pgstat_count_buffer_read(&reln->pgstat_info, reln);
- /* Substitute proper block number if caller asked for P_NEW */
- if (isExtend)
- {
- /* must be sure we have accurate file length! */
- blockNum = reln->rd_nblocks = smgrnblocks(DEFAULT_SMGR, reln);
- reln->rd_nblocks++;
- }
-
- /*
- * lookup the buffer. IO_IN_PROGRESS is set if the requested
- * block is not currently in memory.
- */
- if (!bufferLockHeld)
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- bufHdr = BufferAlloc(reln, blockNum, &found);
- if (found)
- {
- BufferHitCount++;
- pgstat_count_buffer_hit(&reln->pgstat_info, reln);
- }
- }
-
- /*
- * At this point we do NOT hold the bufmgr lock (BufferAlloc releases
- * it before returning).
- */
-
- if (!bufHdr)
- return InvalidBuffer;
-
- /* if it's already in the buffer pool, we're done */
- if (found)
- {
- /* That is, we're done if we expected to be able to find it ... */
- if (!isExtend)
- return BufferDescriptorGetBuffer(bufHdr);
-
- /*
- * If we found a buffer when we were expecting to extend the
- * relation, the implication is that a buffer was already created
- * for the next page position, but then smgrextend failed to write
- * the page. We'd better try the smgrextend again. But since
- * BufferAlloc won't have done StartBufferIO, we must do that
- * first.
- */
- if (!isLocalBuf)
- {
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- StartBufferIO(bufHdr, false);
- LWLockRelease(BufMgrLock);
- }
- }
-
- /*
- * if we have gotten to this point, the reln pointer must be ok and
- * the relation file must be open.
- */
- if (isExtend)
- {
- /* new buffers are zero-filled */
- MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
- status = smgrextend(DEFAULT_SMGR, reln, blockNum,
- (char *) MAKE_PTR(bufHdr->data));
- }
- else
- {
- status = smgrread(DEFAULT_SMGR, reln, blockNum,
- (char *) MAKE_PTR(bufHdr->data));
- }
-
- if (isLocalBuf)
- {
- /* No shared buffer state to update... */
- if (status == SM_FAIL)
- {
- bufHdr->flags |= BM_IO_ERROR;
- return InvalidBuffer;
- }
- return BufferDescriptorGetBuffer(bufHdr);
- }
-
- /* lock buffer manager again to update IO IN PROGRESS */
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- if (status == SM_FAIL)
- {
- /* IO Failed. cleanup the data structures and go home */
-
- if (!BufTableDelete(bufHdr))
- {
- LWLockRelease(BufMgrLock);
- elog(FATAL, "BufRead: buffer table broken after IO error");
- }
- /* remember that BufferAlloc() pinned the buffer */
- UnpinBuffer(bufHdr);
-
- /*
- * Have to reset the flag so that anyone waiting for the buffer
- * can tell that the contents are invalid.
- */
- bufHdr->flags |= BM_IO_ERROR;
- bufHdr->flags &= ~BM_IO_IN_PROGRESS;
- }
- else
- {
- /* IO Succeeded. clear the flags, finish buffer update */
-
- bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS);
- }
-
- /* If anyone was waiting for IO to complete, wake them up now */
- TerminateBufferIO(bufHdr);
-
- LWLockRelease(BufMgrLock);
-
- if (status == SM_FAIL)
- return InvalidBuffer;
-
- return BufferDescriptorGetBuffer(bufHdr);
-}
-
-/*
- * BufferAlloc -- Get a buffer from the buffer pool but don't
- * read it. If successful, the returned buffer is pinned.
- *
- * Returns: descriptor for buffer
- *
- * *foundPtr is set to TRUE when the block was already in the pool and is
- * not "broken" (see BUFFER_IS_BROKEN); otherwise it is set to FALSE and
- * buffer I/O has been started (StartBufferIO/ContinueBufferIO) on the
- * returned buffer, so the caller is expected to fill it.
- *
- * BufMgrLock must be held at entry. When this routine returns,
- * the BufMgrLock is guaranteed NOT to be held.
- */
-static BufferDesc *
-BufferAlloc(Relation reln,
- BlockNumber blockNum,
- bool *foundPtr)
-{
- BufferDesc *buf,
- *buf2;
- BufferTag newTag; /* identity of requested block */
- bool inProgress; /* buffer undergoing IO */
-
- /* create a new tag so we can lookup the buffer */
- /* assume that the relation is already open */
- INIT_BUFFERTAG(&newTag, reln, blockNum);
-
- /* see if the block is in the buffer pool already */
- buf = BufTableLookup(&newTag);
- if (buf != NULL)
- {
- /*
- * Found it. Now, (a) pin the buffer so no one steals it from the
- * buffer pool, (b) check IO_IN_PROGRESS, someone may be faulting
- * the buffer into the buffer pool.
- */
-
- PinBuffer(buf);
- inProgress = (buf->flags & BM_IO_IN_PROGRESS);
-
- *foundPtr = TRUE;
- if (inProgress) /* confirm end of IO */
- {
- WaitIO(buf);
- inProgress = (buf->flags & BM_IO_IN_PROGRESS);
- }
- if (BUFFER_IS_BROKEN(buf))
- {
- /*
- * I couldn't understand the following old comment. If there's
- * no IO for the buffer and the buffer is BROKEN, it should be
- * read again. So start a new buffer IO here.
- *
- * weird race condition:
- *
- * We were waiting for someone else to read the buffer. While we
- * were waiting, the reader boof'd in some way, so the
- * contents of the buffer are still invalid. By saying that
- * we didn't find it, we can make the caller reinitialize the
- * buffer. If two processes are waiting for this block, both
- * will read the block. The second one to finish may
- * overwrite any updates made by the first. (Assume higher
- * level synchronization prevents this from happening).
- *
- * This is never going to happen, don't worry about it.
- */
- *foundPtr = FALSE;
- }
-#ifdef BMTRACE
- _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND);
-#endif /* BMTRACE */
-
- if (!(*foundPtr))
- StartBufferIO(buf, true);
- LWLockRelease(BufMgrLock);
-
- return buf;
- }
-
- *foundPtr = FALSE;
-
- /*
- * Didn't find it in the buffer pool. We'll have to initialize a new
- * buffer. First, grab one from the free list. If it's dirty, flush
- * it to disk. Remember to unlock BufMgrLock while doing the IOs.
- */
- inProgress = FALSE;
- for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;)
- {
- buf = GetFreeBuffer();
-
- /* GetFreeBuffer will abort if it can't find a free buffer */
- Assert(buf);
-
- /*
- * There should be exactly one pin on the buffer after it is
- * allocated -- ours. If it had a pin it wouldn't have been on
- * the free list. No one else could have pinned it between
- * GetFreeBuffer and here because we have the BufMgrLock.
- */
- Assert(buf->refcount == 0);
- buf->refcount = 1;
- PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
-
- if (buf->flags & BM_DIRTY || buf->cntxDirty)
- {
- bool smok;
-
- /*
- * skip buffers that already had a write error; setting buf to
- * NULL makes the loop fetch another free buffer
- */
- if ((buf->flags & BM_IO_ERROR) != 0)
- {
- UnpinBuffer(buf);
- buf = (BufferDesc *) NULL;
- continue;
- }
-
- /*
- * Set BM_IO_IN_PROGRESS to keep anyone from doing anything
- * with the contents of the buffer while we write it out. We
- * don't really care if they try to read it, but if they can
- * complete a BufferAlloc on it they can then scribble into
- * it, and we'd really like to avoid that while we are
- * flushing the buffer. Setting this flag should block them
- * in WaitIO until we're done.
- */
- inProgress = TRUE;
-
- /*
- * All code paths that acquire this lock pin the buffer first;
- * since no one had it pinned (it just came off the free
- * list), no one else can have this lock.
- */
- StartBufferIO(buf, false);
-
- /*
- * Write the buffer out, being careful to release BufMgrLock
- * before starting the I/O. (BufferReplace releases the lock
- * and re-acquires it before returning.)
- */
- smok = BufferReplace(buf);
-
- if (smok == FALSE)
- {
- elog(WARNING, "BufferAlloc: cannot write block %u for %u/%u",
- buf->tag.blockNum,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode);
- inProgress = FALSE;
- buf->flags |= BM_IO_ERROR;
- buf->flags &= ~BM_IO_IN_PROGRESS;
- TerminateBufferIO(buf);
- UnpinBuffer(buf);
- buf = (BufferDesc *) NULL;
- }
- else
- {
- /*
- * BM_JUST_DIRTIED was cleared by BufferReplace and should
- * not have been set again by anyone. - vadim 01/17/97
- */
- if (buf->flags & BM_JUST_DIRTIED)
- {
- elog(PANIC, "BufferAlloc: content of block %u (%u/%u) changed while flushing",
- buf->tag.blockNum,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode);
- }
- else
- buf->flags &= ~BM_DIRTY;
- buf->cntxDirty = false;
- }
-
- /*
- * Somebody could have pinned the buffer while we were doing
- * the I/O and had given up the BufMgrLock (though they would
- * be waiting for us to clear the BM_IO_IN_PROGRESS flag).
- * That's why this is a loop -- if so, we need to clear the
- * I/O flags, remove our pin and start all over again.
- *
- * People may be making buffers free at any time, so there's no
- * reason to think that we have an immediate disaster on our
- * hands.
- */
- if (buf && buf->refcount > 1)
- {
- inProgress = FALSE;
- buf->flags &= ~BM_IO_IN_PROGRESS;
- TerminateBufferIO(buf);
- UnpinBuffer(buf);
- buf = (BufferDesc *) NULL;
- }
-
- /*
- * Somebody could have allocated another buffer for the same
- * block we are about to read in. (While we flush out the
- * dirty buffer, we don't hold the lock and someone could have
- * allocated another buffer for the same block. The problem is
- * we haven't gotten around to inserting the new tag into the
- * buffer table.) So we need to check here. -ay 3/95
- */
- buf2 = BufTableLookup(&newTag);
- if (buf2 != NULL)
- {
- /*
- * Found it. Someone has already done what we're about to
- * do. We'll just handle this as if it were found in the
- * buffer pool in the first place.
- */
- if (buf != NULL)
- {
- buf->flags &= ~BM_IO_IN_PROGRESS;
- TerminateBufferIO(buf);
- /* give up old buffer since we don't need it any more */
- UnpinBuffer(buf);
- }
-
- PinBuffer(buf2);
- inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
-
- *foundPtr = TRUE;
- if (inProgress)
- {
- WaitIO(buf2);
- inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
- }
- if (BUFFER_IS_BROKEN(buf2))
- *foundPtr = FALSE;
-
- if (!(*foundPtr))
- StartBufferIO(buf2, true);
- LWLockRelease(BufMgrLock);
-
- return buf2;
- }
- }
- }
-
- /*
- * At this point we should have the sole pin on a non-dirty buffer and
- * we may or may not already have the BM_IO_IN_PROGRESS flag set.
- */
-
- /*
- * Change the name of the buffer in the lookup table:
- *
- * Need to update the lookup table before the read starts. If someone
- * comes along looking for the buffer while we are reading it in, we
- * don't want them to allocate a new buffer. For the same reason, we
- * didn't want to erase the buf table entry for the buffer we were
- * writing back until now, either.
- */
-
- if (!BufTableDelete(buf))
- {
- LWLockRelease(BufMgrLock);
- elog(FATAL, "buffer wasn't in the buffer table");
- }
-
- INIT_BUFFERTAG(&(buf->tag), reln, blockNum);
-
- if (!BufTableInsert(buf))
- {
- LWLockRelease(BufMgrLock);
- elog(FATAL, "Buffer in lookup table twice");
- }
-
- /*
- * Buffer contents are currently invalid. Have to mark IO IN PROGRESS
- * so no one fiddles with them until the read completes. If this
- * routine has been called simply to allocate a buffer, no io will be
- * attempted, so the flag isn't set.
- */
- if (!inProgress)
- StartBufferIO(buf, true);
- else
- ContinueBufferIO(buf, true);
-
-#ifdef BMTRACE
- _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND);
-#endif /* BMTRACE */
-
- LWLockRelease(BufMgrLock);
-
- return buf;
-}
-
-/*
- * write_buffer -- shared implementation behind WriteBuffer and
- * WriteNoReleaseBuffer
- *
- * Flags the buffer BM_DIRTY | BM_JUST_DIRTIED and, when 'release' is
- * true, also drops the pin.  Local buffers are handed straight to
- * WriteLocalBuffer.  No disk write is issued here.
- */
-static void
-write_buffer(Buffer buffer, bool release)
-{
-    if (!BufferIsLocal(buffer))
-    {
-        BufferDesc *desc;
-
-        if (BAD_BUFFER_ID(buffer))
-            elog(ERROR, "write_buffer: bad buffer %d", buffer);
-
-        desc = BufferDescriptors + (buffer - 1);
-
-        /* flag update and unpin both happen under the bufmgr lock */
-        LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-        Assert(desc->refcount > 0);
-        desc->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-        if (release)
-            UnpinBuffer(desc);
-        LWLockRelease(BufMgrLock);
-    }
-    else
-        WriteLocalBuffer(buffer, release);
-}
-
-/*
- * WriteBuffer
- *
- * Marks buffer contents as dirty (actual write happens later).
- * Implemented as write_buffer(buffer, true).
- *
- * Assume that buffer is pinned.
- *
- * Side Effects:
- * Pin count is decremented.
- */
-
-#undef WriteBuffer
-
-void
-WriteBuffer(Buffer buffer)
-{
- write_buffer(buffer, true);
-}
-
-/*
- * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
- * when the operation is complete.
- *
- * Implemented as write_buffer(buffer, false): the buffer is only marked
- * dirty and the caller keeps its pin.
- */
-void
-WriteNoReleaseBuffer(Buffer buffer)
-{
- write_buffer(buffer, false);
-}
-
-
-#undef ReleaseAndReadBuffer
-/*
- * ReleaseAndReadBuffer -- ReleaseBuffer() followed by ReadBuffer(), done
- * in one step so that a lock release/acquire cycle can be skipped.
- *
- * Fast path: when 'buffer' is valid and already holds block 'blockNum' of
- * 'relation', it is returned as-is and the lock is never taken.  Looking
- * at the block number without the lock is fine because the caller has
- * the buffer pinned.
- *
- * 'buffer' may be InvalidBuffer, meaning there is nothing to release; the
- * call then behaves exactly like ReadBuffer.
- *
- * Passing blockNum == P_NEW works but is discouraged: smgrnblocks() would
- * then run while the bufmgr lock is held, which costs concurrency.
- */
-Buffer
-ReleaseAndReadBuffer(Buffer buffer,
-                     Relation relation,
-                     BlockNumber blockNum)
-{
-    bool        lockHeld = false;   /* do we enter the read holding BufMgrLock? */
-
-    if (BufferIsValid(buffer))
-    {
-        BufferDesc *oldHdr;
-
-        if (BufferIsLocal(buffer))
-        {
-            Assert(LocalRefCount[-buffer - 1] > 0);
-            oldHdr = &LocalBufferDescriptors[-buffer - 1];
-            if (oldHdr->tag.blockNum == blockNum &&
-                RelFileNodeEquals(oldHdr->tag.rnode, relation->rd_node))
-                return buffer;
-            LocalRefCount[-buffer - 1]--;
-        }
-        else
-        {
-            Assert(PrivateRefCount[buffer - 1] > 0);
-            oldHdr = &BufferDescriptors[buffer - 1];
-            if (oldHdr->tag.blockNum == blockNum &&
-                RelFileNodeEquals(oldHdr->tag.rnode, relation->rd_node))
-                return buffer;
-            if (PrivateRefCount[buffer - 1] > 1)
-                PrivateRefCount[buffer - 1]--;
-            else
-            {
-                /* last private reference: unpin under the lock and keep it */
-                LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-                UnpinBuffer(oldHdr);
-                lockHeld = true;
-            }
-        }
-    }
-
-    return ReadBufferInternal(relation, blockNum, lockHeld);
-}
-
-/*
- * BufferSync -- Write all dirty buffers in the pool.
- *
- * This is called at checkpoint time and writes out all dirty buffers.
- *
- * Each shared buffer descriptor is visited in turn. Buffers that are not
- * BM_VALID, or that are neither BM_DIRTY nor cntxDirty, are skipped. For
- * the rest the buffer is pinned, marked as doing output I/O, share-locked,
- * its LSN is flushed with XLogFlush, and the page is written through smgr.
- * A write failure raises PANIC.
- */
-void
-BufferSync(void)
-{
- int i;
- BufferDesc *bufHdr;
- Buffer buffer;
- int status;
- RelFileNode rnode;
- XLogRecPtr recptr;
- Relation reln = NULL;
-
- for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
- {
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- if (!(bufHdr->flags & BM_VALID))
- {
- LWLockRelease(BufMgrLock);
- continue;
- }
-
- /*
- * We can check bufHdr->cntxDirty here *without* holding any lock
- * on buffer context as long as we set this flag in access methods
- * *before* logging changes with XLogInsert(): if someone sets
- * cntxDirty just after our check we don't worry, because our
- * checkpoint.redo points before the log record for the upcoming
- * changes and so we are not required to write such a dirty buffer.
- */
- if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))
- {
- LWLockRelease(BufMgrLock);
- continue;
- }
-
- /*
- * IO synchronization. Note that we do it with unpinned buffer to
- * avoid conflicts with FlushRelationBuffers.
- */
- if (bufHdr->flags & BM_IO_IN_PROGRESS)
- {
- WaitIO(bufHdr);
- /* state may have changed while we waited: re-test it */
- if (!(bufHdr->flags & BM_VALID) ||
- (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)))
- {
- LWLockRelease(BufMgrLock);
- continue;
- }
- }
-
- /*
- * Here: no one doing IO for this buffer and it's dirty. Pin
- * buffer now and set IO state for it *before* acquiring shlock to
- * avoid conflicts with FlushRelationBuffers.
- */
- PinBuffer(bufHdr);
- StartBufferIO(bufHdr, false); /* output IO start */
-
- buffer = BufferDescriptorGetBuffer(bufHdr);
- rnode = bufHdr->tag.rnode;
-
- LWLockRelease(BufMgrLock);
-
- /*
- * Try to find relation for buffer
- */
- reln = RelationNodeCacheGetRelation(rnode);
-
- /*
- * Protect buffer content against concurrent update
- */
- LockBuffer(buffer, BUFFER_LOCK_SHARE);
-
- /*
- * Force XLOG flush for buffer's LSN
- */
- recptr = BufferGetLSN(bufHdr);
- XLogFlush(recptr);
-
- /*
- * Now it's safe to write buffer to disk. Note that no one else
- * should be able to write it while we were busy with locking
- * and log flushing, because we set the IO flag.
- */
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty);
- bufHdr->flags &= ~BM_JUST_DIRTIED;
- LWLockRelease(BufMgrLock);
-
- if (reln == (Relation) NULL)
- {
- status = smgrblindwrt(DEFAULT_SMGR,
- bufHdr->tag.rnode,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data),
- true); /* must fsync */
- }
- else
- {
- status = smgrwrite(DEFAULT_SMGR, reln,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
- }
-
- if (status == SM_FAIL) /* disk failure ?! */
- elog(PANIC, "BufferSync: cannot write %u for %u/%u",
- bufHdr->tag.blockNum,
- bufHdr->tag.rnode.tblNode, bufHdr->tag.rnode.relNode);
-
- /*
- * Note that it's safe to change cntxDirty here because we
- * protect it from upper writers by share lock and from other
- * bufmgr routines by BM_IO_IN_PROGRESS
- */
- bufHdr->cntxDirty = false;
-
- /*
- * Release the per-buffer readlock, reacquire BufMgrLock.
- */
- LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
- BufferFlushCount++;
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
- TerminateBufferIO(bufHdr); /* Sync IO finished */
-
- /*
- * If this buffer was marked by someone as DIRTY while we were
- * flushing it out we must not clear DIRTY flag - vadim 01/17/97
- */
- if (!(bufHdr->flags & BM_JUST_DIRTIED))
- bufHdr->flags &= ~BM_DIRTY;
- UnpinBuffer(bufHdr);
- LWLockRelease(BufMgrLock);
-
- /* drop refcnt obtained by RelationNodeCacheGetRelation */
- if (reln != (Relation) NULL)
- RelationDecrementReferenceCount(reln);
- }
-
-}
-
-/*
- * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
- *
- * Should be entered with buffer manager lock held; releases it before
- * waiting and re-acquires it afterwards.
- *
- * The wait itself is done by acquiring and immediately releasing the
- * buffer's io_in_progress_lock in shared mode.
- */
-static void
-WaitIO(BufferDesc *buf)
-{
- /*
- * Changed to wait until there's no IO - Inoue 01/13/2000
- *
- * Note this is *necessary* because an error abort in the process doing
- * I/O could release the io_in_progress_lock prematurely. See
- * AbortBufferIO. Hence the flag is re-tested in a loop rather than
- * trusting a single lock acquisition.
- */
- while ((buf->flags & BM_IO_IN_PROGRESS) != 0)
- {
- LWLockRelease(BufMgrLock);
- LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
- LWLockRelease(buf->io_in_progress_lock);
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- }
-}
-
-
-long NDirectFileRead; /* count of direct file reads, i.e. I/O that
- * bypasses the bufmgr */
-long NDirectFileWrite; /* count of direct file writes, e.g., I/O in
- * psort and hashjoin */
-
-
-/*
- * ShowBufferUsage -- format the buffer usage counters as text.
- *
- * Produces three lines (shared, local and direct block statistics) in a
- * StringInfo buffer and returns its data pointer.  A hit rate is reported
- * as 0 when the matching read counter is zero.
- */
-char *
-ShowBufferUsage(void)
-{
-    StringInfoData report;
-    float       sharedRate;
-    float       localRate;
-
-    initStringInfo(&report);
-
-    /* guard against division by zero when nothing has been read yet */
-    sharedRate = (ReadBufferCount == 0) ? 0.0 :
-        (float) BufferHitCount * 100.0 / ReadBufferCount;
-    localRate = (ReadLocalBufferCount == 0) ? 0.0 :
-        (float) LocalBufferHitCount * 100.0 / ReadLocalBufferCount;
-
-    appendStringInfo(&report,
-                     "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
-                     ReadBufferCount - BufferHitCount,
-                     BufferFlushCount,
-                     sharedRate);
-    appendStringInfo(&report,
-                     "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
-                     ReadLocalBufferCount - LocalBufferHitCount,
-                     LocalBufferFlushCount,
-                     localRate);
-    appendStringInfo(&report,
-                     "!\tDirect blocks: %10ld read, %10ld written\n",
-                     NDirectFileRead,
-                     NDirectFileWrite);
-
-    return report.data;
-}
-
-/*
- * ResetBufferUsage -- zero every counter reported by ShowBufferUsage.
- */
-void
-ResetBufferUsage(void)
-{
-    /* shared buffer counters */
-    BufferHitCount = ReadBufferCount = BufferFlushCount = 0;
-
-    /* local buffer counters */
-    LocalBufferHitCount = ReadLocalBufferCount = LocalBufferFlushCount = 0;
-
-    /* direct file access counters */
-    NDirectFileRead = NDirectFileWrite = 0;
-}
-
-/* ----------------------------------------------
- * ResetBufferPool
- *
- * Meant to run when a transaction aborts: every buffer pin still
- * held by this backend is released.  It is also invoked at commit
- * when BufferPoolCheckLeak found a problem; isCommit is then TRUE
- * and only the pin counts are cleaned up (smgrabort is skipped).
- *
- * ----------------------------------------------
- */
-void
-ResetBufferPool(bool isCommit)
-{
-    int         slot;
-
-    for (slot = 0; slot < NBuffers; slot++)
-    {
-        if (PrivateRefCount[slot] == 0)
-            continue;           /* nothing pinned here */
-
-        /* force the count to one so the unpin releases the shared pin */
-        PrivateRefCount[slot] = 1;
-        LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-        UnpinBuffer(&BufferDescriptors[slot]);
-        LWLockRelease(BufMgrLock);
-        Assert(PrivateRefCount[slot] == 0);
-    }
-
-    ResetLocalBufferPool();
-
-    if (isCommit)
-        return;
-
-    smgrabort();
-}
-
-/*
- * BufferPoolCheckLeak
- *
- * Scan the shared buffers for any this backend still has a private
- * reference on.  Each one found is reported with a WARNING; the return
- * value is true if at least one was found.
- */
-bool
-BufferPoolCheckLeak(void)
-{
-    bool        leaked = false;
-    int         slot;
-
-    for (slot = 0; slot < NBuffers; slot++)
-    {
-        BufferDesc *desc;
-
-        if (PrivateRefCount[slot] == 0)
-            continue;
-
-        desc = &BufferDescriptors[slot];
-        elog(WARNING,
-             "Buffer Leak: [%03d] (freeNext=%d, freePrev=%d, "
-             "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
-             slot, desc->freeNext, desc->freePrev,
-             desc->tag.rnode.tblNode, desc->tag.rnode.relNode,
-             desc->tag.blockNum, desc->flags,
-             desc->refcount, PrivateRefCount[slot]);
-        leaked = true;
-    }
-
-    return leaked;
-}
-
-/* ------------------------------------------------
- * FlushBufferPool
- *
- * Flush all dirty blocks in buffer pool to disk
- * at the checkpoint time
- *
- * Runs BufferSync() and then smgrsync().
- * ------------------------------------------------
- */
-void
-FlushBufferPool(void)
-{
- BufferSync();
- smgrsync();
-}
-
-/*
- * BufmgrCommit
- *
- * At commit time we have to flush the local buffer pool only:
- * calls LocalBufferSync() and then smgrcommit().
- */
-void
-BufmgrCommit(void)
-{
- LocalBufferSync();
-
- /*
- * All files created in current transaction will be fsync-ed
- */
- smgrcommit();
-}
-
-/*
- * BufferGetBlockNumber
- * Look up the block number recorded in a buffer's tag.
- *
- * Note:
- * The buffer is expected to be valid and pinned; otherwise the
- * answer could be stale the moment it is returned.
- */
-BlockNumber
-BufferGetBlockNumber(Buffer buffer)
-{
-    Assert(BufferIsPinned(buffer));
-
-    /* shared buffers are indexed by buffer - 1 */
-    if (!BufferIsLocal(buffer))
-        return BufferDescriptors[buffer - 1].tag.blockNum;
-
-    /* local buffers are indexed by -buffer - 1 */
-    return LocalBufferDescriptors[-buffer - 1].tag.blockNum;
-}
-
-/*
- * BufferReplace
- *
- * Write the page held by 'bufHdr' out through smgr.
- *
- * Entered with BufMgrLock held and the buffer pinned.  The lock is
- * dropped for the duration of the XLOG flush and the write, and is
- * held again on return.  Returns TRUE on success, FALSE if smgr
- * reported SM_FAIL.
- */
-static int
-BufferReplace(BufferDesc *bufHdr)
-{
-    Relation    rel;
-    XLogRecPtr  lsn;
-    int         result;
-
-    /*
-     * Clear BM_JUST_DIRTIED so a change made during the flush can be
-     * detected afterwards. - vadim 01/17/97
-     */
-    bufHdr->flags &= ~BM_JUST_DIRTIED;
-
-    LWLockRelease(BufMgrLock);
-
-    /*
-     * No need to lock buffer context - no one should be able to end
-     * ReadBuffer
-     */
-    lsn = BufferGetLSN(bufHdr);
-    XLogFlush(lsn);
-
-    rel = RelationNodeCacheGetRelation(bufHdr->tag.rnode);
-
-    if (rel == (Relation) NULL)
-    {
-        /* relation not in the relcache: blind write, no fsync */
-        result = smgrblindwrt(DEFAULT_SMGR, bufHdr->tag.rnode,
-                              bufHdr->tag.blockNum,
-                              (char *) MAKE_PTR(bufHdr->data),
-                              false);
-    }
-    else
-    {
-        result = smgrwrite(DEFAULT_SMGR, rel, bufHdr->tag.blockNum,
-                           (char *) MAKE_PTR(bufHdr->data));
-
-        /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
-        RelationDecrementReferenceCount(rel);
-    }
-
-    LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
-    if (result != SM_FAIL)
-    {
-        BufferFlushCount++;
-        return TRUE;
-    }
-
-    return FALSE;
-}
-
-/*
- * RelationGetNumberOfBlocks
- * Report how many pages the relation currently has.
- * Side effect: relation->rd_nblocks is refreshed.
- *
- * Note:
- * XXX may fail for huge relations.
- * XXX should be elsewhere.
- * XXX maybe should be hidden
- */
-BlockNumber
-RelationGetNumberOfBlocks(Relation relation)
-{
-    /*
-     * Views are never passed to smgr; their block count is forced to 0.
-     * For a myxactonly relation rd_nblocks is trusted as it stands
-     * (XXX how safe is that really?).  Anything else is asked of smgr.
-     */
-    if (relation->rd_rel->relkind != RELKIND_VIEW)
-    {
-        if (!relation->rd_myxactonly)
-            relation->rd_nblocks = smgrnblocks(DEFAULT_SMGR, relation);
-    }
-    else
-        relation->rd_nblocks = 0;
-
-    return relation->rd_nblocks;
-}
-
-/* ---------------------------------------------------------------------
- * DropRelationBuffers
- *
- * This function removes all the buffered pages for a relation
- * from the buffer pool. Dirty pages are simply dropped, without
- * bothering to write them out first. This is NOT rollback-able,
- * and so should be used only with extreme caution!
- *
- * We assume that the caller holds an exclusive lock on the relation,
- * which should assure that no new buffers will be acquired for the rel
- * meanwhile.
- *
- * For a myxactonly relation only the local buffer descriptors are
- * scanned; otherwise the shared descriptors are scanned while holding
- * BufMgrLock.
- *
- * XXX currently it sequentially searches the buffer pool, should be
- * changed to more clever ways of searching.
- * --------------------------------------------------------------------
- */
-void
-DropRelationBuffers(Relation rel)
-{
- int i;
- BufferDesc *bufHdr;
-
- if (rel->rd_myxactonly)
- {
- for (i = 0; i < NLocBuffer; i++)
- {
- bufHdr = &LocalBufferDescriptors[i];
- if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
- {
- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
- bufHdr->cntxDirty = false;
- LocalRefCount[i] = 0;
- bufHdr->tag.rnode.relNode = InvalidOid;
- }
- }
- return;
- }
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- for (i = 1; i <= NBuffers; i++)
- {
- bufHdr = &BufferDescriptors[i - 1];
-recheck:
- if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
- {
- /*
- * If there is I/O in progress, better wait till it's done;
- * don't want to delete the relation out from under someone
- * who's just trying to flush the buffer!
- */
- if (bufHdr->flags & BM_IO_IN_PROGRESS)
- {
- WaitIO(bufHdr);
-
- /*
- * By now, the buffer very possibly belongs to some other
- * rel, so check again before proceeding. (WaitIO gives up
- * BufMgrLock while it waits.)
- */
- goto recheck;
- }
- /* Now we can do what we came for */
- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
- bufHdr->cntxDirty = false;
-
- /*
- * Release any refcount we may have.
- *
- * This is very probably dead code, and if it isn't then it's
- * probably wrong. I added the Assert to find out --- tgl
- * 11/99.
- */
- if (!(bufHdr->flags & BM_FREE))
- {
- /* Assert checks that buffer will actually get freed! */
- Assert(PrivateRefCount[i - 1] == 1 &&
- bufHdr->refcount == 1);
- ReleaseBufferWithBufferLock(i);
- }
-
- /*
- * And mark the buffer as no longer occupied by this rel.
- */
- BufTableDelete(bufHdr);
- }
- }
-
- LWLockRelease(BufMgrLock);
-}
-
-/* ---------------------------------------------------------------------
- * DropRelFileNodeBuffers
- *
- * This is the same as DropRelationBuffers, except that the target
- * relation is specified by RelFileNode. Because no Relation is
- * available to say whether the rel is local, both the local and the
- * shared buffer descriptors are always scanned.
- *
- * This is NOT rollback-able. One legitimate use is to clear the
- * buffer cache of buffers for a relation that is being deleted
- * during transaction abort.
- * --------------------------------------------------------------------
- */
-void
-DropRelFileNodeBuffers(RelFileNode rnode)
-{
- int i;
- BufferDesc *bufHdr;
-
- /* We have to search both local and shared buffers... */
-
- for (i = 0; i < NLocBuffer; i++)
- {
- bufHdr = &LocalBufferDescriptors[i];
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
- {
- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
- bufHdr->cntxDirty = false;
- LocalRefCount[i] = 0;
- bufHdr->tag.rnode.relNode = InvalidOid;
- }
- }
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- for (i = 1; i <= NBuffers; i++)
- {
- bufHdr = &BufferDescriptors[i - 1];
-recheck:
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
- {
- /*
- * If there is I/O in progress, better wait till it's done;
- * don't want to delete the relation out from under someone
- * who's just trying to flush the buffer!
- */
- if (bufHdr->flags & BM_IO_IN_PROGRESS)
- {
- WaitIO(bufHdr);
-
- /*
- * By now, the buffer very possibly belongs to some other
- * rel, so check again before proceeding. (WaitIO gives up
- * BufMgrLock while it waits.)
- */
- goto recheck;
- }
- /* Now we can do what we came for */
- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
- bufHdr->cntxDirty = false;
-
- /*
- * Release any refcount we may have.
- *
- * This is very probably dead code, and if it isn't then it's
- * probably wrong. I added the Assert to find out --- tgl
- * 11/99.
- */
- if (!(bufHdr->flags & BM_FREE))
- {
- /* Assert checks that buffer will actually get freed! */
- Assert(PrivateRefCount[i - 1] == 1 &&
- bufHdr->refcount == 1);
- ReleaseBufferWithBufferLock(i);
- }
-
- /*
- * And mark the buffer as no longer occupied by this rel.
- */
- BufTableDelete(bufHdr);
- }
- }
-
- LWLockRelease(BufMgrLock);
-}
-
-/* ---------------------------------------------------------------------
- * DropBuffers
- *
- * This function removes all the buffers in the buffer cache for a
- * particular database. Dirty pages are simply dropped, without
- * bothering to write them out first. This is used when we destroy a
- * database, to avoid trying to flush data to disk when the directory
- * tree no longer exists. Implementation is pretty similar to
- * DropRelationBuffers() which is for destroying just one relation.
- *
- * Only the shared buffer descriptors are scanned, and the whole scan
- * runs with BufMgrLock held exclusively. A buffer is selected when
- * its tag's tblNode equals dbid.
- * --------------------------------------------------------------------
- */
-void
-DropBuffers(Oid dbid)
-{
- int i;
- BufferDesc *bufHdr;
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- for (i = 1; i <= NBuffers; i++) /* i is the 1-based buffer number */
- {
- bufHdr = &BufferDescriptors[i - 1];
-recheck:
-
- /*
- * We know that currently database OID is tblNode but this
- * probably will be changed in future and this func will be used
- * to drop tablespace buffers.
- */
- if (bufHdr->tag.rnode.tblNode == dbid)
- {
- /*
- * If there is I/O in progress, better wait till it's done;
- * don't want to delete the database out from under someone
- * who's just trying to flush the buffer!
- */
- if (bufHdr->flags & BM_IO_IN_PROGRESS)
- {
- WaitIO(bufHdr);
-
- /*
- * By now, the buffer very possibly belongs to some other
- * DB, so check again before proceeding.
- */
- goto recheck;
- }
- /* Now we can do what we came for */
- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
- bufHdr->cntxDirty = false;
-
- /*
- * The thing should be free, if caller has checked that no
- * backends are running in that database.
- */
- Assert(bufHdr->flags & BM_FREE);
-
- /*
- * And mark the buffer as no longer occupied by this page.
- */
- BufTableDelete(bufHdr);
- }
- }
-
- LWLockRelease(BufMgrLock);
-}
-
-/* -----------------------------------------------------------------
- * PrintBufferDescs
- *
- * this function prints all the buffer descriptors, for debugging
- * use only.
- *
- * When IsUnderPostmaster is true, each shared buffer descriptor is
- * reported with elog(LOG) while BufMgrLock is held exclusively.
- * Otherwise (interactive backend) a shorter line per descriptor is
- * written with printf() and no lock is taken.
- * -----------------------------------------------------------------
- */
-void
-PrintBufferDescs()
-{
- int i;
- BufferDesc *buf = BufferDescriptors; /* advanced in step with i */
-
- if (IsUnderPostmaster)
- {
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- for (i = 0; i < NBuffers; ++i, ++buf)
- {
- elog(LOG, "[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u, \
-blockNum=%u, flags=0x%x, refcount=%d %ld)",
- i, buf->freeNext, buf->freePrev,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum, buf->flags,
- buf->refcount, PrivateRefCount[i]); /* shared count, then this backend's count */
- }
- LWLockRelease(BufMgrLock);
- }
- else
- {
- /* interactive backend */
- for (i = 0; i < NBuffers; ++i, ++buf)
- {
- printf("[%-2d] (%u/%u, %u) flags=0x%x, refcnt=%d %ld)\n", /* NOTE(review): trailing ')' in the format has no matching '(' */
- i, buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum,
- buf->flags, buf->refcount, PrivateRefCount[i]);
- }
- }
-}
-
-void
-PrintPinnedBufs() /* report, via elog(WARNING), each shared buffer whose PrivateRefCount entry is positive */
-{
- int i;
- BufferDesc *buf = BufferDescriptors; /* advanced in step with i */
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); /* held for the whole scan */
- for (i = 0; i < NBuffers; ++i, ++buf)
- {
- if (PrivateRefCount[i] > 0)
- elog(WARNING, "[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u, \
-blockNum=%u, flags=0x%x, refcount=%d %ld)",
- i, buf->freeNext, buf->freePrev,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum, buf->flags,
- buf->refcount, PrivateRefCount[i]);
- }
- LWLockRelease(BufMgrLock);
-}
-
-/*
- * BufferPoolBlowaway
- *
- * this routine is solely for the purpose of experiments -- sometimes
- * you may want to blow away whatever is left from the past in the
- * buffer pool and start measuring some performance with a clean empty
- * buffer pool.
- *
- * Compiled only when NOT_USED is defined. It calls BufferSync(), then
- * for every shared buffer number calls ReleaseBuffer() for as long as
- * BufferIsValid() reports true, and finally calls BufTableDelete() on
- * that buffer's descriptor.
- */
-#ifdef NOT_USED
-void
-BufferPoolBlowaway()
-{
- int i;
-
- BufferSync();
- for (i = 1; i <= NBuffers; i++) /* i is the 1-based buffer number */
- {
- if (BufferIsValid(i))
- {
- while (BufferIsValid(i))
- ReleaseBuffer(i);
- }
- BufTableDelete(&BufferDescriptors[i - 1]);
- }
-}
-#endif
-
-/* ---------------------------------------------------------------------
- * FlushRelationBuffers
- *
- * This function writes all dirty pages of a relation out to disk.
- * Furthermore, pages that have blocknumber >= firstDelBlock are
- * actually removed from the buffer pool. An error code is returned
- * if we fail to dump a dirty buffer or if we find one of
- * the target pages is pinned into the cache.
- *
- * This is called by DROP TABLE to clear buffers for the relation
- * from the buffer pool. Note that we must write dirty buffers,
- * rather than just dropping the changes, because our transaction
- * might abort later on; we want to roll back safely in that case.
- *
- * This is also called by VACUUM before truncating the relation to the
- * given number of blocks. It might seem unnecessary for VACUUM to
- * write dirty pages before firstDelBlock, since VACUUM should already
- * have committed its changes. However, it is possible for there still
- * to be dirty pages: if some page had unwritten on-row tuple status
- * updates from a prior transaction, and VACUUM had no additional
- * changes to make to that page, then VACUUM won't have written it.
- * This is harmless in most cases but will break pg_upgrade, which
- * relies on VACUUM to ensure that *all* tuples have correct on-row
- * status. So, we check and flush all dirty pages of the rel
- * regardless of block number.
- *
- * In all cases, the caller should be holding AccessExclusiveLock on
- * the target relation to ensure that no other backend is busy reading
- * more blocks of the relation (or might do so before we commit).
- *
- * Formerly, we considered it an error condition if we found dirty
- * buffers here. However, since BufferSync no longer forces out all
- * dirty buffers at every xact commit, it's possible for dirty buffers
- * to still be present in the cache due to failure of an earlier
- * transaction. So, must flush dirty buffers without complaint.
- *
- * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED
- *
- * Note that -1 is returned only on the local-buffer path (when
- * rel->rd_myxactonly is set); a failed write of a shared buffer is
- * reported with elog(PANIC) instead. A WARNING is logged before
- * either non-zero return.
- *
- * XXX currently it sequentially searches the buffer pool, should be
- * changed to more clever ways of searching.
- * --------------------------------------------------------------------
- */
-int
-FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
-{
- int i;
- BufferDesc *bufHdr;
- XLogRecPtr recptr;
- int status;
-
- if (rel->rd_myxactonly) /* only the local buffer array is scanned in this case */
- {
- for (i = 0; i < NLocBuffer; i++)
- {
- bufHdr = &LocalBufferDescriptors[i];
- if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
- {
- if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
- {
- status = smgrwrite(DEFAULT_SMGR, rel,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
- if (status == SM_FAIL)
- {
- elog(WARNING, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
- RelationGetRelationName(rel), firstDelBlock,
- bufHdr->tag.blockNum);
- return (-1);
- }
- bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
- bufHdr->cntxDirty = false;
- }
- if (LocalRefCount[i] > 0)
- {
- elog(WARNING, "FlushRelationBuffers(%s (local), %u): block %u is referenced (%ld)",
- RelationGetRelationName(rel), firstDelBlock,
- bufHdr->tag.blockNum, LocalRefCount[i]);
- return (-2);
- }
- if (bufHdr->tag.blockNum >= firstDelBlock)
- bufHdr->tag.rnode.relNode = InvalidOid; /* buffer no longer matches this rel */
- }
- }
- return 0;
- }
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- for (i = 0; i < NBuffers; i++) /* 0-based index here, unlike the Drop* routines */
- {
- bufHdr = &BufferDescriptors[i];
- if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
- {
- if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
- {
- PinBuffer(bufHdr); /* pin is dropped by the UnpinBuffer() call below */
- if (bufHdr->flags & BM_IO_IN_PROGRESS)
- WaitIO(bufHdr);
- LWLockRelease(BufMgrLock); /* BufMgrLock is not held across XLogFlush() */
-
- /*
- * Force XLOG flush for buffer's LSN
- */
- recptr = BufferGetLSN(bufHdr);
- XLogFlush(recptr);
-
- /*
- * Now it's safe to write buffer to disk
- */
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- if (bufHdr->flags & BM_IO_IN_PROGRESS)
- WaitIO(bufHdr);
-
- if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) /* re-test: the lock was released above */
- {
- bufHdr->flags &= ~BM_JUST_DIRTIED;
- StartBufferIO(bufHdr, false); /* output IO start */
-
- LWLockRelease(BufMgrLock); /* the write itself is done without BufMgrLock */
-
- status = smgrwrite(DEFAULT_SMGR, rel,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
-
- if (status == SM_FAIL) /* disk failure ?! */
- elog(PANIC, "FlushRelationBuffers: cannot write %u for %u/%u",
- bufHdr->tag.blockNum,
- bufHdr->tag.rnode.tblNode,
- bufHdr->tag.rnode.relNode);
-
- BufferFlushCount++;
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- bufHdr->flags &= ~BM_IO_IN_PROGRESS;
- TerminateBufferIO(bufHdr);
- Assert(!(bufHdr->flags & BM_JUST_DIRTIED));
- bufHdr->flags &= ~BM_DIRTY;
-
- /*
- * Note that it's safe to change cntxDirty here
- * because we protect it from upper writers by
- * AccessExclusiveLock and from other bufmgr routines
- * by BM_IO_IN_PROGRESS
- */
- bufHdr->cntxDirty = false;
- }
- UnpinBuffer(bufHdr);
- }
- if (!(bufHdr->flags & BM_FREE))
- {
- LWLockRelease(BufMgrLock); /* release before reporting and returning */
- elog(WARNING, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)",
- RelationGetRelationName(rel), firstDelBlock,
- bufHdr->tag.blockNum,
- PrivateRefCount[i], bufHdr->refcount);
- return -2;
- }
- if (bufHdr->tag.blockNum >= firstDelBlock)
- BufTableDelete(bufHdr);
- }
- }
- LWLockRelease(BufMgrLock);
- return 0;
-}
-
-#undef ReleaseBuffer
-
-/*
- * ReleaseBuffer -- remove the pin on a buffer without
- * marking it dirty.
- *
- * For a local buffer only the LocalRefCount entry is decremented.
- * For a shared buffer, the backend-private count is decremented if it
- * is greater than one; otherwise UnpinBuffer() is called with
- * BufMgrLock held exclusively.
- *
- * Returns STATUS_OK, or STATUS_ERROR if BAD_BUFFER_ID(buffer) is true.
- */
-int
-ReleaseBuffer(Buffer buffer)
-{
- BufferDesc *bufHdr;
-
- if (BufferIsLocal(buffer))
- {
- Assert(LocalRefCount[-buffer - 1] > 0); /* local buffers are indexed by -buffer - 1 */
- LocalRefCount[-buffer - 1]--;
- return STATUS_OK;
- }
-
- if (BAD_BUFFER_ID(buffer))
- return STATUS_ERROR;
-
- bufHdr = &BufferDescriptors[buffer - 1];
-
- Assert(PrivateRefCount[buffer - 1] > 0);
- if (PrivateRefCount[buffer - 1] > 1)
- PrivateRefCount[buffer - 1]--; /* this branch takes no lock */
- else
- {
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- UnpinBuffer(bufHdr);
- LWLockRelease(BufMgrLock);
- }
-
- return STATUS_OK;
-}
-
-/*
- * ReleaseBufferWithBufferLock
- * Same as ReleaseBuffer except we hold the bufmgr lock
- *
- * The only difference in the code is that UnpinBuffer() is called
- * directly, without acquiring or releasing BufMgrLock here.
- *
- * Returns STATUS_OK, or STATUS_ERROR if BAD_BUFFER_ID(buffer) is true.
- */
-static int
-ReleaseBufferWithBufferLock(Buffer buffer)
-{
- BufferDesc *bufHdr;
-
- if (BufferIsLocal(buffer))
- {
- Assert(LocalRefCount[-buffer - 1] > 0);
- LocalRefCount[-buffer - 1]--;
- return STATUS_OK;
- }
-
- if (BAD_BUFFER_ID(buffer))
- return STATUS_ERROR;
-
- bufHdr = &BufferDescriptors[buffer - 1];
-
- Assert(PrivateRefCount[buffer - 1] > 0);
- if (PrivateRefCount[buffer - 1] > 1)
- PrivateRefCount[buffer - 1]--;
- else
- UnpinBuffer(bufHdr); /* caller already holds BufMgrLock */
-
- return STATUS_OK;
-}
-
-
-#ifdef NOT_USED
-void
-IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) /* tracing wrapper around IncrBufferRefCount(); file/line are only printed */
-{
- IncrBufferRefCount(buffer);
- if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) /* trace shared buffers only */
- {
- BufferDesc *buf = &BufferDescriptors[buffer - 1];
-
- fprintf(stderr, "PIN(Incr) %d rel = %u/%u, blockNum = %u, \
-refcount = %ld, file: %s, line: %d\n",
- buffer,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum,
- PrivateRefCount[buffer - 1], file, line);
- }
-}
-#endif
-
-#ifdef NOT_USED
-void
-ReleaseBuffer_Debug(char *file, int line, Buffer buffer) /* tracing wrapper around ReleaseBuffer(); its return value is discarded */
-{
- ReleaseBuffer(buffer);
- if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) /* trace shared buffers only */
- {
- BufferDesc *buf = &BufferDescriptors[buffer - 1];
-
- fprintf(stderr, "UNPIN(Rel) %d rel = %u/%u, blockNum = %u, \
-refcount = %ld, file: %s, line: %d\n",
- buffer,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum,
- PrivateRefCount[buffer - 1], file, line); /* count as it stands after the release */
- }
-}
-#endif
-
-#ifdef NOT_USED
-int
-ReleaseAndReadBuffer_Debug(char *file, /* tracing wrapper around ReleaseAndReadBuffer(); returns its result */
- int line,
- Buffer buffer,
- Relation relation,
- BlockNumber blockNum)
-{
- bool bufferValid;
- Buffer b;
-
- bufferValid = BufferIsValid(buffer); /* sampled before the buffer is released */
- b = ReleaseAndReadBuffer(buffer, relation, blockNum);
- if (ShowPinTrace && bufferValid && BufferIsLocal(buffer)
- && is_userbuffer(buffer))
- { /*
- * NOTE(review): the sibling *_Debug routines test
- * !BufferIsLocal(buffer) before indexing BufferDescriptors and
- * PrivateRefCount with buffer - 1; the un-negated test here looks
- * inverted. Confirm before ever enabling this code.
- */
- BufferDesc *buf = &BufferDescriptors[buffer - 1];
-
- fprintf(stderr, "UNPIN(Rel&Rd) %d rel = %u/%u, blockNum = %u, \
-refcount = %ld, file: %s, line: %d\n",
- buffer,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum,
- PrivateRefCount[buffer - 1], file, line);
- }
- if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer))
- { /*
- * NOTE(review): the condition examines the old "buffer", but the
- * trace below reports on the newly returned "b"; it presumably
- * was meant to test !BufferIsLocal(b). Confirm.
- */
- BufferDesc *buf = &BufferDescriptors[b - 1];
-
- fprintf(stderr, "PIN(Rel&Rd) %d rel = %u/%u, blockNum = %u, \
-refcount = %ld, file: %s, line: %d\n",
- b,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
- buf->tag.blockNum,
- PrivateRefCount[b - 1], file, line);
- }
- return b;
-}
-#endif
-
-#ifdef BMTRACE
-
-/*
- * trace allocations and deallocations in a circular buffer in
- * shared memory. check the buffer before doing the allocation,
- * and die if there's anything fishy.
- *
- * The ring is walked backwards, starting at the slot just before
- * *CurTraceBuf. The first used entry found for bufNo is accepted if
- * it is a BMT_DEALLOC or if it carries the same dbId/relId/blkNo;
- * otherwise _bm_die() is called. Once an acceptable entry is found,
- * or the whole ring has been examined, the new operation is stored in
- * slot *CurTraceBuf and *CurTraceBuf is advanced modulo BMT_LIMIT.
- */
-
-void
-_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType)
-{
- long start,
- cur;
- bmtrace *tb;
-
- start = *CurTraceBuf; /* slot that will receive the new entry */
-
- if (start > 0)
- cur = start - 1;
- else
- cur = BMT_LIMIT - 1; /* wrap around to the last slot */
-
- for (;;)
- {
- tb = &TraceBuf[cur];
- if (tb->bmt_op != BMT_NOTUSED)
- {
- if (tb->bmt_buf == bufNo)
- {
- if ((tb->bmt_op == BMT_DEALLOC)
- || (tb->bmt_dbid == dbId && tb->bmt_relid == relId
- && tb->bmt_blkno == blkNo))
- goto okay;
-
- /* die holding the buffer lock */
- _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur);
- }
- }
-
- if (cur == start) /* every slot has been examined */
- goto okay;
-
- if (cur == 0)
- cur = BMT_LIMIT - 1;
- else
- cur--;
- }
-
-okay:
- tb = &TraceBuf[start];
- tb->bmt_pid = MyProcPid;
- tb->bmt_buf = bufNo;
- tb->bmt_dbid = dbId;
- tb->bmt_relid = relId;
- tb->bmt_blkno = blkNo;
- tb->bmt_op = allocType;
-
- *CurTraceBuf = (start + 1) % BMT_LIMIT;
-}
-
-void
-_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, /* dump the trace ring to /tmp/death_notice, then send SIGILL to this process */
- int allocType, long start, long cur)
-{
- FILE *fp;
- bmtrace *tb;
- int i;
-
- tb = &TraceBuf[cur]; /* the earlier entry that conflicts with this operation */
-
- if ((fp = AllocateFile("/tmp/death_notice", "w")) == NULL)
- elog(FATAL, "buffer alloc trace error and can't open log file");
-
- fprintf(fp, "buffer alloc trace detected the following error:\n\n");
- fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n",
- bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"),
- (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation"));
-
- fprintf(fp, "the trace buffer contains:\n");
-
- i = start; /* walk forward through the whole ring, wrapping at BMT_LIMIT */
- for (;;)
- {
- tb = &TraceBuf[i];
- if (tb->bmt_op != BMT_NOTUSED)
- {
- fprintf(fp, " [%3d]%spid %d buf %2d for <%u,%u,%u> ",
- i, (i == cur ? " ---> " : "\t"), /* arrow marks the conflicting entry */
- tb->bmt_pid, tb->bmt_buf,
- tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno);
-
- switch (tb->bmt_op)
- {
- case BMT_ALLOCFND:
- fprintf(fp, "allocate (found)\n");
- break;
-
- case BMT_ALLOCNOTFND:
- fprintf(fp, "allocate (not found)\n");
- break;
-
- case BMT_DEALLOC:
- fprintf(fp, "deallocate\n");
- break;
-
- default:
- fprintf(fp, "unknown op type %d\n", tb->bmt_op);
- break;
- }
- }
-
- i = (i + 1) % BMT_LIMIT;
- if (i == start)
- break;
- }
-
- fprintf(fp, "\noperation causing error:\n");
- fprintf(fp, "\tpid %d buf %d for <%d,%u,%d> ", /* NOTE(review): dbId is an Oid but is printed with %d here, unlike the %u used in the loop above */
- getpid(), bufNo, dbId, relId, blkNo);
-
- switch (allocType)
- {
- case BMT_ALLOCFND:
- fprintf(fp, "allocate (found)\n");
- break;
-
- case BMT_ALLOCNOTFND:
- fprintf(fp, "allocate (not found)\n");
- break;
-
- case BMT_DEALLOC:
- fprintf(fp, "deallocate\n");
- break;
-
- default:
- fprintf(fp, "unknown op type %d\n", allocType);
- break;
- }
-
- FreeFile(fp);
-
- kill(getpid(), SIGILL);
-}
-#endif /* BMTRACE */
-
-/*
- * SetBufferCommitInfoNeedsSave
- *
- * Mark a buffer dirty when we have updated tuple commit-status bits in it.
- *
- * This is similar to WriteNoReleaseBuffer, except that we have not made a
- * critical change that has to be flushed to disk before xact commit --- the
- * status-bit update could be redone by someone else just as easily.
- *
- * This routine might get called many times on the same page, if we are making
- * the first scan after commit of an xact that added/deleted many tuples.
- * So, be as quick as we can if the buffer is already dirty.
- *
- * Nothing is done for local buffers or for a buffer number rejected by
- * BAD_BUFFER_ID(). For shared buffers the flags are first inspected
- * without taking BufMgrLock; the lock is acquired only when BM_DIRTY and
- * BM_JUST_DIRTIED are not both already set.
- */
-void
-SetBufferCommitInfoNeedsSave(Buffer buffer)
-{
- BufferDesc *bufHdr;
-
- if (BufferIsLocal(buffer))
- return;
-
- if (BAD_BUFFER_ID(buffer))
- return;
-
- bufHdr = &BufferDescriptors[buffer - 1];
-
- if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
- (BM_DIRTY | BM_JUST_DIRTIED)) /* unlocked check: skip the lock if both bits are set */
- {
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- Assert(bufHdr->refcount > 0);
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- LWLockRelease(BufMgrLock);
- }
-}
-
-/*
- * Release buffer context locks for shared buffers.
- *
- * Used to clean up after errors.
- *
- * Every nonzero BufferLocks[] entry is reset to zero. If the entry has
- * BL_PIN_COUNT_LOCK set, the buffer's BM_PIN_COUNT_WAITER flag is cleared
- * when this backend is the recorded waiter, and ProcCancelWaitForSignal()
- * is called.
- */
-void
-UnlockBuffers(void)
-{
- BufferDesc *buf;
- int i;
-
- for (i = 0; i < NBuffers; i++)
- {
- bits8 buflocks = BufferLocks[i];
-
- if (buflocks == 0)
- continue;
-
- Assert(BufferIsValid(i + 1)); /* buffer numbers are 1-based */
- buf = &(BufferDescriptors[i]);
-
- HOLD_INTERRUPTS(); /* don't want to die() partway through... */
-
- /*
- * The buffer's cntx_lock has already been released by lwlock.c.
- */
-
- if (buflocks & BL_PIN_COUNT_LOCK)
- {
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- /*
- * Don't complain if flag bit not set; it could have been
- * reset but we got a cancel/die interrupt before getting the
- * signal.
- */
- if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
- buf->wait_backend_id == MyBackendId)
- buf->flags &= ~BM_PIN_COUNT_WAITER;
- LWLockRelease(BufMgrLock);
- ProcCancelWaitForSignal();
- }
-
- BufferLocks[i] = 0;
-
- RESUME_INTERRUPTS();
- }
-}
-
-/*
- * Acquire or release the cntx_lock for the buffer.
- *
- * mode is one of BUFFER_LOCK_UNLOCK, BUFFER_LOCK_SHARE or
- * BUFFER_LOCK_EXCLUSIVE; any other value raises elog(ERROR). Nothing
- * is done for local buffers. Taking the lock in exclusive mode also
- * sets the buffer's cntxDirty flag.
- */
-void
-LockBuffer(Buffer buffer, int mode)
-{
- BufferDesc *buf;
-
- Assert(BufferIsValid(buffer));
- if (BufferIsLocal(buffer))
- return;
-
- buf = &(BufferDescriptors[buffer - 1]);
-
- if (mode == BUFFER_LOCK_UNLOCK)
- LWLockRelease(buf->cntx_lock);
- else if (mode == BUFFER_LOCK_SHARE)
- LWLockAcquire(buf->cntx_lock, LW_SHARED);
- else if (mode == BUFFER_LOCK_EXCLUSIVE)
- {
- LWLockAcquire(buf->cntx_lock, LW_EXCLUSIVE);
-
- /*
- * This is not the best place to set cntxDirty flag (eg indices do
- * not always change buffer they lock in excl mode). But please
- * remember that it's critical to set cntxDirty *before* logging
- * changes with XLogInsert() - see comments in BufferSync().
- */
- buf->cntxDirty = true;
- }
- else
- elog(ERROR, "LockBuffer: unknown lock mode %d", mode);
-}
-
-/*
- * LockBufferForCleanup - lock a buffer in preparation for deleting items
- *
- * Items may be deleted from a disk page only when the caller (a) holds an
- * exclusive lock on the buffer and (b) has observed that no other backend
- * holds a pin on the buffer. If there is a pin, then the other backend
- * might have a pointer into the buffer (for example, a heapscan reference
- * to an item --- see README for more details). It's OK if a pin is added
- * after the cleanup starts, however; the newly-arrived backend will be
- * unable to look at the page until we release the exclusive lock.
- *
- * To implement this protocol, a would-be deleter must pin the buffer and
- * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
- * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
- * it has successfully observed pin count = 1.
- *
- * elog(ERROR) is raised if the caller's own pin count on the buffer is
- * not exactly one, or if BM_PIN_COUNT_WAITER is already set on the buffer
- * when this backend needs to wait.
- */
-void
-LockBufferForCleanup(Buffer buffer)
-{
- BufferDesc *bufHdr;
- bits8 *buflock;
-
- Assert(BufferIsValid(buffer));
-
- if (BufferIsLocal(buffer))
- {
- /* There should be exactly one pin */
- if (LocalRefCount[-buffer - 1] != 1)
- elog(ERROR, "LockBufferForCleanup: wrong local pin count");
- /* Nobody else to wait for */
- return;
- }
-
- /* There should be exactly one local pin */
- if (PrivateRefCount[buffer - 1] != 1)
- elog(ERROR, "LockBufferForCleanup: wrong local pin count");
-
- bufHdr = &BufferDescriptors[buffer - 1];
- buflock = &(BufferLocks[buffer - 1]);
-
- for (;;)
- {
- /* Try to acquire lock */
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); /* refcount and flags are examined under BufMgrLock */
- Assert(bufHdr->refcount > 0);
- if (bufHdr->refcount == 1)
- {
- /* Successfully acquired exclusive lock with pincount 1 */
- LWLockRelease(BufMgrLock); /* the buffer's exclusive lock stays held on return */
- return;
- }
- /* Failed, so mark myself as waiting for pincount 1 */
- if (bufHdr->flags & BM_PIN_COUNT_WAITER)
- {
- LWLockRelease(BufMgrLock);
- LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
- elog(ERROR, "Multiple backends attempting to wait for pincount 1");
- }
- bufHdr->wait_backend_id = MyBackendId;
- bufHdr->flags |= BM_PIN_COUNT_WAITER;
- *buflock |= BL_PIN_COUNT_LOCK; /* lets UnlockBuffers() undo the waiter state after an error */
- LWLockRelease(BufMgrLock);
- LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
- /* Wait to be signaled by UnpinBuffer() */
- ProcWaitForSignal();
- *buflock &= ~BL_PIN_COUNT_LOCK;
- /* Loop back and try again */
- }
-}
-
-/*
- * Functions for IO error handling
- *
- * Note: We assume that nested buffer IO never occurs,
- * i.e., at most one io_in_progress lock is held per proc.
-*/
-static BufferDesc *InProgressBuf = (BufferDesc *) NULL; /* buffer this process is doing I/O on, if any */
-static bool IsForInput; /* forInput value last given to StartBufferIO/ContinueBufferIO */
-
-/*
- * Function:StartBufferIO
- * (Assumptions)
- * My process is executing no IO
- * BufMgrLock is held
- * BM_IO_IN_PROGRESS mask is not set for the buffer
- * The buffer is Pinned
- *
- * Because BufMgrLock is held, we are already in an interrupt holdoff here,
- * and do not need another.
- *
- * Sets BM_IO_IN_PROGRESS on the buffer, acquires its io_in_progress_lock
- * exclusively, and records the buffer and I/O direction in InProgressBuf
- * and IsForInput.
- */
-static void
-StartBufferIO(BufferDesc *buf, bool forInput)
-{
- Assert(!InProgressBuf);
- Assert(!(buf->flags & BM_IO_IN_PROGRESS));
- buf->flags |= BM_IO_IN_PROGRESS;
-
- LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
-
- InProgressBuf = buf;
- IsForInput = forInput;
-}
-
-/*
- * Function:TerminateBufferIO
- * (Assumptions)
- * My process is executing IO for the buffer
- * BufMgrLock is held
- * The buffer is Pinned
- *
- * Because BufMgrLock is held, we are already in an interrupt holdoff here,
- * and do not need another.
- *
- * Releases the buffer's io_in_progress_lock and clears InProgressBuf.
- * The BM_IO_IN_PROGRESS flag is not touched here; callers visible in
- * this file clear it themselves before calling.
- */
-static void
-TerminateBufferIO(BufferDesc *buf)
-{
- Assert(buf == InProgressBuf);
- LWLockRelease(buf->io_in_progress_lock);
- InProgressBuf = (BufferDesc *) 0;
-}
-
-/*
- * Function:ContinueBufferIO
- * (Assumptions)
- * My process is executing IO for the buffer
- * BufMgrLock is held
- * The buffer is Pinned
- *
- * Because BufMgrLock is held, we are already in an interrupt holdoff here,
- * and do not need another.
- *
- * Only the recorded I/O direction (IsForInput) is updated; no lock or
- * buffer flag is changed.
- */
-static void
-ContinueBufferIO(BufferDesc *buf, bool forInput)
-{
- Assert(buf == InProgressBuf);
- Assert(buf->flags & BM_IO_IN_PROGRESS);
- IsForInput = forInput;
-}
-
-#ifdef NOT_USED
-void
-InitBufferIO(void) /* reset the record of in-progress buffer I/O; compiled only under NOT_USED */
-{
- InProgressBuf = (BufferDesc *) 0;
-}
-#endif
-
-/*
- * Clean up any active buffer I/O after an error.
- * BufMgrLock isn't held when this function is called.
- *
- * If I/O was in progress, we always set BM_IO_ERROR.
- *
- * Does nothing when InProgressBuf is not set. For an interrupted
- * write, BM_DIRTY is set again on the buffer, and a WARNING is logged
- * if BM_IO_ERROR was already set from an earlier failure.
- */
-void
-AbortBufferIO(void)
-{
- BufferDesc *buf = InProgressBuf;
-
- if (buf)
- {
- /*
- * Since LWLockReleaseAll has already been called, we're not
- * holding the buffer's io_in_progress_lock. We have to re-acquire
- * it so that we can use TerminateBufferIO. Anyone who's executing
- * WaitIO on the buffer will be in a busy spin until we succeed in
- * doing this.
- */
- LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
-
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- Assert(buf->flags & BM_IO_IN_PROGRESS);
- if (IsForInput)
- Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty)); /* a buffer being read in is expected to be clean */
- else
- {
- Assert(buf->flags & BM_DIRTY || buf->cntxDirty);
- /* Issue notice if this is not the first failure... */
- if (buf->flags & BM_IO_ERROR)
- {
- elog(WARNING, "write error may be permanent: cannot write block %u for %u/%u",
- buf->tag.blockNum,
- buf->tag.rnode.tblNode, buf->tag.rnode.relNode);
- }
- buf->flags |= BM_DIRTY;
- }
- buf->flags |= BM_IO_ERROR;
- buf->flags &= ~BM_IO_IN_PROGRESS;
- TerminateBufferIO(buf); /* releases io_in_progress_lock and clears InProgressBuf */
- LWLockRelease(BufMgrLock);
- }
-}
-
-RelFileNode
-BufferGetFileNode(Buffer buffer) /* return, by value, the RelFileNode stored in the buffer's tag */
-{
- BufferDesc *bufHdr;
-
- if (BufferIsLocal(buffer))
- bufHdr = &(LocalBufferDescriptors[-buffer - 1]); /* local buffers are indexed by -buffer - 1 */
- else
- bufHdr = &BufferDescriptors[buffer - 1]; /* no validity check or locking is done here */
-
- return (bufHdr->tag.rnode);
-}