summaryrefslogtreecommitdiff
path: root/src/include/storage/buf_internals.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/storage/buf_internals.h')
-rw-r--r--src/include/storage/buf_internals.h119
1 file changed, 92 insertions, 27 deletions
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c1206a46aba..5400c56a965 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -211,28 +211,36 @@ BufMappingPartitionLockByIndex(uint32 index)
/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
- * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
- * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
- * is a spinlock which is combined with flags, refcount and usagecount into
- * single atomic variable. This layout allow us to do some operations in a
- * single atomic operation, without actually acquiring and releasing spinlock;
- * for instance, increase or decrease refcount. buf_id field never changes
- * after initialization, so does not need locking. The LWLock can take care
- * of itself. The buffer header lock is *not* used to control access to the
- * data in the buffer!
+ * The state of the buffer is controlled by the, drumroll, state variable. It
+ * only may be modified using atomic operations. The state variable combines
+ * various flags, the buffer's refcount and usage count. See comment above
+ * BUF_REFCOUNT_BITS for details about the division. This layout allows us to
+ * do some operations in a single atomic operation, without actually acquiring
+ * and releasing the spinlock; for instance, increasing or decreasing the
+ * refcount.
*
- * It's assumed that nobody changes the state field while buffer header lock
- * is held. Thus buffer header lock holder can do complex updates of the
- * state variable in single write, simultaneously with lock release (cleaning
- * BM_LOCKED flag). On the other hand, updating of state without holding
- * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
- * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
+ * One of the aforementioned flags is BM_LOCKED, used to implement the buffer
+ * header lock. See the following paragraphs, as well as the documentation for
+ * individual fields, for more details.
*
- * An exception is that if we have the buffer pinned, its tag can't change
- * underneath us, so we can examine the tag without locking the buffer header.
- * Also, in places we do one-time reads of the flags without bothering to
- * lock the buffer header; this is generally for situations where we don't
- * expect the flag bit being tested to be changing.
+ * The identity of the buffer (BufferDesc.tag) can only be changed by the
+ * backend holding the buffer header lock.
+ *
+ * If the lock is held by another backend, neither may additional buffer pins
+ * be established (we would like to relax this eventually), nor can flags be
+ * set/cleared. These operations either need to acquire the buffer header
+ * spinlock, or need to use a CAS loop, waiting for the lock to be released if
+ * it is held. However, existing buffer pins may be released while the buffer
+ * header spinlock is held, using an atomic subtraction.
+ *
+ * The LWLock can take care of itself. The buffer header lock is *not* used
+ * to control access to the data in the buffer!
+ *
+ * If we have the buffer pinned, its tag can't change underneath us, so we can
+ * examine the tag without locking the buffer header. Also, in places we do
+ * one-time reads of the flags without bothering to lock the buffer header;
+ * this is generally for situations where we don't expect the flag bit being
+ * tested to be changing.
*
* We can't physically remove items from a disk page if another backend has
* the buffer pinned. Hence, a backend may need to wait for all other pins
@@ -256,13 +264,29 @@ BufMappingPartitionLockByIndex(uint32 index)
*/
typedef struct BufferDesc
{
- BufferTag tag; /* ID of page contained in buffer */
- int buf_id; /* buffer's index number (from 0) */
+ /*
+ * ID of page contained in buffer. The buffer header spinlock needs to be
+ * held to modify this field.
+ */
+ BufferTag tag;
+
+ /*
+ * Buffer's index number (from 0). The field never changes after
+ * initialization, so does not need locking.
+ */
+ int buf_id;
- /* state of the tag, containing flags, refcount and usagecount */
+ /*
+ * State of the buffer, containing flags, refcount and usagecount. See
+ * BUF_* and BM_* defines at the top of this file.
+ */
pg_atomic_uint32 state;
- int wait_backend_pgprocno; /* backend of pin-count waiter */
+ /*
+ * Backend of pin-count waiter. The buffer header spinlock needs to be
+ * held to modify this field.
+ */
+ int wait_backend_pgprocno;
PgAioWaitRef io_wref; /* set iff AIO is in progress */
LWLock content_lock; /* to lock access to buffer contents */
@@ -364,11 +388,52 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc)
*/
extern uint32 LockBufHdr(BufferDesc *desc);
+/*
+ * Unlock the buffer header.
+ *
+ * This can only be used if the caller did not modify BufferDesc.state. To
+ * set/unset flag bits or change the refcount use UnlockBufHdrExt().
+ */
static inline void
-UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
+UnlockBufHdr(BufferDesc *desc)
{
- pg_write_barrier();
- pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
+ Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+
+ pg_atomic_fetch_sub_u32(&desc->state, BM_LOCKED);
+}
+
+/*
+ * Unlock the buffer header, while atomically adding the flags in set_bits,
+ * unsetting the ones in unset_bits and changing the refcount by
+ * refcount_change.
+ *
+ * Note that this approach would not work for usagecount, since we need to cap
+ * the usagecount at BM_MAX_USAGE_COUNT.
+ */
+static inline uint32
+UnlockBufHdrExt(BufferDesc *desc, uint32 old_buf_state,
+ uint32 set_bits, uint32 unset_bits,
+ int refcount_change)
+{
+ for (;;)
+ {
+ uint32 buf_state = old_buf_state;
+
+ Assert(buf_state & BM_LOCKED);
+
+ buf_state |= set_bits;
+ buf_state &= ~unset_bits;
+ buf_state &= ~BM_LOCKED;
+
+ if (refcount_change != 0)
+ buf_state += BUF_REFCOUNT_ONE * refcount_change;
+
+ if (pg_atomic_compare_exchange_u32(&desc->state, &old_buf_state,
+ buf_state))
+ {
+ return old_buf_state;
+ }
+ }
}
extern uint32 WaitBufHdrUnlocked(BufferDesc *buf);