Diffstat (limited to 'src/include/storage/buf_internals.h')
-rw-r--r--   src/include/storage/buf_internals.h | 119
1 file changed, 92 insertions, 27 deletions
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index c1206a46aba..5400c56a965 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -211,28 +211,36 @@ BufMappingPartitionLockByIndex(uint32 index)
 /*
  * BufferDesc -- shared descriptor/state data for a single shared buffer.
  *
- * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
- * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
- * is a spinlock which is combined with flags, refcount and usagecount into
- * single atomic variable. This layout allow us to do some operations in a
- * single atomic operation, without actually acquiring and releasing spinlock;
- * for instance, increase or decrease refcount. buf_id field never changes
- * after initialization, so does not need locking. The LWLock can take care
- * of itself. The buffer header lock is *not* used to control access to the
- * data in the buffer!
+ * The state of the buffer is controlled by the, drumroll, state variable. It
+ * only may be modified using atomic operations. The state variable combines
+ * various flags, the buffer's refcount and usage count. See comment above
+ * BUF_REFCOUNT_BITS for details about the division. This layout allow us to
+ * do some operations in a single atomic operation, without actually acquiring
+ * and releasing the spinlock; for instance, increasing or decreasing the
+ * refcount.
  *
- * It's assumed that nobody changes the state field while buffer header lock
- * is held. Thus buffer header lock holder can do complex updates of the
- * state variable in single write, simultaneously with lock release (cleaning
- * BM_LOCKED flag). On the other hand, updating of state without holding
- * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
- * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
+ * One of the aforementioned flags is BM_LOCKED, used to implement the buffer
+ * header lock. See the following paragraphs, as well as the documentation for
+ * individual fields, for more details.
  *
- * An exception is that if we have the buffer pinned, its tag can't change
- * underneath us, so we can examine the tag without locking the buffer header.
- * Also, in places we do one-time reads of the flags without bothering to
- * lock the buffer header; this is generally for situations where we don't
- * expect the flag bit being tested to be changing.
+ * The identity of the buffer (BufferDesc.tag) can only be changed by the
+ * backend holding the buffer header lock.
+ *
+ * If the lock is held by another backend, neither additional buffer pins may
+ * be established (we would like to relax this eventually), nor can flags be
+ * set/cleared. These operations either need to acquire the buffer header
+ * spinlock, or need to use a CAS loop, waiting for the lock to be released if
+ * it is held. However, existing buffer pins may be released while the buffer
+ * header spinlock is held, using an atomic subtraction.
+ *
+ * The LWLock can take care of itself. The buffer header lock is *not* used
+ * to control access to the data in the buffer!
+ *
+ * If we have the buffer pinned, its tag can't change underneath us, so we can
+ * examine the tag without locking the buffer header. Also, in places we do
+ * one-time reads of the flags without bothering to lock the buffer header;
+ * this is generally for situations where we don't expect the flag bit being
+ * tested to be changing.
  *
  * We can't physically remove items from a disk page if another backend has
  * the buffer pinned. Hence, a backend may need to wait for all other pins
@@ -256,13 +264,29 @@ BufMappingPartitionLockByIndex(uint32 index)
  */
 typedef struct BufferDesc
 {
-    BufferTag   tag;            /* ID of page contained in buffer */
-    int         buf_id;         /* buffer's index number (from 0) */
+    /*
+     * ID of page contained in buffer. The buffer header spinlock needs to be
+     * held to modify this field.
+     */
+    BufferTag   tag;
+
+    /*
+     * Buffer's index number (from 0). The field never changes after
+     * initialization, so does not need locking.
+     */
+    int         buf_id;
 
-    /* state of the tag, containing flags, refcount and usagecount */
+    /*
+     * State of the buffer, containing flags, refcount and usagecount. See
+     * BUF_* and BM_* defines at the top of this file.
+     */
     pg_atomic_uint32 state;
 
-    int         wait_backend_pgprocno;  /* backend of pin-count waiter */
+    /*
+     * Backend of pin-count waiter. The buffer header spinlock needs to be
+     * held to modify this field.
+     */
+    int         wait_backend_pgprocno;
     PgAioWaitRef io_wref;       /* set iff AIO is in progress */
 
     LWLock      content_lock;   /* to lock access to buffer contents */
@@ -364,11 +388,52 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc)
  */
 extern uint32 LockBufHdr(BufferDesc *desc);
 
+/*
+ * Unlock the buffer header.
+ *
+ * This can only be used if the caller did not modify BufferDesc.state. To
+ * set/unset flag bits or change the refcount use UnlockBufHdrExt().
+ */
 static inline void
-UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
+UnlockBufHdr(BufferDesc *desc)
 {
-    pg_write_barrier();
-    pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
+    Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
+
+    pg_atomic_fetch_sub_u32(&desc->state, BM_LOCKED);
+}
+
+/*
+ * Unlock the buffer header, while atomically adding the flags in set_bits,
+ * unsetting the ones in unset_bits and changing the refcount by
+ * refcount_change.
+ *
+ * Note that this approach would not work for usagecount, since we need to cap
+ * the usagecount at BM_MAX_USAGE_COUNT.
+ */
+static inline uint32
+UnlockBufHdrExt(BufferDesc *desc, uint32 old_buf_state,
+                uint32 set_bits, uint32 unset_bits,
+                int refcount_change)
+{
+    for (;;)
+    {
+        uint32      buf_state = old_buf_state;
+
+        Assert(buf_state & BM_LOCKED);
+
+        buf_state |= set_bits;
+        buf_state &= ~unset_bits;
+        buf_state &= ~BM_LOCKED;
+
+        if (refcount_change != 0)
+            buf_state += BUF_REFCOUNT_ONE * refcount_change;
+
+        if (pg_atomic_compare_exchange_u32(&desc->state, &old_buf_state,
+                                           buf_state))
+        {
+            return old_buf_state;
+        }
+    }
 }
 
 extern uint32 WaitBufHdrUnlocked(BufferDesc *buf);
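
For illustration only, not part of the commit: under the protocol spelled out in the rewritten header comment, a backend that wants to take an additional pin has to use a CAS loop and wait for BM_LOCKED to clear, since new pins may not be established while another backend holds the buffer header lock. The sketch below is loosely modeled on what PinBuffer() does in bufmgr.c; the function name SketchPinBuffer is made up, and usage-count maintenance and error handling are omitted.

#include "postgres.h"
#include "storage/buf_internals.h"

/* Hypothetical example -- not part of the patch or the PostgreSQL tree. */
static bool
SketchPinBuffer(BufferDesc *buf)
{
    uint32      old_buf_state = pg_atomic_read_u32(&buf->state);

    for (;;)
    {
        uint32      buf_state;

        /* No new pins may be taken while the header lock is held: wait. */
        if (old_buf_state & BM_LOCKED)
            old_buf_state = WaitBufHdrUnlocked(buf);

        buf_state = old_buf_state;

        /* Take one pin; usagecount handling is omitted in this sketch. */
        buf_state += BUF_REFCOUNT_ONE;

        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           buf_state))
            return (buf_state & BM_VALID) != 0;

        /* CAS failed: old_buf_state now holds the current value, retry. */
    }
}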

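Also for illustration, a hypothetical caller of the new UnlockBufHdrExt(): it releases the buffer header lock while setting BM_DIRTY and BM_JUST_DIRTIED and dropping one pin in a single atomic update, the kind of combined operation the helper exists for. SketchMarkDirtyAndUnpin is not a function in the tree; it assumes the same includes as the previous sketch.

/* Hypothetical example -- demonstrates the UnlockBufHdrExt() calling pattern. */
static void
SketchMarkDirtyAndUnpin(BufferDesc *buf)
{
    uint32      buf_state = LockBufHdr(buf);

    /* The caller is assumed to hold a pin of its own. */
    Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);

    /*
     * Set the dirty flags, drop one pin and release the buffer header lock
     * in one compare-and-exchange.
     */
    UnlockBufHdrExt(buf, buf_state,
                    BM_DIRTY | BM_JUST_DIRTIED, /* set_bits */
                    0,                          /* unset_bits */
                    -1);                        /* refcount_change */
}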