From 3b97e6823b949624afdc3ce4c92b29a80429715f Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 16 Dec 2013 11:29:50 -0300 Subject: Rework tuple freezing protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund --- src/backend/access/transam/multixact.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'src/backend/access/transam/multixact.c') diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 05e1dcb49c5..55a8ca7ac49 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -289,7 +289,6 @@ static MemoryContext MXactContext = NULL; /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); -static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members); static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); @@ -336,6 +335,9 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + /* * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs * are still running. In typical usage, xid2 will be our own XID and the @@ -347,7 +349,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, members[1].xid = xid2; members[1].status = status2; - newMulti = CreateMultiXactId(2, members); + newMulti = MultiXactIdCreateFromMembers(2, members); debug_elog3(DEBUG2, "Create: %s", mxid_to_string(newMulti, 2, members)); @@ -387,6 +389,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) AssertArg(MultiXactIdIsValid(multi)); AssertArg(TransactionIdIsValid(xid)); + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s", multi, xid, mxstatus_to_string(status)); @@ -410,7 +415,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) */ member.xid = xid; member.status = status; - newMulti = CreateMultiXactId(1, &member); + newMulti = MultiXactIdCreateFromMembers(1, &member); debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", multi, newMulti); @@ -462,7 +467,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) newMembers[j].xid = xid; newMembers[j++].status = status; - newMulti = CreateMultiXactId(j, newMembers); + newMulti = MultiXactIdCreateFromMembers(j, newMembers); pfree(members); pfree(newMembers); @@ -667,16 +672,16 @@ ReadNextMultiXactId(void) } /* - * CreateMultiXactId - * Make a new MultiXactId + * MultiXactIdCreateFromMembers + * Make a new MultiXactId from the specified set of members * * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the * given TransactionIds as members. Returns the newly created MultiXactId. * * NB: the passed members[] array will be sorted in-place. */ -static MultiXactId -CreateMultiXactId(int nmembers, MultiXactMember *members) +MultiXactId +MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) { MultiXactId multi; MultiXactOffset offset; @@ -707,6 +712,13 @@ CreateMultiXactId(int nmembers, MultiXactMember *members) * Assign the MXID and offsets range to use, and make sure there is space * in the OFFSETs and MEMBERs files. NB: this routine does * START_CRIT_SECTION(). + * + * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check + * that we've called MultiXactIdSetOldestMember here. This is because + * this routine is used in some places to create new MultiXactIds of which + * the current backend is not a member, notably during freezing of multis + * in vacuum. During vacuum, in particular, it would be unacceptable to + * keep OldestMulti set, in case it runs for long. */ multi = GetNewMultiXactId(nmembers, &offset); @@ -763,7 +775,8 @@ CreateMultiXactId(int nmembers, MultiXactMember *members) * RecordNewMultiXact * Write info about a new multixact into the offsets and members files * - * This is broken out of CreateMultiXactId so that xlog replay can use it. + * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can + * use it. */ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, @@ -867,9 +880,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); - /* MultiXactIdSetOldestMember() must have been called already */ - Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); - /* safety check, we should never get this far in a HS slave */ if (RecoveryInProgress()) elog(ERROR, "cannot assign MultiXactIds during recovery"); -- cgit v1.2.3