From 92fe23d93aa3bbbc40fca669cabc4a4d7975e327 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 4 Apr 2025 12:27:04 -0400 Subject: Add nbtree skip scan optimization. Teach nbtree multi-column index scans to opportunistically skip over irrelevant sections of the index given a query with no "=" conditions on one or more prefix index columns. When nbtree is passed input scan keys derived from a predicate "WHERE b = 5", new nbtree preprocessing steps output "WHERE a = ANY() AND b = 5" scan keys. That is, preprocessing generates a "skip array" (and an output scan key) for the omitted prefix column "a", which makes it safe to mark the scan key on "b" as required to continue the scan. The scan is therefore able to repeatedly reposition itself by applying both the "a" and "b" keys. A skip array has "elements" that are generated procedurally and on demand, but otherwise works just like a regular ScalarArrayOp array. Preprocessing can freely add a skip array before or after any input ScalarArrayOp arrays. Index scans with a skip array decide when and where to reposition the scan using the same approach as any other scan with array keys. This design builds on the design for array advancement and primitive scan scheduling added to Postgres 17 by commit 5bf748b8. Testing has shown that skip scans of an index with a low cardinality skipped prefix column can be multiple orders of magnitude faster than an equivalent full index scan (or sequential scan). In general, the cardinality of the scan's skipped column(s) limits the number of leaf pages that can be skipped over. The core B-Tree operator classes on most discrete types generate their array elements with the help of their own custom skip support routine. This infrastructure gives nbtree a way to generate the next required array element by incrementing (or decrementing) the current array value. It can reduce the number of index descents in cases where the next possible indexable value frequently turns out to be the next value stored in the index. Opclasses that lack a skip support routine fall back on having nbtree "increment" (or "decrement") a skip array's current element by setting the NEXT (or PRIOR) scan key flag, without directly changing the scan key's sk_argument. These sentinel values behave just like any other value from an array -- though they can never locate equal index tuples (they can only locate the next group of index tuples containing the next set of non-sentinel values that the scan's arrays need to advance to). A skip array's range is constrained by "contradictory" inequality keys. For example, a skip array on "x" will only generate the values 1 and 2 given a qual such as "WHERE x BETWEEN 1 AND 2 AND y = 66". Such a skip array qual usually has near-identical performance characteristics to a comparable SAOP qual "WHERE x = ANY('{1, 2}') AND y = 66". However, improved performance isn't guaranteed. Much depends on physical index characteristics. B-Tree preprocessing is optimistic about skipping working out: it applies static, generic rules when determining where to generate skip arrays, which assumes that the runtime overhead of maintaining skip arrays will pay for itself -- or lead to only a modest performance loss. As things stand, these assumptions are much too optimistic: skip array maintenance will lead to unacceptable regressions with unsympathetic queries (queries whose scan can't skip over many irrelevant leaf pages). An upcoming commit will address the problems in this area by enhancing _bt_readpage's approach to saving cycles on scan key evaluation, making it work in a way that directly considers the needs of = array keys (particularly = skip array keys). Author: Peter Geoghegan Reviewed-By: Masahiro Ikeda Reviewed-By: Heikki Linnakangas Reviewed-By: Matthias van de Meent Reviewed-By: Tomas Vondra Reviewed-By: Aleksander Alekseev Reviewed-By: Alena Rybakina Discussion: https://postgr.es/m/CAH2-Wzmn1YsLzOGgjAQZdn1STSG_y8qP__vggTaPAYXJP+G4bw@mail.gmail.com --- src/backend/access/nbtree/nbtcompare.c | 273 ++++++++++ src/backend/access/nbtree/nbtpreprocesskeys.c | 631 ++++++++++++++++++--- src/backend/access/nbtree/nbtree.c | 196 ++++++- src/backend/access/nbtree/nbtsearch.c | 130 ++++- src/backend/access/nbtree/nbtutils.c | 756 +++++++++++++++++++++++--- src/backend/access/nbtree/nbtvalidate.c | 4 + 6 files changed, 1792 insertions(+), 198 deletions(-) (limited to 'src/backend/access/nbtree') diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 291cb8fc15d..4da5a3c1d16 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -58,6 +58,7 @@ #include #include "utils/fmgrprotos.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #ifdef STRESS_SORT_INT_MIN @@ -78,6 +79,51 @@ btboolcmp(PG_FUNCTION_ARGS) PG_RETURN_INT32((int32) a - (int32) b); } +static Datum +bool_decrement(Relation rel, Datum existing, bool *underflow) +{ + bool bexisting = DatumGetBool(existing); + + if (bexisting == false) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return BoolGetDatum(bexisting - 1); +} + +static Datum +bool_increment(Relation rel, Datum existing, bool *overflow) +{ + bool bexisting = DatumGetBool(existing); + + if (bexisting == true) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return BoolGetDatum(bexisting + 1); +} + +Datum +btboolskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = bool_decrement; + sksup->increment = bool_increment; + sksup->low_elem = BoolGetDatum(false); + sksup->high_elem = BoolGetDatum(true); + + PG_RETURN_VOID(); +} + Datum btint2cmp(PG_FUNCTION_ARGS) { @@ -105,6 +151,51 @@ btint2sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int2_decrement(Relation rel, Datum existing, bool *underflow) +{ + int16 iexisting = DatumGetInt16(existing); + + if (iexisting == PG_INT16_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return Int16GetDatum(iexisting - 1); +} + +static Datum +int2_increment(Relation rel, Datum existing, bool *overflow) +{ + int16 iexisting = DatumGetInt16(existing); + + if (iexisting == PG_INT16_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return Int16GetDatum(iexisting + 1); +} + +Datum +btint2skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int2_decrement; + sksup->increment = int2_increment; + sksup->low_elem = Int16GetDatum(PG_INT16_MIN); + sksup->high_elem = Int16GetDatum(PG_INT16_MAX); + + PG_RETURN_VOID(); +} + Datum btint4cmp(PG_FUNCTION_ARGS) { @@ -128,6 +219,51 @@ btint4sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int4_decrement(Relation rel, Datum existing, bool *underflow) +{ + int32 iexisting = DatumGetInt32(existing); + + if (iexisting == PG_INT32_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return Int32GetDatum(iexisting - 1); +} + +static Datum +int4_increment(Relation rel, Datum existing, bool *overflow) +{ + int32 iexisting = DatumGetInt32(existing); + + if (iexisting == PG_INT32_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return Int32GetDatum(iexisting + 1); +} + +Datum +btint4skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int4_decrement; + sksup->increment = int4_increment; + sksup->low_elem = Int32GetDatum(PG_INT32_MIN); + sksup->high_elem = Int32GetDatum(PG_INT32_MAX); + + PG_RETURN_VOID(); +} + Datum btint8cmp(PG_FUNCTION_ARGS) { @@ -171,6 +307,51 @@ btint8sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int8_decrement(Relation rel, Datum existing, bool *underflow) +{ + int64 iexisting = DatumGetInt64(existing); + + if (iexisting == PG_INT64_MIN) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return Int64GetDatum(iexisting - 1); +} + +static Datum +int8_increment(Relation rel, Datum existing, bool *overflow) +{ + int64 iexisting = DatumGetInt64(existing); + + if (iexisting == PG_INT64_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return Int64GetDatum(iexisting + 1); +} + +Datum +btint8skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int8_decrement; + sksup->increment = int8_increment; + sksup->low_elem = Int64GetDatum(PG_INT64_MIN); + sksup->high_elem = Int64GetDatum(PG_INT64_MAX); + + PG_RETURN_VOID(); +} + Datum btint48cmp(PG_FUNCTION_ARGS) { @@ -292,6 +473,51 @@ btoidsortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +oid_decrement(Relation rel, Datum existing, bool *underflow) +{ + Oid oexisting = DatumGetObjectId(existing); + + if (oexisting == InvalidOid) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return ObjectIdGetDatum(oexisting - 1); +} + +static Datum +oid_increment(Relation rel, Datum existing, bool *overflow) +{ + Oid oexisting = DatumGetObjectId(existing); + + if (oexisting == OID_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return ObjectIdGetDatum(oexisting + 1); +} + +Datum +btoidskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = oid_decrement; + sksup->increment = oid_increment; + sksup->low_elem = ObjectIdGetDatum(InvalidOid); + sksup->high_elem = ObjectIdGetDatum(OID_MAX); + + PG_RETURN_VOID(); +} + Datum btoidvectorcmp(PG_FUNCTION_ARGS) { @@ -325,3 +551,50 @@ btcharcmp(PG_FUNCTION_ARGS) /* Be careful to compare chars as unsigned */ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); } + +static Datum +char_decrement(Relation rel, Datum existing, bool *underflow) +{ + uint8 cexisting = UInt8GetDatum(existing); + + if (cexisting == 0) + { + /* return value is undefined */ + *underflow = true; + return (Datum) 0; + } + + *underflow = false; + return CharGetDatum((uint8) cexisting - 1); +} + +static Datum +char_increment(Relation rel, Datum existing, bool *overflow) +{ + uint8 cexisting = UInt8GetDatum(existing); + + if (cexisting == UCHAR_MAX) + { + /* return value is undefined */ + *overflow = true; + return (Datum) 0; + } + + *overflow = false; + return CharGetDatum((uint8) cexisting + 1); +} + +Datum +btcharskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = char_decrement; + sksup->increment = char_increment; + + /* btcharcmp compares chars as unsigned */ + sksup->low_elem = UInt8GetDatum(0); + sksup->high_elem = UInt8GetDatum(UCHAR_MAX); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 38a87af1cc8..791aa4ece40 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -45,8 +45,15 @@ static bool _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok); +static bool _bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, + ScanKey skey, FmgrInfo *orderproc, + BTArrayKeyInfo *array, bool *qual_ok); +static bool _bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, + BTArrayKeyInfo *array, bool *qual_ok); static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); +static int _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out, + int *numSkipArrayKeys_out); static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype, StrategyNumber strat, Datum *elems, int nelems); @@ -89,6 +96,8 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * within each attribute may be done as a byproduct of the processing here. * That process must leave array scan keys (within an attribute) in the same * order as corresponding entries from the scan's BTArrayKeyInfo array info. + * We might also construct skip array scan keys that weren't present in the + * original input keys; these are also output in standard attribute order. * * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -101,10 +110,16 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD. * For the first attribute without an "=" key, any "<" and "<=" keys are * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD. - * This can be seen to be correct by considering the above example. Note - * in particular that if there are no keys for a given attribute, the keys for - * subsequent attributes can never be required; for instance "WHERE y = 4" - * requires a full-index scan. + * This can be seen to be correct by considering the above example. + * + * If we never generated skip array scan keys, it would be possible for "gaps" + * to appear that make it unsafe to mark any subsequent input scan keys + * (copied from scan->keyData[]) as required to continue the scan. Prior to + * Postgres 18, a qual like "WHERE y = 4" always resulted in a full scan. + * This qual now becomes "WHERE x = ANY('{every possible x value}') and y = 4" + * on output. In other words, preprocessing now adds a skip array on "x". + * This has the potential to be much more efficient than a full index scan + * (though it behaves like a full scan when there's many distinct "x" values). * * If possible, redundant keys are eliminated: we keep only the tightest * >/>= bound and the tightest keyData[]). + * + * Row comparison keys currently have a couple of notable limitations. + * Right now we just transfer them into the preprocessed array without any * editorialization. We can treat them the same as an ordinary inequality * comparison on the row's first index column, for the purposes of the logic - * about required keys. + * about required keys. Also, we are unable to merge a row comparison key + * into a skip array (only ordinary inequalities are merged). A key that + * comes after a Row comparison key is therefore never marked as required. * * Note: the reason we have to copy the preprocessed scan keys into private * storage is that we are modifying the array based on comparisons of the @@ -200,6 +224,14 @@ _bt_preprocess_keys(IndexScanDesc scan) /* Also maintain keyDataMap for remapping so->orderProcs[] later */ keyDataMap = MemoryContextAlloc(so->arrayContext, numberOfKeys * sizeof(int)); + + /* + * Also enlarge output array when it might otherwise not have room for + * a skip array's scan key + */ + if (numberOfKeys > scan->numberOfKeys) + so->keyData = repalloc(so->keyData, + numberOfKeys * sizeof(ScanKeyData)); } else inkeys = scan->keyData; @@ -229,6 +261,7 @@ _bt_preprocess_keys(IndexScanDesc scan) Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY); Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber || (so->arrayKeys[0].scan_key == 0 && + !(so->keyData[0].sk_flags & SK_BT_SKIP) && OidIsValid(so->orderProcs[0].fn_oid))); } @@ -288,7 +321,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * redundant. Note that this is no less true if the = key is * SEARCHARRAY; the only real difference is that the inequality * key _becomes_ redundant by making _bt_compare_scankey_args - * eliminate the subset of elements that won't need to be matched. + * eliminate the subset of elements that won't need to be matched + * (with SAOP arrays and skip arrays alike). * * If we have a case like "key = 1 AND key > 2", we set qual_ok to * false and abandon further processing. We'll do the same thing @@ -345,7 +379,6 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - Assert(!array || array->num_elems > 0); xform[j].inkey = NULL; xform[j].inkeyi = -1; } @@ -393,6 +426,11 @@ _bt_preprocess_keys(IndexScanDesc scan) * Emit the cleaned-up keys into the so->keyData[] array, and then * mark them if they are required. They are required (possibly * only in one direction) if all attrs before this one had "=". + * + * In practice we'll rarely output non-required scan keys here; + * typically, _bt_preprocess_array_keys has already added "=" keys + * sufficient to form an unbroken series of "=" constraints on all + * attrs prior to the attr from the final scan->keyData[] key. */ for (j = BTMaxStrategyNumber; --j >= 0;) { @@ -481,6 +519,7 @@ _bt_preprocess_keys(IndexScanDesc scan) Assert(array->scan_key == i); Assert(OidIsValid(orderproc->fn_oid)); + Assert(!(inkey->sk_flags & SK_BT_SKIP)); } else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY) { @@ -489,6 +528,7 @@ _bt_preprocess_keys(IndexScanDesc scan) Assert(array->scan_key == xform[j].inkeyi); Assert(OidIsValid(orderproc->fn_oid)); + Assert(!(xform[j].inkey->sk_flags & SK_BT_SKIP)); } /* @@ -508,8 +548,6 @@ _bt_preprocess_keys(IndexScanDesc scan) /* Have all we need to determine redundancy */ if (test_result) { - Assert(!array || array->num_elems > 0); - /* * New key is more restrictive, and so replaces old key... */ @@ -803,6 +841,9 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, cmp_op; StrategyNumber strat; + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & + (SK_ROW_HEADER | SK_ROW_MEMBER))); + /* * First, deal with cases where one or both args are NULL. This should * only happen when the scankeys represent IS NULL/NOT NULL conditions. @@ -812,6 +853,22 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, bool leftnull, rightnull; + /* Handle skip array comparison with IS NOT NULL scan key */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP) + { + /* Shouldn't generate skip array in presence of IS NULL key */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNULL)); + Assert((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNOTNULL); + + /* Skip array will have no NULL element/IS NULL scan key */ + Assert(array->num_elems == -1); + array->null_elem = false; + + /* IS NOT NULL key (could be leftarg or rightarg) now redundant */ + *result = true; + return true; + } + if (leftarg->sk_flags & SK_ISNULL) { Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); @@ -885,6 +942,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, { /* Can't make the comparison */ *result = false; /* suppress compiler warnings */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); return false; } @@ -978,24 +1036,56 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, * Compare an array scan key to a scalar scan key, eliminating contradictory * array elements such that the scalar scan key becomes redundant. * + * If the opfamily is incomplete we may not be able to determine which + * elements are contradictory. When we return true we'll have validly set + * *qual_ok, guaranteeing that at least the scalar scan key can be considered + * redundant. We return false if the comparison could not be made (caller + * must keep both scan keys when this happens). + * + * Note: it's up to caller to deal with IS [NOT] NULL scan keys, as well as + * row comparison scan keys. We only deal with scalar scan keys. + */ +static bool +_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, + bool *qual_ok) +{ + Assert(arraysk->sk_attno == skey->sk_attno); + Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert((arraysk->sk_flags & SK_SEARCHARRAY) && + arraysk->sk_strategy == BTEqualStrategyNumber); + /* don't expect to have to deal with NULLs/row comparison scan keys */ + Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!(skey->sk_flags & SK_SEARCHARRAY) || + skey->sk_strategy != BTEqualStrategyNumber); + + /* + * Just call the appropriate helper function based on whether it's a SAOP + * array or a skip array. Both helpers will set *qual_ok in passing. + */ + if (array->num_elems != -1) + return _bt_saoparray_shrink(scan, arraysk, skey, orderproc, array, + qual_ok); + else + return _bt_skiparray_shrink(scan, skey, array, qual_ok); +} + +/* + * Preprocessing of SAOP array scan key, used to determine which array + * elements are eliminated as contradictory by a non-array scalar key. + * + * _bt_compare_array_scankey_args helper function. + * * Array elements can be eliminated as contradictory when excluded by some * other operator on the same attribute. For example, with an index scan qual * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1" * are eliminated, and the < scan key is eliminated as redundant. Cases where * every array element is eliminated by a redundant scalar scan key have an * unsatisfiable qual, which we handle by setting *qual_ok=false for caller. - * - * If the opfamily doesn't supply a complete set of cross-type ORDER procs we - * may not be able to determine which elements are contradictory. If we have - * the required ORDER proc then we return true (and validly set *qual_ok), - * guaranteeing that at least the scalar scan key can be considered redundant. - * We return false if the comparison could not be made (caller must keep both - * scan keys when this happens). */ static bool -_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, - FmgrInfo *orderproc, BTArrayKeyInfo *array, - bool *qual_ok) +_bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok) { Relation rel = scan->indexRelation; Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; @@ -1006,14 +1096,8 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey FmgrInfo crosstypeproc; FmgrInfo *orderprocp = orderproc; - Assert(arraysk->sk_attno == skey->sk_attno); Assert(array->num_elems > 0); - Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); - Assert((arraysk->sk_flags & SK_SEARCHARRAY) && - arraysk->sk_strategy == BTEqualStrategyNumber); - Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); - Assert(!(skey->sk_flags & SK_SEARCHARRAY) || - skey->sk_strategy != BTEqualStrategyNumber); + Assert(!(arraysk->sk_flags & SK_BT_SKIP)); /* * _bt_binsrch_array_skey searches an array for the entry best matching a @@ -1112,6 +1196,106 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey return true; } +/* + * Preprocessing of skip array scan key, used to determine redundancy against + * a non-array scalar scan key (must be an inequality). + * + * _bt_compare_array_scankey_args helper function. + * + * Skip arrays work by procedurally generating their elements as needed, so we + * just store the inequality as the skip array's low_compare or high_compare + * (except when there's already a more restrictive low_compare/high_compare). + * The array's final elements are the range of values that still satisfy the + * array's final low_compare and high_compare. + */ +static bool +_bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, BTArrayKeyInfo *array, + bool *qual_ok) +{ + bool test_result; + + Assert(array->num_elems == -1); + + /* + * Array's index attribute will be constrained by a strict operator/key. + * Array must not "contain a NULL element" (i.e. the scan must not apply + * "IS NULL" qual when it reaches the end of the index that stores NULLs). + */ + array->null_elem = false; + *qual_ok = true; + + /* + * Consider if we should treat caller's scalar scan key as the skip + * array's high_compare or low_compare. + * + * In general the current array element must either be a copy of a value + * taken from an index tuple, or a derivative value generated by opclass's + * skip support function. That way the scan can always safely assume that + * it's okay to use the only-input-opclass-type proc from so->orderProcs[] + * (they can be cross-type with SAOP arrays, but never with skip arrays). + * + * This approach is enabled by MINVAL/MAXVAL sentinel key markings, which + * can be thought of as representing either the lowest or highest matching + * array element (excluding the NULL element, where applicable, though as + * just discussed it isn't applicable to this range skip array anyway). + * Array keys marked MINVAL/MAXVAL never have a valid datum in their + * sk_argument field. The scan directly applies the array's low_compare + * key when it encounters MINVAL in the array key proper (just as it + * applies high_compare when it sees MAXVAL set in the array key proper). + * The scan must never use the array's so->orderProcs[] proc against + * low_compare's/high_compare's sk_argument, either (so->orderProcs[] is + * only intended to be used with rhs datums from the array proper/index). + */ + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (array->high_compare) + { + /* replace existing high_compare with caller's key? */ + if (!_bt_compare_scankey_args(scan, array->high_compare, skey, + array->high_compare, NULL, NULL, + &test_result)) + return false; /* can't determine more restrictive key */ + + if (!test_result) + return true; /* no, just discard caller's key */ + + /* yes, replace existing high_compare with caller's key */ + } + + /* caller's key becomes skip array's high_compare */ + array->high_compare = skey; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (array->low_compare) + { + /* replace existing low_compare with caller's key? */ + if (!_bt_compare_scankey_args(scan, array->low_compare, skey, + array->low_compare, NULL, NULL, + &test_result)) + return false; /* can't determine more restrictive key */ + + if (!test_result) + return true; /* no, just discard caller's key */ + + /* yes, replace existing low_compare with caller's key */ + } + + /* caller's key becomes skip array's low_compare */ + array->low_compare = skey; + break; + case BTEqualStrategyNumber: + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + return true; +} + /* * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys * @@ -1137,6 +1321,12 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * one equality strategy array scan key per index attribute. We'll always be * able to set things up that way when complete opfamilies are used. * + * We're also responsible for generating skip arrays (and their associated + * scan keys) here. This enables skip scan. We do this for index attributes + * that initially lacked an equality condition within scan->keyData[], iff + * doing so allows a later scan key (that was passed to us in scan->keyData[]) + * to be marked required by our _bt_preprocess_keys caller. + * * We set the scan key references from the scan's BTArrayKeyInfo info array to * offsets into the temp modified input array returned to caller. Scans that * have array keys should call _bt_preprocess_array_keys_final when standard @@ -1144,49 +1334,45 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * references into references to the scan's so->keyData[] output scan keys. * * Note: the reason we need to return a temp scan key array, rather than just - * scribbling on scan->keyData, is that callers are permitted to call btrescan - * without supplying a new set of scankey data. + * modifying scan->keyData[], is that callers are permitted to call btrescan + * without supplying a new set of scankey data. Certain other preprocessing + * routines (e.g., _bt_fix_scankey_strategy) _can_ modify scan->keyData[], but + * we can't make that work here because our modifications are non-idempotent. */ static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - int numberOfKeys = scan->numberOfKeys; int16 *indoption = rel->rd_indoption; + Oid skip_eq_ops[INDEX_MAX_KEYS]; int numArrayKeys, - output_ikey = 0; + numSkipArrayKeys, + numArrayKeyData; + AttrNumber attno_skip = 1; int origarrayatt = InvalidAttrNumber, origarraykey = -1; Oid origelemtype = InvalidOid; - ScanKey cur; MemoryContext oldContext; ScanKey arrayKeyData; /* modified copy of scan->keyData */ - Assert(numberOfKeys); - - /* Quick check to see if there are any array keys */ - numArrayKeys = 0; - for (int i = 0; i < numberOfKeys; i++) - { - cur = &scan->keyData[i]; - if (cur->sk_flags & SK_SEARCHARRAY) - { - numArrayKeys++; - Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL))); - /* If any arrays are null as a whole, we can quit right now. */ - if (cur->sk_flags & SK_ISNULL) - { - so->qual_ok = false; - return NULL; - } - } - } + /* + * Check the number of input array keys within scan->keyData[] input keys + * (also checks if we should add extra skip arrays based on input keys) + */ + numArrayKeys = _bt_num_array_keys(scan, skip_eq_ops, &numSkipArrayKeys); /* Quit if nothing to do. */ if (numArrayKeys == 0) return NULL; + /* + * Estimated final size of arrayKeyData[] array we'll return to our caller + * is the size of the original scan->keyData[] input array, plus space for + * any additional skip array scan keys we'll need to generate below + */ + numArrayKeyData = scan->numberOfKeys + numSkipArrayKeys; + /* * Make a scan-lifespan context to hold array-associated data, or reset it * if we already have one from a previous rescan cycle. @@ -1201,18 +1387,20 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) oldContext = MemoryContextSwitchTo(so->arrayContext); /* Create output scan keys in the workspace context */ - arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData)); + arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); /* Allocate space for ORDER procs used to help _bt_checkkeys */ - so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo)); + so->orderProcs = (FmgrInfo *) palloc(numArrayKeyData * sizeof(FmgrInfo)); - /* Now process each array key */ numArrayKeys = 0; - for (int input_ikey = 0; input_ikey < numberOfKeys; input_ikey++) + numArrayKeyData = 0; + for (int input_ikey = 0; input_ikey < scan->numberOfKeys; input_ikey++) { + ScanKey inkey = scan->keyData + input_ikey, + cur; FmgrInfo sortproc; FmgrInfo *sortprocp = &sortproc; Oid elemtype; @@ -1225,21 +1413,113 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) Datum *elem_values; bool *elem_nulls; int num_nonnulls; - int j; + + /* set up next output scan key */ + cur = &arrayKeyData[numArrayKeyData]; + + /* Backfill skip arrays for attrs < or <= input key's attr? */ + while (numSkipArrayKeys && attno_skip <= inkey->sk_attno) + { + Oid opfamily = rel->rd_opfamily[attno_skip - 1]; + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + Oid collation = rel->rd_indcollation[attno_skip - 1]; + Oid eq_op = skip_eq_ops[attno_skip - 1]; + CompactAttribute *attr; + RegProcedure cmp_proc; + + if (!OidIsValid(eq_op)) + { + /* + * Attribute already has an = input key, so don't output a + * skip array for attno_skip. Just copy attribute's = input + * key into arrayKeyData[] once outside this inner loop. + * + * Note: When we get here there must be a later attribute that + * lacks an equality input key, and still needs a skip array + * (if there wasn't then numSkipArrayKeys would be 0 by now). + */ + Assert(attno_skip == inkey->sk_attno); + /* inkey can't be last input key to be marked required: */ + Assert(input_ikey < scan->numberOfKeys - 1); +#if 0 + /* Could be a redundant input scan key, so can't do this: */ + Assert(inkey->sk_strategy == BTEqualStrategyNumber || + (inkey->sk_flags & SK_SEARCHNULL)); +#endif + + attno_skip++; + break; + } + + cmp_proc = get_opcode(eq_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for skipping equals operator %u", eq_op); + + ScanKeyEntryInitialize(cur, + SK_SEARCHARRAY | SK_BT_SKIP, /* flags */ + attno_skip, /* skipped att number */ + BTEqualStrategyNumber, /* equality strategy */ + InvalidOid, /* opclass input subtype */ + collation, /* index column's collation */ + cmp_proc, /* equality operator's proc */ + (Datum) 0); /* constant */ + + /* Initialize generic BTArrayKeyInfo fields */ + so->arrayKeys[numArrayKeys].scan_key = numArrayKeyData; + so->arrayKeys[numArrayKeys].num_elems = -1; + + /* Initialize skip array specific BTArrayKeyInfo fields */ + attr = TupleDescCompactAttr(RelationGetDescr(rel), attno_skip - 1); + reverse = (indoption[attno_skip - 1] & INDOPTION_DESC) != 0; + so->arrayKeys[numArrayKeys].attlen = attr->attlen; + so->arrayKeys[numArrayKeys].attbyval = attr->attbyval; + so->arrayKeys[numArrayKeys].null_elem = true; /* for now */ + so->arrayKeys[numArrayKeys].sksup = + PrepareSkipSupportFromOpclass(opfamily, opcintype, reverse); + so->arrayKeys[numArrayKeys].low_compare = NULL; /* for now */ + so->arrayKeys[numArrayKeys].high_compare = NULL; /* for now */ + + /* + * We'll need a 3-way ORDER proc. Set that up now. + */ + _bt_setup_array_cmp(scan, cur, opcintype, + &so->orderProcs[numArrayKeyData], NULL); + + numArrayKeys++; + numArrayKeyData++; /* keep this scan key/array */ + + /* set up next output scan key */ + cur = &arrayKeyData[numArrayKeyData]; + + /* remember having output this skip array and scan key */ + numSkipArrayKeys--; + attno_skip++; + } /* * Provisionally copy scan key into arrayKeyData[] array we'll return * to _bt_preprocess_keys caller */ - cur = &arrayKeyData[output_ikey]; - *cur = scan->keyData[input_ikey]; + *cur = *inkey; if (!(cur->sk_flags & SK_SEARCHARRAY)) { - output_ikey++; /* keep this non-array scan key */ + numArrayKeyData++; /* keep this non-array scan key */ continue; } + /* + * Process SAOP array scan key + */ + Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL))); + + /* If array is null as a whole, the scan qual is unsatisfiable */ + if (cur->sk_flags & SK_ISNULL) + { + so->qual_ok = false; + break; + } + /* * Deconstruct the array into elements */ @@ -1257,7 +1537,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) * all btree operators are strict. */ num_nonnulls = 0; - for (j = 0; j < num_elems; j++) + for (int j = 0; j < num_elems; j++) { if (!elem_nulls[j]) elem_values[num_nonnulls++] = elem_values[j]; @@ -1295,7 +1575,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) _bt_find_extreme_element(scan, cur, elemtype, BTGreaterStrategyNumber, elem_values, num_nonnulls); - output_ikey++; /* keep this transformed scan key */ + numArrayKeyData++; /* keep this transformed scan key */ continue; case BTEqualStrategyNumber: /* proceed with rest of loop */ @@ -1306,7 +1586,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) _bt_find_extreme_element(scan, cur, elemtype, BTLessStrategyNumber, elem_values, num_nonnulls); - output_ikey++; /* keep this transformed scan key */ + numArrayKeyData++; /* keep this transformed scan key */ continue; default: elog(ERROR, "unrecognized StrategyNumber: %d", @@ -1323,7 +1603,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) * sortproc just points to the same proc used during binary searches. */ _bt_setup_array_cmp(scan, cur, elemtype, - &so->orderProcs[output_ikey], &sortprocp); + &so->orderProcs[numArrayKeyData], &sortprocp); /* * Sort the non-null elements and eliminate any duplicates. We must @@ -1392,23 +1672,24 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys) origelemtype = elemtype; } - /* - * And set up the BTArrayKeyInfo data. - * - * Note: _bt_preprocess_array_keys_final will fix-up each array's - * scan_key field later on, after so->keyData[] has been finalized. - */ - so->arrayKeys[numArrayKeys].scan_key = output_ikey; + /* Initialize generic BTArrayKeyInfo fields */ + so->arrayKeys[numArrayKeys].scan_key = numArrayKeyData; so->arrayKeys[numArrayKeys].num_elems = num_elems; + + /* Initialize SAOP array specific BTArrayKeyInfo fields */ so->arrayKeys[numArrayKeys].elem_values = elem_values; + so->arrayKeys[numArrayKeys].cur_elem = -1; /* i.e. invalid */ + numArrayKeys++; - output_ikey++; /* keep this scan key/array */ + numArrayKeyData++; /* keep this scan key/array */ } + Assert(numSkipArrayKeys == 0); + /* Set final number of equality-type array keys */ so->numArrayKeys = numArrayKeys; - /* Set number of scan keys remaining in arrayKeyData[] */ - *new_numberOfKeys = output_ikey; + /* Set number of scan keys in arrayKeyData[] */ + *new_numberOfKeys = numArrayKeyData; MemoryContextSwitchTo(oldContext); @@ -1514,7 +1795,14 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) { BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; - Assert(array->num_elems > 0); + /* + * All skip arrays must be marked required, and final column can + * never have a skip array + */ + Assert(array->num_elems > 0 || array->num_elems == -1); + Assert(array->num_elems != -1 || outkey->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems != -1 || + outkey->sk_attno < IndexRelationGetNumberOfKeyAttributes(rel)); if (array->scan_key == input_ikey) { @@ -1575,6 +1863,199 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) so->numArrayKeys, INDEX_MAX_KEYS))); } +/* + * _bt_num_array_keys() -- determine # of BTArrayKeyInfo entries + * + * _bt_preprocess_array_keys helper function. Returns the estimated size of + * the scan's BTArrayKeyInfo array, which is guaranteed to be large enough to + * fit every so->arrayKeys[] entry. + * + * Also sets *numSkipArrayKeys_out to the number of of skip arrays caller must + * add to the scan keys it'll output. Caller must add this many skip arrays: + * one array for each of the most significant attributes that lack a = input + * key (IS NULL keys count as = input keys here). The specific attributes + * that need skip arrays are indicated by initializing skip_eq_ops_out[] arg + * 0-based attribute offset to a valid = op strategy Oid. We'll only ever set + * skip_eq_ops_out[] entries to InvalidOid for attributes that already have an + * equality key in scan->keyData[] input keys -- and only when there's some + * later "attribute gap" for us to "fill-in" with a skip array. + * + * We're optimistic about skipping working out: we always add exactly the skip + * arrays needed to maximize the number of input scan keys that can ultimately + * be marked as required to continue the scan (but no more). Given a + * multi-column index on (a, b, c, d), we add skip arrays as follows: + * + * Input keys Output keys (after all preprocessing) + * ---------- ------------------------------------- + * a = 1 a = 1 (no skip arrays) + * b = 42 skip a AND b = 42 + * a = 1 AND b = 42 a = 1 AND b = 42 (no skip arrays) + * a >= 1 AND b = 42 range skip a AND b = 42 + * a = 1 AND b > 42 a = 1 AND b > 42 (no skip arrays) + * a >= 1 AND a <= 3 AND b = 42 range skip a AND b = 42 + * a = 1 AND c <= 27 a = 1 AND skip b AND c <= 27 + * a = 1 AND d >= 1 a = 1 AND skip b AND skip c AND d >= 1 + * a = 1 AND b >= 42 AND d > 1 a = 1 AND range skip b AND skip c AND d > 1 + */ +static int +_bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out, + int *numSkipArrayKeys_out) +{ + Relation rel = scan->indexRelation; + AttrNumber attno_skip = 1, + attno_inkey = 1; + bool attno_has_equal = false, + attno_has_rowcompare = false; + int numSAOPArrayKeys, + numSkipArrayKeys, + prev_numSkipArrayKeys; + + Assert(scan->numberOfKeys); + + /* Initial pass over input scan keys counts the number of SAOP arrays */ + numSAOPArrayKeys = 0; + *numSkipArrayKeys_out = prev_numSkipArrayKeys = numSkipArrayKeys = 0; + for (int i = 0; i < scan->numberOfKeys; i++) + { + ScanKey inkey = scan->keyData + i; + + if (inkey->sk_flags & SK_SEARCHARRAY) + numSAOPArrayKeys++; + } + +#ifdef DEBUG_DISABLE_SKIP_SCAN + /* don't attempt to add skip arrays */ + return numSAOPArrayKeys; +#endif + + for (int i = 0;; i++) + { + ScanKey inkey = scan->keyData + i; + + /* + * Backfill skip arrays for any wholly omitted attributes prior to + * attno_inkey + */ + while (attno_skip < attno_inkey) + { + Oid opfamily = rel->rd_opfamily[attno_skip - 1]; + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + + /* Look up input opclass's equality operator (might fail) */ + skip_eq_ops_out[attno_skip - 1] = + get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + if (!OidIsValid(skip_eq_ops_out[attno_skip - 1])) + { + /* + * Cannot generate a skip array for this or later attributes + * (input opclass lacks an equality strategy operator) + */ + *numSkipArrayKeys_out = prev_numSkipArrayKeys; + return numSAOPArrayKeys + prev_numSkipArrayKeys; + } + + /* plan on adding a backfill skip array for this attribute */ + numSkipArrayKeys++; + attno_skip++; + } + + prev_numSkipArrayKeys = numSkipArrayKeys; + + /* + * Stop once past the final input scan key. We deliberately never add + * a skip array for the last input scan key's attribute -- even when + * there are only inequality keys on that attribute. + */ + if (i == scan->numberOfKeys) + break; + + /* + * Later preprocessing steps cannot merge a RowCompare into a skip + * array, so stop adding skip arrays once we see one. (Note that we + * can backfill skip arrays before a RowCompare, which will allow keys + * up to and including the RowCompare to be marked required.) + * + * Skip arrays work by maintaining a current array element value, + * which anchors lower-order keys via an implied equality constraint. + * This is incompatible with the current nbtree row comparison design, + * which compares all columns together, as an indivisible group. + * Alternative designs that can be used alongside skip arrays are + * possible, but it's not clear that they're really worth pursuing. + * + * A RowCompare qual "(a, b, c) > (10, 'foo', 42)" is equivalent to + * "(a=10 AND b='foo' AND c>42) OR (a=10 AND b>'foo') OR (a>10)". + * Decomposing this RowCompare into these 3 disjuncts allows each + * disjunct to be executed as a separate "single value" index scan. + * That'll give all 3 scans the ability to add skip arrays in the + * usual way (when there are any scalar keys after the RowCompare). + * Under this scheme, a qual "(a, b, c) > (10, 'foo', 42) AND d = 99" + * performs 3 separate scans, each of which can mark keys up to and + * including its "d = 99" key as required to continue the scan. + */ + if (attno_has_rowcompare) + break; + + /* + * Now consider next attno_inkey (or keep going if this is an + * additional scan key against the same attribute) + */ + if (attno_inkey < inkey->sk_attno) + { + /* + * Now add skip array for previous scan key's attribute, though + * only if the attribute has no equality strategy scan keys + */ + if (attno_has_equal) + { + /* Attributes with an = key must have InvalidOid eq_op set */ + skip_eq_ops_out[attno_skip - 1] = InvalidOid; + } + else + { + Oid opfamily = rel->rd_opfamily[attno_skip - 1]; + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + + /* Look up input opclass's equality operator (might fail) */ + skip_eq_ops_out[attno_skip - 1] = + get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + + if (!OidIsValid(skip_eq_ops_out[attno_skip - 1])) + { + /* + * Input opclass lacks an equality strategy operator, so + * don't generate a skip array that definitely won't work + */ + break; + } + + /* plan on adding a backfill skip array for this attribute */ + numSkipArrayKeys++; + } + + /* Set things up for this new attribute */ + attno_skip++; + attno_inkey = inkey->sk_attno; + attno_has_equal = false; + } + + /* + * Track if this attribute's scan keys include any equality strategy + * scan keys (IS NULL keys count as equality keys here). Also track + * if it has any RowCompare keys. + */ + if (inkey->sk_strategy == BTEqualStrategyNumber || + (inkey->sk_flags & SK_SEARCHNULL)) + attno_has_equal = true; + if (inkey->sk_flags & SK_ROW_HEADER) + attno_has_rowcompare = true; + } + + *numSkipArrayKeys_out = numSkipArrayKeys; + return numSAOPArrayKeys + numSkipArrayKeys; +} + /* * _bt_find_extreme_element() -- get least or greatest array element * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index a99667eb2bd..77781cb9002 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -31,6 +31,7 @@ #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/read_stream.h" +#include "utils/datum.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" @@ -76,14 +77,26 @@ typedef struct BTParallelScanDescData /* * btps_arrElems is used when scans need to schedule another primitive - * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys. + * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem + * offsets for each = scan key associated with a ScalarArrayOp array. */ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]; + + /* + * Additional space (at the end of the struct) is used when scans need to + * schedule another primitive index scan with one or more skip arrays. + * Holds a flattened datum representation for each = scan key associated + * with a skip array. + */ } BTParallelScanDescData; typedef struct BTParallelScanDescData *BTParallelScanDesc; +static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so); +static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so); static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid); @@ -541,10 +554,167 @@ btrestrpos(IndexScanDesc scan) * btestimateparallelscan -- estimate storage for BTParallelScanDescData */ Size -btestimateparallelscan(int nkeys, int norderbys) +btestimateparallelscan(Relation rel, int nkeys, int norderbys) +{ + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + Size estnbtreeshared, + genericattrspace; + + /* + * Pessimistically assume that every input scan key will be output with + * its own SAOP array + */ + estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) + + sizeof(int) * nkeys; + + /* Single column indexes cannot possibly use a skip array */ + if (nkeyatts == 1) + return estnbtreeshared; + + /* + * Pessimistically assume that all attributes prior to the least + * significant attribute require a skip array (and an associated key) + */ + genericattrspace = datumEstimateSpace((Datum) 0, false, true, + sizeof(Datum)); + for (int attnum = 1; attnum < nkeyatts; attnum++) + { + CompactAttribute *attr; + + /* + * We make the conservative assumption that every index column will + * also require a skip array. + * + * Every skip array must have space to store its scan key's sk_flags. + */ + estnbtreeshared = add_size(estnbtreeshared, sizeof(int)); + + /* Consider space required to store a datum of opclass input type */ + attr = TupleDescCompactAttr(rel->rd_att, attnum - 1); + if (attr->attbyval) + { + /* This index attribute stores pass-by-value datums */ + Size estfixed = datumEstimateSpace((Datum) 0, false, + true, attr->attlen); + + estnbtreeshared = add_size(estnbtreeshared, estfixed); + continue; + } + + /* + * This index attribute stores pass-by-reference datums. + * + * Assume that serializing this array will use just as much space as a + * pass-by-value datum, in addition to space for the largest possible + * whole index tuple (this is not just a per-datum portion of the + * largest possible tuple because that'd be almost as large anyway). + * + * This is quite conservative, but it's not clear how we could do much + * better. The executor requires an up-front storage request size + * that reliably covers the scan's high watermark memory usage. We + * can't be sure of the real high watermark until the scan is over. + */ + estnbtreeshared = add_size(estnbtreeshared, genericattrspace); + estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize); + } + + return estnbtreeshared; +} + +/* + * _bt_parallel_serialize_arrays() -- Serialize parallel array state. + * + * Caller must have exclusively locked btscan->btps_lock when called. + */ +static void +_bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so) +{ + char *datumshared; + + /* Space for serialized datums begins immediately after btps_arrElems[] */ + datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]); + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + if (array->num_elems != -1) + { + /* Save SAOP array's cur_elem (no need to copy key/datum) */ + Assert(!(skey->sk_flags & SK_BT_SKIP)); + btscan->btps_arrElems[i] = array->cur_elem; + continue; + } + + /* Save all mutable state associated with skip array's key */ + Assert(skey->sk_flags & SK_BT_SKIP); + memcpy(datumshared, &skey->sk_flags, sizeof(int)); + datumshared += sizeof(int); + + if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)) + { + /* No sk_argument datum to serialize */ + Assert(skey->sk_argument == 0); + continue; + } + + datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0, + array->attbyval, array->attlen, &datumshared); + } +} + +/* + * _bt_parallel_restore_arrays() -- Restore serialized parallel array state. + * + * Caller must have exclusively locked btscan->btps_lock when called. + */ +static void +_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, + BTScanOpaque so) { - /* Pessimistically assume all input scankeys will be output with arrays */ - return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys; + char *datumshared; + + /* Space for serialized datums begins immediately after btps_arrElems[] */ + datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]); + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + bool isnull; + + if (array->num_elems != -1) + { + /* Restore SAOP array using its saved cur_elem */ + Assert(!(skey->sk_flags & SK_BT_SKIP)); + array->cur_elem = btscan->btps_arrElems[i]; + skey->sk_argument = array->elem_values[array->cur_elem]; + continue; + } + + /* Restore skip array by restoring its key directly */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = (Datum) 0; + memcpy(&skey->sk_flags, datumshared, sizeof(int)); + datumshared += sizeof(int); + + Assert(skey->sk_flags & SK_BT_SKIP); + + if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)) + { + /* No sk_argument datum to restore */ + continue; + } + + skey->sk_argument = datumRestore(&datumshared, &isnull); + if (isnull) + { + Assert(skey->sk_argument == 0); + Assert(skey->sk_flags & SK_SEARCHNULL); + Assert(skey->sk_flags & SK_ISNULL); + } + } } /* @@ -613,6 +783,7 @@ bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; bool exit_loop = false, status = true, @@ -679,14 +850,9 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, { /* Can start scheduled primitive scan right away, so do so */ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; - for (int i = 0; i < so->numArrayKeys; i++) - { - BTArrayKeyInfo *array = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[array->scan_key]; - array->cur_elem = btscan->btps_arrElems[i]; - skey->sk_argument = array->elem_values[array->cur_elem]; - } + /* Restore scan's array keys from serialized values */ + _bt_parallel_restore_arrays(rel, btscan, so); exit_loop = true; } else @@ -831,6 +997,7 @@ _bt_parallel_done(IndexScanDesc scan) void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; @@ -849,12 +1016,7 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; /* Serialize scan's current array keys */ - for (int i = 0; i < so->numArrayKeys; i++) - { - BTArrayKeyInfo *array = &so->arrayKeys[i]; - - btscan->btps_arrElems[i] = array->cur_elem; - } + _bt_parallel_serialize_arrays(rel, btscan, so); } LWLockRelease(&btscan->btps_lock); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 3d46fb5df78..1ef2cb2b55e 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -983,7 +983,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * one we use --- by definition, they are either redundant or * contradictory. * - * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. + * In practice we rarely see any "attribute boundary key gaps" here. + * Preprocessing can usually backfill skip array keys for any attributes + * that were omitted from the original scan->keyData[] input keys. All + * array keys are always considered = keys, but we'll sometimes need to + * treat the current key value as if we were using an inequality strategy. + * This happens with range skip arrays, which store inequality keys in the + * array's low_compare/high_compare fields (used to find the first/last + * set of matches, when = key will lack a usable sk_argument value). + * These are always preferred over any redundant "standard" inequality + * keys on the same column (per the usual rule about preferring = keys). + * Note also that any column with an = skip array key can never have an + * additional, contradictory = key. + * + * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP + * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. * If the index stores nulls at the end of the index we'll be starting * from, and we have no boundary key for the column (which means the key * we deduced NOT NULL from is an inequality key that constrains the other @@ -1040,8 +1054,54 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (i >= so->numberOfKeys || cur->sk_attno != curattr) { /* - * Done looking at keys for curattr. If we didn't find a - * usable boundary key, see if we can deduce a NOT NULL key. + * Done looking at keys for curattr. + * + * If this is a scan key for a skip array whose current + * element is MINVAL, choose low_compare (when scanning + * backwards it'll be MAXVAL, and we'll choose high_compare). + * + * Note: if the array's low_compare key makes 'chosen' NULL, + * then we behave as if the array's first element is -inf, + * except when !array->null_elem implies a usable NOT NULL + * constraint. + */ + if (chosen != NULL && + (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + { + int ikey = chosen - so->keyData; + ScanKey skipequalitykey = chosen; + BTArrayKeyInfo *array = NULL; + + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + array = &so->arrayKeys[arridx]; + if (array->scan_key == ikey) + break; + } + + if (ScanDirectionIsForward(dir)) + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); + chosen = array->low_compare; + } + else + { + Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); + chosen = array->high_compare; + } + + Assert(chosen == NULL || + chosen->sk_attno == skipequalitykey->sk_attno); + + if (!array->null_elem) + impliesNN = skipequalitykey; + else + Assert(chosen == NULL && impliesNN == NULL); + } + + /* + * If we didn't find a usable boundary key, see if we can + * deduce a NOT NULL key */ if (chosen == NULL && impliesNN != NULL && ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? @@ -1084,9 +1144,40 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) break; /* - * Done if that was the last attribute, or if next key is not - * in sequence (implying no boundary key is available for the - * next attribute). + * If the key that we just added to startKeys[] is a skip + * array = key whose current element is marked NEXT or PRIOR, + * make strat_total > or < (and stop adding boundary keys). + * This can only happen with opclasses that lack skip support. + */ + if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(strat_total == BTEqualStrategyNumber); + + if (ScanDirectionIsForward(dir)) + { + Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + strat_total = BTGreaterStrategyNumber; + } + else + { + Assert(!(chosen->sk_flags & SK_BT_NEXT)); + strat_total = BTLessStrategyNumber; + } + + /* + * We're done. We'll never find an exact = match for a + * NEXT or PRIOR sentinel sk_argument value. There's no + * sense in trying to add more keys to startKeys[]. + */ + break; + } + + /* + * Done if that was the last scan key output by preprocessing. + * Also done if there is a gap index attribute that lacks a + * usable key (only possible when preprocessing was unable to + * generate a skip array key to "fill in the gap"). */ if (i >= so->numberOfKeys || cur->sk_attno != curattr + 1) @@ -1581,31 +1672,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * We skip this for the first page read by each (primitive) scan, to avoid * slowing down point queries. They typically don't stand to gain much * when the optimization can be applied, and are more likely to notice the - * overhead of the precheck. - * - * The optimization is unsafe and must be avoided whenever _bt_checkkeys - * just set a low-order required array's key to the best available match - * for a truncated -inf attribute value from the prior page's high key - * (array element 0 is always the best available match in this scenario). - * It's quite likely that matches for array element 0 begin on this page, - * but the start of matches won't necessarily align with page boundaries. - * When the start of matches is somewhere in the middle of this page, it - * would be wrong to treat page's final non-pivot tuple as representative. - * Doing so might lead us to treat some of the page's earlier tuples as - * being part of a group of tuples thought to satisfy the required keys. - * - * Note: Conversely, in the case where the scan's arrays just advanced - * using the prior page's HIKEY _without_ advancement setting scanBehind, - * the start of matches must be aligned with page boundaries, which makes - * it safe to attempt the optimization here now. It's also safe when the - * prior page's HIKEY simply didn't need to advance any required array. In - * both cases we can safely assume that the _first_ tuple from this page - * must be >= the current set of array keys/equality constraints. And so - * if the final tuple is == those same keys (and also satisfies any - * required < or <= strategy scan keys) during the precheck, we can safely - * assume that this must also be true of all earlier tuples from the page. + * overhead of the precheck. Also avoid it during scans with array keys, + * which might be using skip scan (XXX fixed in next commit). */ - if (!pstate.firstpage && !so->scanBehind && minoff < maxoff) + if (!pstate.firstpage && !arrayKeys && minoff < maxoff) { ItemId iid; IndexTuple itup; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2aee9bbf67d..108030a8ee7 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -30,6 +30,17 @@ static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, Datum arrdatum, ScanKey cur); +static void _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static void _bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + int32 set_elem_result, Datum tupdatum, bool tupnull); +static void _bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, + BTArrayKeyInfo *array, bool low_not_high); +static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, @@ -207,6 +218,7 @@ _bt_compare_array_skey(FmgrInfo *orderproc, int32 result = 0; Assert(cur->sk_strategy == BTEqualStrategyNumber); + Assert(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))); if (tupnull) /* NULL tupdatum */ { @@ -283,6 +295,8 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum arrdatum; Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(!(cur->sk_flags & SK_BT_SKIP)); + Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */ Assert(cur->sk_strategy == BTEqualStrategyNumber); if (cur_elem_trig) @@ -405,6 +419,186 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, return low_elem; } +/* + * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array + * + * Does not return an index into the array, since skip arrays don't really + * contain elements (they generate their array elements procedurally instead). + * Our interface matches that of _bt_binsrch_array_skey in every other way. + * + * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true + * array. The value 0 indicates that tupdatum/tupnull is within the range of + * the skip array. We return -1 when tupdatum/tupnull is lower that any value + * within the range of the array, and 1 when it is higher than every value. + * Caller should pass *set_elem_result to _bt_skiparray_set_element to advance + * the array. + * + * cur_elem_trig indicates if array advancement was triggered by this array's + * scan key. We use this to optimize-away comparisons that are known by our + * caller to be unnecessary from context, just like _bt_binsrch_array_skey. + */ +static void +_bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems == -1); + Assert(!ScanDirectionIsNoMovement(dir)); + + if (array->null_elem) + { + Assert(!array->low_compare && !array->high_compare); + + *set_elem_result = 0; + return; + } + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + *set_elem_result = -1; /* NULL "<" NOT_NULL */ + else + *set_elem_result = 1; /* NULL ">" NOT_NULL */ + return; + } + + /* + * Array inequalities determine whether tupdatum is within the range of + * caller's skip array + */ + *set_elem_result = 0; + if (ScanDirectionIsForward(dir)) + { + /* + * Evaluate low_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate high_compare + */ + if (!cur_elem_trig && array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + else if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + } + else + { + /* + * Evaluate high_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate low_compare + */ + if (!cur_elem_trig && array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + else if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + } + + /* + * Assert that any keys that were assumed to be satisfied already (due to + * caller passing cur_elem_trig=true) really are satisfied as expected + */ +#ifdef USE_ASSERT_CHECKING + if (cur_elem_trig) + { + if (ScanDirectionIsForward(dir) && array->low_compare) + Assert(DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))); + + if (ScanDirectionIsBackward(dir) && array->high_compare) + Assert(DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))); + } +#endif +} + +/* + * _bt_skiparray_set_element() -- Set skip array scan key's sk_argument + * + * Caller passes set_elem_result returned by _bt_binsrch_skiparray_skey for + * caller's tupdatum/tupnull. + * + * We copy tupdatum/tupnull into skey's sk_argument iff set_elem_result == 0. + * Otherwise, we set skey to either the lowest or highest value that's within + * the range of caller's skip array (whichever is the best available match to + * tupdatum/tupnull that is still within the range of the skip array according + * to _bt_binsrch_skiparray_skey/set_elem_result). + */ +static void +_bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + int32 set_elem_result, Datum tupdatum, bool tupnull) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (set_elem_result) + { + /* tupdatum/tupnull is out of the range of the skip array */ + Assert(!array->null_elem); + + _bt_array_set_low_or_high(rel, skey, array, set_elem_result < 0); + return; + } + + /* Advance skip array to tupdatum (or tupnull) value */ + if (unlikely(tupnull)) + { + _bt_skiparray_set_isnull(rel, skey, array); + return; + } + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* tupdatum becomes new sk_argument/new current element */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + skey->sk_argument = datumCopy(tupdatum, array->attbyval, array->attlen); +} + +/* + * _bt_skiparray_set_isnull() -- set skip array scan key to NULL + */ +static void +_bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(array->null_elem && !array->low_compare && !array->high_compare); + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* NULL becomes new sk_argument/new current element */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -414,29 +608,355 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - int i; Assert(so->numArrayKeys); Assert(so->qual_ok); - for (i = 0; i < so->numArrayKeys; i++) + for (int i = 0; i < so->numArrayKeys; i++) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[curArrayKey->scan_key]; + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; - Assert(curArrayKey->num_elems > 0); Assert(skey->sk_flags & SK_SEARCHARRAY); - if (ScanDirectionIsBackward(dir)) - curArrayKey->cur_elem = curArrayKey->num_elems - 1; - else - curArrayKey->cur_elem = 0; - skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + _bt_array_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); } so->scanBehind = so->oppositeDirCheck = false; /* reset */ } +/* + * _bt_array_set_low_or_high() -- Set array scan key to lowest/highest element + * + * Caller also passes associated scan key, which will have its argument set to + * the lowest/highest array value in passing. + */ +static void +_bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + bool low_not_high) +{ + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + { + /* set low or high element for SAOP array */ + int set_elem = 0; + + Assert(!(skey->sk_flags & SK_BT_SKIP)); + + if (!low_not_high) + set_elem = array->num_elems - 1; + + /* + * Just copy over array datum (only skip arrays require freeing and + * allocating memory for sk_argument) + */ + array->cur_elem = set_elem; + skey->sk_argument = array->elem_values[set_elem]; + + return; + } + + /* set low or high element for skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + + /* Free memory previously allocated for sk_argument if needed */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Reset flags */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR); + + if (array->null_elem && + (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) + { + /* Requested element (either lowest or highest) has the value NULL */ + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else if (low_not_high) + { + /* Setting array to lowest element (according to low_compare) */ + skey->sk_flags |= SK_BT_MINVAL; + } + else + { + /* Setting array to highest element (according to high_compare) */ + skey->sk_flags |= SK_BT_MAXVAL; + } +} + +/* + * _bt_array_decrement() -- decrement array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully decremented. + * Cannot decrement an array whose current element is already the first one. + */ +static bool +_bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool uflow = false; + Datum dec_sk_argument; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_BT_MAXVAL | SK_BT_NEXT | SK_BT_PRIOR))); + + /* SAOP array? */ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); + if (array->cur_elem > 0) + { + /* + * Just decrement current element, and assign its datum to skey + * (only skip arrays need us to free existing sk_argument memory) + */ + array->cur_elem--; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully decremented array */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* + * The sentinel value that represents the minimum value within the range + * of a skip array (often just -inf) is never decrementable + */ + if (skey->sk_flags & SK_BT_MINVAL) + return false; + + /* + * When the current array element is NULL, and the lowest sorting value in + * the index is also NULL, we cannot decrement before first array element + */ + if ((skey->sk_flags & SK_ISNULL) && (skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "decrement" the scan key's current + * element by setting the PRIOR flag. The true prior value is determined + * by repositioning to the last index tuple < existing sk_argument/current + * array element. Note that this works in the usual way when the scan key + * is already marked ISNULL (i.e. when the current element is NULL). + */ + if (!array->sksup) + { + /* Successfully "decremented" array */ + skey->sk_flags |= SK_BT_PRIOR; + return true; + } + + /* + * Opclasses with skip support directly decrement sk_argument + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(!(skey->sk_flags & SK_BT_NULLS_FIRST)); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * "Decrement" from NULL to the high_elem value provided by opclass + * skip support routine. + */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + skey->sk_argument = datumCopy(array->sksup->high_elem, + array->attbyval, array->attlen); + return true; + } + + /* + * Ask opclass support routine to provide decremented copy of existing + * non-NULL sk_argument + */ + dec_sk_argument = array->sksup->decrement(rel, skey->sk_argument, &uflow); + if (unlikely(uflow)) + { + /* dec_sk_argument has undefined value (so no pfree) */ + if (array->null_elem && (skey->sk_flags & SK_BT_NULLS_FIRST)) + { + _bt_skiparray_set_isnull(rel, skey, array); + + /* Successfully "decremented" array to NULL */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* + * Successfully decremented sk_argument to a non-NULL value. Make sure + * that the decremented value is still within the range of the array. + */ + if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + dec_sk_argument, + array->low_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!array->attbyval) + pfree(DatumGetPointer(dec_sk_argument)); + + /* Cannot decrement to before first array element */ + return false; + } + + /* Accept value returned by opclass decrement callback */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = dec_sk_argument; + + /* Successfully decremented array */ + return true; +} + +/* + * _bt_array_increment() -- increment array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully incremented. + * Cannot increment an array whose current element is already the final one. + */ +static bool +_bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool oflow = false; + Datum inc_sk_argument; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_BT_MINVAL | SK_BT_NEXT | SK_BT_PRIOR))); + + /* SAOP array? */ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); + if (array->cur_elem < array->num_elems - 1) + { + /* + * Just increment current element, and assign its datum to skey + * (only skip arrays need us to free existing sk_argument memory) + */ + array->cur_elem++; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully incremented array */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* + * The sentinel value that represents the maximum value within the range + * of a skip array (often just +inf) is never incrementable + */ + if (skey->sk_flags & SK_BT_MAXVAL) + return false; + + /* + * When the current array element is NULL, and the highest sorting value + * in the index is also NULL, we cannot increment past the final element + */ + if ((skey->sk_flags & SK_ISNULL) && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "increment" the scan key's current + * element by setting the NEXT flag. The true next value is determined by + * repositioning to the first index tuple > existing sk_argument/current + * array element. Note that this works in the usual way when the scan key + * is already marked ISNULL (i.e. when the current element is NULL). + */ + if (!array->sksup) + { + /* Successfully "incremented" array */ + skey->sk_flags |= SK_BT_NEXT; + return true; + } + + /* + * Opclasses with skip support directly increment sk_argument + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(skey->sk_flags & SK_BT_NULLS_FIRST); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * "Increment" from NULL to the low_elem value provided by opclass + * skip support routine. + */ + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + skey->sk_argument = datumCopy(array->sksup->low_elem, + array->attbyval, array->attlen); + return true; + } + + /* + * Ask opclass support routine to provide incremented copy of existing + * non-NULL sk_argument + */ + inc_sk_argument = array->sksup->increment(rel, skey->sk_argument, &oflow); + if (unlikely(oflow)) + { + /* inc_sk_argument has undefined value (so no pfree) */ + if (array->null_elem && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + { + _bt_skiparray_set_isnull(rel, skey, array); + + /* Successfully "incremented" array to NULL */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* + * Successfully incremented sk_argument to a non-NULL value. Make sure + * that the incremented value is still within the range of the array. + */ + if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + inc_sk_argument, + array->high_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!array->attbyval) + pfree(DatumGetPointer(inc_sk_argument)); + + /* Cannot increment past final array element */ + return false; + } + + /* Accept value returned by opclass increment callback */ + if (!array->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = inc_sk_argument; + + /* Successfully incremented array */ + return true; +} + /* * _bt_advance_array_keys_increment() -- Advance to next set of array elements * @@ -452,6 +972,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; /* @@ -461,29 +982,30 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) */ for (int i = so->numArrayKeys - 1; i >= 0; i--) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[curArrayKey->scan_key]; - int cur_elem = curArrayKey->cur_elem; - int num_elems = curArrayKey->num_elems; - bool rolled = false; + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; - if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) + if (ScanDirectionIsForward(dir)) { - cur_elem = 0; - rolled = true; + if (_bt_array_increment(rel, skey, array)) + return true; } - else if (ScanDirectionIsBackward(dir) && --cur_elem < 0) + else { - cur_elem = num_elems - 1; - rolled = true; + if (_bt_array_decrement(rel, skey, array)) + return true; } - curArrayKey->cur_elem = cur_elem; - skey->sk_argument = curArrayKey->elem_values[cur_elem]; - if (!rolled) - return true; + /* + * Couldn't increment (or decrement) array. Handle array roll over. + * + * Start over at the array's lowest sorting value (or its highest + * value, for backward scans)... + */ + _bt_array_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); - /* Need to advance next array key, if any */ + /* ...then increment (or decrement) next most significant array */ } /* @@ -507,7 +1029,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) } /* - * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays + * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required * * Called when _bt_advance_array_keys decides to start a new primitive index * scan on the basis of the current scan position being before the position @@ -539,10 +1061,15 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) * * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that * everybody got this right. + * + * Note: In practice almost all SAOP arrays are marked required during + * preprocessing (if necessary by generating skip arrays). It is hardly ever + * truly necessary to call here, but consistently doing so is simpler. */ static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; int arrayidx = 0; @@ -550,7 +1077,6 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { ScanKey cur = so->keyData + ikey; BTArrayKeyInfo *array = NULL; - int first_elem_dir; if (!(cur->sk_flags & SK_SEARCHARRAY) || cur->sk_strategy != BTEqualStrategyNumber) @@ -562,16 +1088,10 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) continue; - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; + Assert(array->num_elems != -1); /* No non-required skip arrays */ - if (array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -696,9 +1216,77 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); - result = _bt_compare_array_skey(&so->orderProcs[ikey], - tupdatum, tupnull, - cur->sk_argument, cur); + if (likely(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))) + { + /* Scankey has a valid/comparable sk_argument value */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + + if (result == 0) + { + /* + * Interpret result in a way that takes NEXT/PRIOR into + * account + */ + if (cur->sk_flags & SK_BT_NEXT) + result = -1; + else if (cur->sk_flags & SK_BT_PRIOR) + result = 1; + + Assert(result == 0 || (cur->sk_flags & SK_BT_SKIP)); + } + } + else + { + BTArrayKeyInfo *array = NULL; + + /* + * Current array element/array = scan key value is a sentinel + * value that represents the lowest (or highest) possible value + * that's still within the range of the array. + * + * Like _bt_first, we only see MINVAL keys during forwards scans + * (and similarly only see MAXVAL keys during backwards scans). + * Even if the scan's direction changes, we'll stop at some higher + * order key before we can ever reach any MAXVAL (or MINVAL) keys. + * (However, unlike _bt_first we _can_ get to keys marked either + * NEXT or PRIOR, regardless of the scan's current direction.) + */ + Assert(ScanDirectionIsForward(dir) ? + !(cur->sk_flags & SK_BT_MAXVAL) : + !(cur->sk_flags & SK_BT_MINVAL)); + + /* + * There are no valid sk_argument values in MINVAL/MAXVAL keys. + * Check if tupdatum is within the range of skip array instead. + */ + for (int arrayidx = 0; arrayidx < so->numArrayKeys; arrayidx++) + { + array = &so->arrayKeys[arrayidx]; + if (array->scan_key == ikey) + break; + } + + _bt_binsrch_skiparray_skey(false, dir, tupdatum, tupnull, + array, cur, &result); + + if (result == 0) + { + /* + * tupdatum satisfies both low_compare and high_compare, so + * it's time to advance the array keys. + * + * Note: It's possible that the skip array will "advance" from + * its MINVAL (or MAXVAL) representation to an alternative, + * logically equivalent representation of the same value: a + * representation where the = key gets a valid datum in its + * sk_argument. This is only possible when low_compare uses + * the >= strategy (or high_compare uses the <= strategy). + */ + return false; + } + } /* * Does this comparison indicate that caller must _not_ advance the @@ -1017,18 +1605,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (beyond_end_advance) { - int final_elem_dir; - - if (ScanDirectionIsBackward(dir) || !array) - final_elem_dir = 0; - else - final_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != final_elem_dir) - { - array->cur_elem = final_elem_dir; - cur->sk_argument = array->elem_values[final_elem_dir]; - } + if (array) + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); continue; } @@ -1053,18 +1632,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (!all_required_satisfied || cur->sk_attno > tupnatts) { - int first_elem_dir; - - if (ScanDirectionIsForward(dir) || !array) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + if (array) + _bt_array_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); continue; } @@ -1080,14 +1650,22 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, bool cur_elem_trig = (sktrig_required && ikey == sktrig); /* - * Binary search for closest match that's available from the array + * "Binary search" by checking if tupdatum/tupnull are within the + * range of the skip array */ - set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], - cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); + if (array->num_elems == -1) + _bt_binsrch_skiparray_skey(cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); - Assert(set_elem >= 0 && set_elem < array->num_elems); + /* + * Binary search for the closest match from the SAOP array + */ + else + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); } else { @@ -1163,11 +1741,21 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } } - /* Advance array keys, even when set_elem isn't an exact match */ - if (array && array->cur_elem != set_elem) + /* Advance array keys, even when we don't have an exact match */ + if (array) { - array->cur_elem = set_elem; - cur->sk_argument = array->elem_values[set_elem]; + if (array->num_elems == -1) + { + /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ + _bt_skiparray_set_element(rel, cur, array, result, + tupdatum, tupnull); + } + else if (array->cur_elem != set_elem) + { + /* SAOP array's new element is set_elem datum */ + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; + } } } @@ -1581,10 +2169,11 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->scan_key != ikey) return false; - if (array->num_elems <= 0) + if (array->num_elems == 0 || array->num_elems < -1) return false; - if (cur->sk_argument != array->elem_values[array->cur_elem]) + if (array->num_elems != -1 && + cur->sk_argument != array->elem_values[array->cur_elem]) return false; if (last_sk_attno > cur->sk_attno) return false; @@ -1914,6 +2503,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, continue; } + /* + * A skip array scan key uses one of several sentinel values. We just + * fall back on _bt_tuple_before_array_skeys when we see such a value. + */ + if (key->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL | + SK_BT_NEXT | SK_BT_PRIOR)) + { + Assert(key->sk_flags & SK_SEARCHARRAY); + Assert(key->sk_flags & SK_BT_SKIP); + + *continuescan = false; + return false; + } + /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { @@ -1939,6 +2542,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, else { Assert(key->sk_flags & SK_SEARCHNOTNULL); + Assert(!(key->sk_flags & SK_BT_SKIP)); if (!isNull) continue; /* tuple satisfies this qual */ } diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c index dd6f5a15c65..817ad358f0c 100644 --- a/src/backend/access/nbtree/nbtvalidate.c +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -106,6 +106,10 @@ btvalidate(Oid opclassoid) case BTOPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case BTSKIPSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), -- cgit v1.2.3