From 9b5c8d45f62bd3d243a40cc84deb93893f2f5122 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 14 Apr 2008 17:05:34 +0000 Subject: Push index operator lossiness determination down to GIST/GIN opclass "consistent" functions, and remove pg_amop.opreqcheck, as per recent discussion. The main immediate benefit of this is that we no longer need 8.3's ugly hack of requiring @@@ rather than @@ to test weight-using tsquery searches on GIN indexes. In future it should be possible to optimize some other queries better than is done now, by detecting at runtime whether the index match is exact or not. Tom Lane, after an idea of Heikki's, and with some help from Teodor. --- doc/src/sgml/catalogs.sgml | 9 +---- doc/src/sgml/func.sgml | 4 +- doc/src/sgml/gin.sgml | 11 ++++-- doc/src/sgml/gist.sgml | 7 +++- doc/src/sgml/indexam.sgml | 20 +++++----- doc/src/sgml/ref/alter_opfamily.sgml | 24 +++++------- doc/src/sgml/ref/create_opclass.sgml | 30 +++++++-------- doc/src/sgml/textsearch.sgml | 72 ++++-------------------------------- doc/src/sgml/xindex.sgml | 25 ++++++++----- 9 files changed, 70 insertions(+), 132 deletions(-) (limited to 'doc/src') diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index a6aaa4fd5a8..f43c46908c0 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1,4 +1,4 @@ - + @@ -606,13 +606,6 @@ Operator strategy number - - amopreqcheck - bool - - Index hit must be rechecked - - amopopr oid diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index c15de7b1c9e..d2af5e63ee8 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,4 +1,4 @@ - + Functions and Operators @@ -7738,7 +7738,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple @@@ - same as @@, but see + deprecated synonym for @@ to_tsvector('fat cats ate rats') @@@ to_tsquery('cat & rat') t diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml index 1b7f105ac56..ad82da6b38e 100644 --- a/doc/src/sgml/gin.sgml +++ b/doc/src/sgml/gin.sgml @@ -1,4 +1,4 @@ - + GIN Indexes @@ -111,12 +111,12 @@ - bool consistent(bool check[], StrategyNumber n, Datum query) + bool consistent(bool check[], StrategyNumber n, Datum query, bool *recheck) Returns TRUE if the indexed value satisfies the query operator with - strategy number n (or would satisfy, if the operator is - marked RECHECK in the operator class). The check array has + strategy number n (or might satisfy, if the recheck + indication is returned). The check array has the same length as the number of keys previously returned by extractQuery for this query. Each element of the check array is TRUE if the indexed value contains the @@ -124,6 +124,9 @@ extractQuery result array is present in the indexed value. The original query datum (not the extracted key array!) is passed in case the consistent method needs to consult it. + On success, *recheck should be set to TRUE if the heap + tuple needs to be rechecked against the query operator, or FALSE if + the index test is exact. diff --git a/doc/src/sgml/gist.sgml b/doc/src/sgml/gist.sgml index 587517da1d1..f236e6ad614 100644 --- a/doc/src/sgml/gist.sgml +++ b/doc/src/sgml/gist.sgml @@ -1,4 +1,4 @@ - + GiST Indexes @@ -103,7 +103,10 @@ Given a predicate p on a tree page, and a user query, q, this method will return false if it is certain that both p and q cannot - be true for a given data item. + be true for a given data item. For a true result, a + recheck flag must also be returned; this indicates whether + the predicate implies the query (recheck = false) or + not (recheck = true). diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 1b42ff0231a..760d8d1a206 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -1,4 +1,4 @@ - + Index Access Method Interface Definition @@ -183,7 +183,7 @@ aminsert (Relation indexRelation, parameter. See for details. The result is TRUE if an index entry was inserted, FALSE if not. (A FALSE result does not denote an error condition, but is used for cases such - as an index AM refusing to index a NULL.) + as an index method refusing to index a NULL.) @@ -430,13 +430,13 @@ amrestrpos (IndexScanDesc scan); - The operator family can indicate that the index is lossy for a - particular operator; this implies that the index scan will return all the - entries that pass the scan key, plus possibly additional entries that do - not. The core system's index-scan machinery will then apply that operator - again to the heap tuple to verify whether or not it really should be - selected. For non-lossy operators, the index scan must return exactly the - set of matching entries, as there is no recheck. + The access method can report that the index is lossy, or + requires rechecks, for a particular query. This implies that the index + scan will return all the entries that pass the scan key, plus possibly + additional entries that do not. The core system's index-scan machinery + will then apply the index conditions again to the heap tuple to verify + whether or not it really should be selected. If the recheck option is not + specified, the index scan must return exactly the set of matching entries. @@ -849,7 +849,7 @@ amcostestimate (PlannerInfo *root, The indexSelectivity should be set to the estimated fraction of the parent table rows that will be retrieved during the index scan. In the case - of a lossy index, this will typically be higher than the fraction of + of a lossy query, this will typically be higher than the fraction of rows that actually pass the given qual conditions. diff --git a/doc/src/sgml/ref/alter_opfamily.sgml b/doc/src/sgml/ref/alter_opfamily.sgml index cbb1c7b2786..5ebd27507ca 100644 --- a/doc/src/sgml/ref/alter_opfamily.sgml +++ b/doc/src/sgml/ref/alter_opfamily.sgml @@ -1,5 +1,5 @@ @@ -21,7 +21,7 @@ PostgreSQL documentation ALTER OPERATOR FAMILY name USING index_method ADD - { OPERATOR strategy_number operator_name ( op_type, op_type ) [ RECHECK ] + { OPERATOR strategy_number operator_name ( op_type, op_type ) | FUNCTION support_number [ ( op_type [ , op_type ] ) ] funcname ( argument_type [, ...] ) } [, ... ] ALTER OPERATOR FAMILY name USING index_method DROP @@ -154,18 +154,6 @@ ALTER OPERATOR FAMILY name USING support_number @@ -247,6 +235,14 @@ ALTER OPERATOR FAMILY name USING name [ DEFAULT ] FOR TYPE data_type USING index_method [ FAMILY family_name ] AS - { OPERATOR strategy_number operator_name [ ( op_type, op_type ) ] [ RECHECK ] + { OPERATOR strategy_number operator_name [ ( op_type, op_type ) ] | FUNCTION support_number [ ( op_type [ , op_type ] ) ] funcname ( argument_type [, ...] ) | STORAGE storage_type } [, ... ] @@ -179,18 +179,6 @@ CREATE OPERATOR CLASS name [ DEFAUL - - RECHECK - - - If present, the index is lossy for this operator, and - so the rows retrieved using the index must be rechecked to - verify that they actually satisfy the qualification clause - involving this operator. - - - - support_number @@ -256,6 +244,14 @@ CREATE OPERATOR CLASS name [ DEFAUL is likely to be inlined into the calling query, which will prevent the optimizer from recognizing that the query matches an index. + + + Before PostgreSQL 8.4, the OPERATOR + clause could include a RECHECK option. This is no longer + supported because whether an index operator is lossy is now + determined on-the-fly at runtime. This allows efficient handling of + cases where an operator might or might not be lossy. + @@ -271,12 +267,12 @@ CREATE OPERATOR CLASS name [ DEFAUL CREATE OPERATOR CLASS gist__int_ops DEFAULT FOR TYPE _int4 USING gist AS OPERATOR 3 &&, - OPERATOR 6 = RECHECK, + OPERATOR 6 = (anyarray, anyarray), OPERATOR 7 @>, OPERATOR 8 <@, OPERATOR 20 @@ (_int4, query_int), - FUNCTION 1 g_int_consistent (internal, _int4, int4), - FUNCTION 2 g_int_union (bytea, internal), + FUNCTION 1 g_int_consistent (internal, _int4, int, oid, internal), + FUNCTION 2 g_int_union (internal, internal), FUNCTION 3 g_int_compress (internal), FUNCTION 4 g_int_decompress (internal), FUNCTION 5 g_int_penalty (internal, internal, internal), diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 1aec17efd97..caa8847ef8e 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -3142,19 +3142,7 @@ SELECT plainto_tsquery('supernovae stars'); A GiST index is lossy, meaning that the index may produce false matches, and it is necessary to check the actual table row to eliminate such false matches. - PostgreSQL does this automatically; for - example, in the query plan below, the Filter: - line indicates the index output will be rechecked: - - -EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); - QUERY PLAN -------------------------------------------------------------------------- - Index Scan using textsearch_gidx on apod (cost=0.00..12.29 rows=2 width=1469) - Index Cond: (textsearch @@ '''supernova'''::tsquery) - Filter: (textsearch @@ '''supernova'''::tsquery) - - + (PostgreSQL does this automatically when needed.) GiST indexes are lossy because each document is represented in the index by a fixed-length signature. The signature is generated by hashing each word into a random bit in an n-bit string, with all these bits OR-ed @@ -3174,57 +3162,11 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); - GIN indexes are not lossy but their performance depends logarithmically on - the number of unique words. - - - - Actually, GIN indexes store only the words (lexemes) of tsvector - values, and not their weight labels. Thus, while a GIN index can be - considered non-lossy for a query that does not specify weights, it is - lossy for one that does. Thus a table row recheck is needed when using - a query that involves weights. Unfortunately, in the current design of - PostgreSQL, whether a recheck is needed is a static - property of a particular operator, and not something that can be enabled - or disabled on-the-fly depending on the values given to the operator. - To deal with this situation without imposing the overhead of rechecks - on queries that do not need them, the following approach has been - adopted: - - - - - - The standard text match operator @@ is marked as non-lossy - for GIN indexes. - - - - - - An additional match operator @@@ is provided, and marked - as lossy for GIN indexes. This operator behaves exactly like - @@ otherwise. - - - - - - When a GIN index search is initiated with the @@ operator, - the index support code will throw an error if the query specifies any - weights. This protects against giving wrong answers due to failure - to recheck the weights. - - - - - - In short, you must use @@@ rather than @@ to - perform GIN index searches on queries that involve weight restrictions. - For queries that do not have weight restrictions, either operator will - work, but @@ will be faster. - This awkwardness will probably be addressed in a future release of - PostgreSQL. + GIN indexes are not lossy for standard queries, but their performance + depends logarithmically on the number of unique words. + (However, GIN indexes store only the words (lexemes) of tsvector + values, and not their weight labels. Thus a table row recheck is needed + when using a query that involves weights.) diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 68d3123ef85..6bf7535b636 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -1,4 +1,4 @@ - + Interfacing Extensions To Indexes @@ -913,26 +913,31 @@ ALTER OPERATOR FAMILY integer_ops USING btree ADD Normally, declaring an operator as a member of an operator class - (or family) means - that the index method can retrieve exactly the set of rows + (or family) means that the index method can retrieve exactly the set of rows that satisfy a WHERE condition using the operator. For example: SELECT * FROM table WHERE integer_column < 4; can be satisfied exactly by a B-tree index on the integer column. But there are cases where an index is useful as an inexact guide to - the matching rows. For example, if a GiST index stores only - bounding boxes for objects, then it cannot exactly satisfy a WHERE + the matching rows. For example, if a GiST index stores only bounding boxes + for geometric objects, then it cannot exactly satisfy a WHERE condition that tests overlap between nonrectangular objects such as polygons. Yet we could use the index to find objects whose bounding box overlaps the bounding box of the target object, and then do the exact overlap test only on the objects found by the index. If this scenario applies, the index is said to be lossy for the - operator, and we add RECHECK to the OPERATOR clause - in the CREATE OPERATOR CLASS command. - RECHECK is valid if the index is guaranteed to return - all the required rows, plus perhaps some additional rows, which - can be eliminated by performing the original operator invocation. + operator. Lossy index searches are implemented by having the index + method return a recheck flag when a row might or might + not really satisfy the query condition. The core system will then + test the original query condition on the retrieved row to see whether + it should be returned as a valid match. This approach works if + the index is guaranteed to return all the required rows, plus perhaps + some additional rows, which can be eliminated by performing the original + operator invocation. The index methods that support lossy searches + (currently, GiST and GIN) allow the support functions of individual + operator classes to set the recheck flag, and so this is essentially an + operator-class feature. -- cgit v1.2.3