From bb140506df605fab58f48926ee1db1f80bdafb59 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Thu, 7 Apr 2016 18:44:18 +0300 Subject: Phrase full text search. Patch introduces new text search operator (<-> or <N>) into tsquery. On-disk and binary in/out format of tsquery are backward compatible. It has two side effects: - change order for tsquery, so users who have a btree index over tsquery should reindex it - fewer parentheses in tsquery output, and tsquery becomes more readable Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov Reviewers: Alexander Korotkov, Artur Zakirov --- doc/src/sgml/datatype.sgml | 9 ++- doc/src/sgml/func.sgml | 39 ++++++++++ doc/src/sgml/textsearch.sgml | 182 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 215 insertions(+), 15 deletions(-) (limited to 'doc/src') diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 7c3ef92cd2e..0b60c61d480 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -3924,8 +3924,9 @@ SELECT to_tsvector('english', 'The Fat Rats'); A tsquery value stores lexemes that are to be searched for, and combines them honoring the Boolean operators - & (AND), | (OR), and - ! (NOT). Parentheses can be used to enforce grouping + & (AND), | (OR), + ! (NOT) and <-> (FOLLOWED BY) phrase search + operator. Parentheses can be used to enforce grouping of the operators: @@ -3946,8 +3947,8 @@ SELECT 'fat & rat & ! cat'::tsquery; In the absence of parentheses, ! (NOT) binds most tightly, - and & (AND) binds more tightly than - | (OR). + and & (AND) and <-> (FOLLOWED BY) + both bind more tightly than | (OR). diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 15b6b4eb3d5..9b0778baa99 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9127,6 +9127,12 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple !! 
'cat'::tsquery !'cat' + + <-> + tsquery followed by tsquery + to_tsquery('fat') <-> to_tsquery('rat') + 'fat' <-> 'rat' + @> tsquery contains another ? @@ -9219,6 +9225,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple plainto_tsquery('english', 'The Fat Rats') 'fat' & 'rat' + + + + phraseto_tsquery + + phraseto_tsquery( config regconfig , query text) + + tsquery + produce tsquery ignoring punctuation + phraseto_tsquery('english', 'The Fat Rats') + 'fat' <-> 'rat' + @@ -9421,6 +9439,27 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple SELECT ts_rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases') 'b' & ( 'foo' | 'bar' ) + + + + tsquery_phrase + + tsquery_phrase(query1 tsquery, query2 tsquery) + + tsquery + implementation of <-> (FOLLOWED BY) operator + tsquery_phrase(to_tsquery('fat'), to_tsquery('cat')) + 'fat' <-> 'cat' + + + + tsquery_phrase(query1 tsquery, query2 tsquery, distance integer) + + tsquery + phrase-concatenate with distance + tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10) + 'fat' <10> 'cat' + diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index ea3abc9e15a..930c8f0a5dc 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -263,9 +263,10 @@ SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::t As the above example suggests, a tsquery is not just raw text, any more than a tsvector is. A tsquery contains search terms, which must be already-normalized lexemes, and - may combine multiple terms using AND, OR, and NOT operators. + may combine multiple terms using AND, OR, NOT and FOLLOWED BY operators. (For details see .) There are - functions to_tsquery and plainto_tsquery + functions to_tsquery, plainto_tsquery + and phraseto_tsquery that are helpful in converting user-written text into a proper tsquery, for example by normalizing words appearing in the text. 
Similarly, to_tsvector is used to parse and @@ -293,6 +294,35 @@ SELECT 'fat cats ate fat rats'::tsvector @@ to_tsquery('fat & rat'); already normalized, so rats does not match rat. + + Phrase search is made possible with the help of the <-> + (FOLLOWED BY) operator, which enforces lexeme order. This allows you + to discard strings not containing the desired phrase, for example: + + +SELECT q @@ to_tsquery('fatal <-> error') +FROM unnest(array[to_tsvector('fatal error'), + to_tsvector('error is not fatal')]) AS q; + ?column? +---------- + t + f + + + A more generic version of the FOLLOWED BY operator takes the form + <N>, where N stands for the greatest allowed distance + between the specified lexemes. The phraseto_tsquery + function makes use of this behavior in order to construct a + tsquery capable of matching the provided phrase: + + +SELECT phraseto_tsquery('cat ate some rats'); + phraseto_tsquery +------------------------------- + ( 'cat' <-> 'ate' ) <2> 'rat' + + + The @@ operator also supports text input, allowing explicit conversion of a text @@ -709,11 +739,14 @@ UPDATE tt SET ti = PostgreSQL provides the - functions to_tsquery and - plainto_tsquery for converting a query to - the tsquery data type. to_tsquery - offers access to more features than plainto_tsquery, - but is less forgiving about its input. + functions to_tsquery, + plainto_tsquery and + phraseto_tsquery + for converting a query to the tsquery data type. + to_tsquery offers access to more features + than both plainto_tsquery and + phraseto_tsquery, but is less forgiving + about its input. @@ -728,7 +761,8 @@ to_tsquery( config to_tsquery creates a tsquery value from querytext, which must consist of single tokens separated by the Boolean operators & (AND), - | (OR) and ! (NOT). These operators + | (OR), ! (NOT), and also the + <-> (FOLLOWED BY) phrase search operator. These operators can be grouped using parentheses. 
In other words, the input to to_tsquery must already follow the general rules for tsquery input, as described in Note that plainto_tsquery cannot - recognize Boolean operators, weight labels, or prefix-match labels - in its input: + recognize Boolean and phrase search operators, weight labels, + or prefix-match labels in its input: SELECT plainto_tsquery('english', 'The Fat & Rats:C'); @@ -827,6 +861,57 @@ SELECT plainto_tsquery('english', 'The Fat & Rats:C'); Here, all the input punctuation was discarded as being space symbols. + + phraseto_tsquery + + + +phraseto_tsquery( config regconfig, querytext text) returns tsquery + + + + phraseto_tsquery behaves much like + plainto_tsquery, with the exception + that it utilizes the <-> (FOLLOWED BY) phrase search + operator instead of the & (AND) Boolean operator. + This is particularly useful when searching for exact lexeme sequences, + since the phrase search operator helps to maintain lexeme order. + + + + Example: + + +SELECT phraseto_tsquery('english', 'The Fat Rats'); + phraseto_tsquery +------------------ + 'fat' <-> 'rat' + + + Just like the plainto_tsquery, the + phraseto_tsquery function cannot + recognize Boolean and phrase search operators, weight labels, + or prefix-match labels in its input: + + +SELECT phraseto_tsquery('english', 'The Fat & Rats:C'); + phraseto_tsquery +----------------------------- + ( 'fat' <-> 'rat' ) <-> 'c' + + + It is possible to specify the configuration to be used to parse the document, + for example, we could create a new one using the hunspell dictionary + (namely 'eng_hunspell') in order to match phrases with different word forms: + + +SELECT phraseto_tsquery('eng_hunspell', 'developer of the building which collapsed'); + phraseto_tsquery +-------------------------------------------------------------------------------------------- + ( 'developer' <3> 'building' ) <2> 'collapse' | ( 'developer' <3> 'build' ) <2> 'collapse' + + + @@ -1387,6 +1472,81 @@ FROM (SELECT id, body, q, 
ts_rank_cd(ti, q) AS rank + + + + tsquery <-> tsquery + + + + + Returns the phrase-concatenation of the two given queries. + + +SELECT to_tsquery('fat') <-> to_tsquery('cat | rat'); + ?column? +----------------------------------- + 'fat' <-> 'cat' | 'fat' <-> 'rat' + + + + + + + + + + + tsquery_phrase + + + tsquery_phrase(query1 tsquery, query2 tsquery [, distance integer ]) returns tsquery + + + + + Returns the distanced phrase-concatenation of the two given queries. + This function underlies the implementation of the <-> operator. + + +SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10); + tsquery_phrase +------------------ + 'fat' <10> 'cat' + + + + + + + + + + + setweight + + + setweight(query tsquery, weight "char") returns tsquery + + + + + setweight returns a copy of the input query in which every + position has been labeled with the given weight(s), either + A, B, C, + D or their combination. These labels are retained when + queries are concatenated, allowing words from different parts of a document + to be weighted differently by ranking functions. + + + + Note that weight labels apply to positions, not + lexemes. If the input query has been stripped of + positions then setweight does nothing. + + + + @@ -2428,7 +2588,7 @@ more sample word(s) : more indexed word(s) Specific stop words recognized by the subdictionary cannot be - specified; instead use ? to mark the location where any + specified; instead use <-> to mark the location where any stop word can appear. For example, assuming that a and the are stop words according to the subdictionary: -- cgit v1.2.3