From be8a7a6866276b228b4ffaa3003e1dc2dd1d140a Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Wed, 21 Mar 2018 14:57:42 +0300 Subject: Add strict_word_similarity to pg_trgm module strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com --- doc/src/sgml/pgtrgm.sgml | 88 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 2 deletions(-) (limited to 'doc/src') diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index b5d893c9fbb..8f395296d8f 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -105,6 +105,17 @@ the explanation below. + + + strict_word_similarity(text, text) + strict_word_similarity + + real + + Same as word_similarity(text, text), but forces + extent boundaries to match word boundaries. + + show_limit()show_limit real @@ -157,6 +168,29 @@ a part of the word. + + At the same time, strict_word_similarity(text, text) + has to select an extent that matches word boundaries. In the example above, + strict_word_similarity(text, text) would select the + extent {" w"," wo","wor","ord","rds", ds "}, which + corresponds to the whole word 'words'. + + +# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words'); + strict_word_similarity | similarity +------------------------+------------ + 0.571429 | 0.571429 +(1 row) + + + + + Thus, the strict_word_similarity(text, text) function + is useful for finding similar subsets of whole words, while + word_similarity(text, text) is more suitable for + searching similar parts of words. + + <filename>pg_trgm</filename> Operators @@ -196,6 +230,24 @@ Commutator of the <% operator. + + text <<% text + boolean + + Returns true if its second argument has a continuous + extent of an ordered trigram set that matches word boundaries, + and its similarity to the trigram set of the first argument is greater + than the current strict word similarity threshold set by the + pg_trgm.strict_word_similarity_threshold parameter. + + + + text %>> text + boolean + + Commutator of the <<% operator. + + text <-> text real @@ -223,6 +275,25 @@ Commutator of the <<-> operator. + + + text <<<-> text + + real + + Returns the distance between the arguments, that is + one minus the strict_word_similarity() value. + + + + + text <->>> text + + real + + Commutator of the <<<-> operator. + +
@@ -322,12 +393,19 @@ SELECT t, t <-> 'word' AS dist Also you can use an index on the t column for word - similarity. For example: + similarity or strict word similarity. Typical queries are: SELECT t, word_similarity('word', t) AS sml FROM test_trgm WHERE 'word' <% t ORDER BY sml DESC, t; + + and + +SELECT t, strict_word_similarity('word', t) AS sml + FROM test_trgm + WHERE 'word' <<% t + ORDER BY sml DESC, t; This will return all values in the text column for which there is a continuous extent in the corresponding ordered trigram set that is @@ -337,11 +415,17 @@ SELECT t, word_similarity('word', t) AS sml - A variant of the above query is + Possible variants of the above queries are: SELECT t, 'word' <<-> t AS dist FROM test_trgm ORDER BY dist LIMIT 10; + + and + +SELECT t, 'word' <<<-> t AS dist + FROM test_trgm + ORDER BY dist LIMIT 10; This can be implemented quite efficiently by GiST indexes, but not by GIN indexes. -- cgit v1.2.3