From be8a7a6866276b228b4ffaa3003e1dc2dd1d140a Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Wed, 21 Mar 2018 14:57:42 +0300
Subject: Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to existing word_similarity function but
it takes into account word boundaries to compute similarity.

Author: Alexander Korotkov
Review by: David Steele, Liudmila Mantrova, me
Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
---
 doc/src/sgml/pgtrgm.sgml | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 2 deletions(-)

(limited to 'doc/src')
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml
index b5d893c9fbb..8f395296d8f 100644
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -105,6 +105,17 @@
        the explanation below.
       </entry>
      </row>
+     <row>
+      <entry>
+       <function>strict_word_similarity(text, text)</function>
+       <indexterm><primary>strict_word_similarity</primary></indexterm>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Same as <function>word_similarity(text, text)</function>, but forces
+       extent boundaries to match word boundaries.
+      </entry>
+     </row>
      <row>
       <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
       <entry><type>real</type></entry>
@@ -157,6 +168,29 @@
    a part of the word.
   </para>
 
+  <para>
+   At the same time, <function>strict_word_similarity(text, text)</function>
+   has to select an extent that matches word boundaries.  In the example above,
+   <function>strict_word_similarity(text, text)</function> would select the
+   extent <literal>{"  w"," wo","wor","ord","rds", ds "}</literal>, which
+   corresponds to the whole word <literal>'words'</literal>.
+
+<programlisting>
+# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
+ strict_word_similarity | similarity
+------------------------+------------
+               0.571429 |   0.571429
+(1 row)
+</programlisting>
+  </para>
+
+  <para>
+   Thus, the <function>strict_word_similarity(text, text)</function> function
+   is useful for finding similar subsets of whole words, while
+   <function>word_similarity(text, text)</function> is more suitable for
+   searching similar parts of words.
+  </para>
+
   <table id="pgtrgm-op-table">
    <title><filename>pg_trgm</filename> Operators</title>
    <tgroup cols="3">
@@ -196,6 +230,24 @@
        Commutator of the <literal>&lt;%</literal> operator.
       </entry>
      </row>
+     <row>
+      <entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Returns <literal>true</literal> if its second argument has a continuous
+       extent of an ordered trigram set that matches word boundaries,
+       and its similarity to the trigram set of the first argument is greater
+       than the current strict word similarity threshold set by the
+       <varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;%</literal> operator.
+      </entry>
+     </row>
      <row>
       <entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
       <entry><type>real</type></entry>
@@ -223,6 +275,25 @@
        Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
       </entry>
      </row>
+     <row>
+      <entry>
+       <type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns the <quote>distance</quote> between the arguments, that is
+       one minus the <function>strict_word_similarity()</function> value.
+      </entry>
+     </row>
+     <row>
+      <entry>
+       <type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
+      </entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist
 
   <para>
    Also you can use an index on the <structfield>t</structfield> column for word
-   similarity.  For example:
+   similarity or strict word similarity.  Typical queries are:
 <programlisting>
 SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
   FROM test_trgm
   WHERE '<replaceable>word</replaceable>' &lt;% t
   ORDER BY sml DESC, t;
+</programlisting>
+   and
+<programlisting>
+SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
+  FROM test_trgm
+  WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
+  ORDER BY sml DESC, t;
 </programlisting>
    This will return all values in the text column for which there is a
    continuous extent in the corresponding ordered trigram set that is
@@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
   </para>
 
   <para>
-   A variant of the above query is
+   Possible variants of the above queries are:
 <programlisting>
 SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
   FROM test_trgm
   ORDER BY dist LIMIT 10;
+</programlisting>
+   and
+<programlisting>
+SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
 </programlisting>
    This can be implemented quite efficiently by GiST indexes, but not
    by GIN indexes.
-- 
cgit v1.2.3