From f576b17cd6ba653bdace1f0da9a3b57f4984e460 Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Wed, 16 Mar 2016 18:59:21 +0300
Subject: Add word_similarity to pg_trgm contrib module.

Patch introduces a concept of similarity over string and just a word from
another string.

Version of extension is not changed because 1.2 was already introduced in 9.6
release cycle, so, there wasn't a public version.

Author: Alexander Korotkov, Artur Zakirov
---
 doc/src/sgml/pgtrgm.sgml | 79 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'doc/src')
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml
index 1a9b5a796f1..cdce17251f6 100644
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -92,6 +92,21 @@
        (In practice this is seldom useful except for debugging.)
       </entry>
      </row>
+     <row>
+      <entry>
+       <function>word_similarity(text, text)</function>
+       <indexterm><primary>word_similarity</primary></indexterm>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns a number that indicates how similar the first string
+       to the most similar word of the second string. The function searches in
+       the second string a most similar word not a most similar substring.  The
+       range of the result is zero (indicating that the two strings are
+       completely dissimilar) to one (indicating that the first string is
+       identical to one of the word of the second string).
+      </entry>
+     </row>
      <row>
       <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
       <entry><type>real</type></entry>
@@ -137,6 +152,16 @@
        <varname>pg_trgm.similarity_threshold</>.
       </entry>
      </row>
+     <row>
+       <entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
+       <entry><type>boolean</type></entry>
+       <entry>
+        Returns <literal>true</> if its first argument has the similar word in
+        the second argument and they have a similarity that is greater than the
+        current word similarity threshold set by
+        <varname>pg_trgm.word_similarity_threshold</> parameter.
+       </entry>
+      </row>
      <row>
       <entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
       <entry><type>real</type></entry>
@@ -145,6 +170,16 @@
        one minus the <function>similarity()</> value.
       </entry>
      </row>
+     <row>
+       <entry>
+        <type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
+       </entry>
+       <entry><type>real</type></entry>
+       <entry>
+        Returns the <quote>distance</> between the arguments, that is
+        one minus the <function>word_similarity()</> value.
+       </entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -168,6 +203,23 @@
      </para>
     </listitem>
    </varlistentry>
+    <varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
+     <term>
+      <varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
+      <indexterm>
+       <primary>
+        <varname>pg_trgm.word_similarity_threshold</> configuration parameter
+       </primary>
+      </indexterm>
+     </term>
+     <listitem>
+      <para>
+       Sets the current word similarity threshold that is used by
+       the <literal>%&gt;</> operator.  The threshold must be between
+       0 and 1 (default is 0.6).
+      </para>
+     </listitem>
+    </varlistentry>
   </variablelist>
  </sect2>
 
@@ -225,6 +277,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
    a small number of the closest matches is wanted.
   </para>
 
+  <para>
+   Also you can use an index on the <structfield>t</> column for word
+   similarity.  For example:
+<programlisting>
+SELECT t, word_similarity('<replaceable>word</>', t) AS sml
+  FROM test_trgm
+  WHERE t %&gt; '<replaceable>word</>'
+  ORDER BY sml DESC, t;
+</programlisting>
+   This will return all values in the text column that have a word
+   which sufficiently similar to <replaceable>word</>, sorted from best
+   match to worst.  The index will be used to make this a fast operation
+   even over very large data sets.
+  </para>
+
+  <para>
+   A variant of the above query is
+<programlisting>
+SELECT t, t &lt;-&gt;&gt; '<replaceable>word</>' AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
+</programlisting>
+   This can be implemented quite efficiently by GiST indexes, but not
+   by GIN indexes.
+  </para>
+
+
   <para>
    Beginning in <productname>PostgreSQL</> 9.1, these index types also support
    index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
-- 
cgit v1.2.3