Fix bugs in contrib/pg_trgm's LIKE pattern analysis code.

Extraction of trigrams did not process LIKE escape sequences properly, so trigrams near escapes could be misidentified, which in turn produced incorrect index search results.

Fujii Masao
author: Tom Lane <tgl@sss.pgh.pa.us> 2012-08-20 13:24:58 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2012-08-20 13:24:58 -0400
commit: e0badf67e9cd409ea35f2c2d5e3ca36ffecb47d7 (patch)
tree: 98e16ba2f65c60b349289d02f6f4e91e6625c2c3 /contrib
parent: 33f40976a716287ecddfb01f446797d8df215125 (diff)
3 files changed, 39 insertions, 19 deletions
diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out
index e7af7d48902..81d0ca80b20 100644
--- a/contrib/pg_trgm/expected/pg_trgm.out
+++ b/contrib/pg_trgm/expected/pg_trgm.out
@@ -3497,6 +3497,12 @@ select * from test2 where t like '%bcd%';
  abcdef
 (1 row)
 
+select * from test2 where t like E'%\\bcd%';
+   t    
+--------
+ abcdef
+(1 row)
+
 select * from test2 where t ilike '%BCD%';
    t    
 --------
@@ -3539,6 +3545,12 @@ select * from test2 where t like '%bcd%';
  abcdef
 (1 row)
 
+select * from test2 where t like E'%\\bcd%';
+   t    
+--------
+ abcdef
+(1 row)
+
 select * from test2 where t ilike '%BCD%';
    t    
 --------
diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql
index ea902f602f9..81ab1e79b17 100644
--- a/contrib/pg_trgm/sql/pg_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_trgm.sql
@@ -49,6 +49,7 @@ explain (costs off)
   select * from test2 where t ilike '%BCD%';
 select * from test2 where t like '%BCD%';
 select * from test2 where t like '%bcd%';
+select * from test2 where t like E'%\\bcd%';
 select * from test2 where t ilike '%BCD%';
 select * from test2 where t ilike 'qua%';
 drop index test2_idx_gin;
@@ -60,5 +61,6 @@ explain (costs off)
   select * from test2 where t ilike '%BCD%';
 select * from test2 where t like '%BCD%';
 select * from test2 where t like '%bcd%';
+select * from test2 where t like E'%\\bcd%';
 select * from test2 where t ilike '%BCD%';
 select * from test2 where t ilike 'qua%';
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 4e32c6f654c..87dffd1dd2c 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -272,33 +272,36 @@ get_wildcard_part(const char *str, int lenstr,
 	const char *beginword = str;
 	const char *endword;
 	char	   *s = buf;
-	bool		in_wildcard_meta = false;
+	bool		in_leading_wildcard_meta = false;
+	bool		in_trailing_wildcard_meta = false;
 	bool		in_escape = false;
 	int			clen;
 
 	/*
-	 * Find the first word character remembering whether last character was
-	 * wildcard meta-character.
+	 * Find the first word character, remembering whether preceding character
+	 * was wildcard meta-character.  Note that the in_escape state persists
+	 * from this loop to the next one, since we may exit at a word character
+	 * that is in_escape.
 	 */
 	while (beginword - str < lenstr)
 	{
 		if (in_escape)
 		{
-			in_escape = false;
-			in_wildcard_meta = false;
 			if (iswordchr(beginword))
 				break;
+			in_escape = false;
+			in_leading_wildcard_meta = false;
 		}
 		else
 		{
 			if (ISESCAPECHAR(beginword))
 				in_escape = true;
 			else if (ISWILDCARDCHAR(beginword))
-				in_wildcard_meta = true;
+				in_leading_wildcard_meta = true;
 			else if (iswordchr(beginword))
 				break;
 			else
-				in_wildcard_meta = false;
+				in_leading_wildcard_meta = false;
 		}
 		beginword += pg_mblen(beginword);
 	}
@@ -310,11 +313,11 @@ get_wildcard_part(const char *str, int lenstr,
 		return NULL;
 
 	/*
-	 * Add left padding spaces if last character wasn't wildcard
+	 * Add left padding spaces if preceding character wasn't wildcard
 	 * meta-character.
 	 */
 	*charlen = 0;
-	if (!in_wildcard_meta)
+	if (!in_leading_wildcard_meta)
 	{
 		if (LPADDING > 0)
 		{
@@ -333,15 +336,11 @@ get_wildcard_part(const char *str, int lenstr,
 	 * string boundary.  Strip escapes during copy.
 	 */
 	endword = beginword;
-	in_wildcard_meta = false;
-	in_escape = false;
 	while (endword - str < lenstr)
 	{
 		clen = pg_mblen(endword);
 		if (in_escape)
 		{
-			in_escape = false;
-			in_wildcard_meta = false;
 			if (iswordchr(endword))
 			{
 				memcpy(s, endword, clen);
@@ -349,7 +348,17 @@ get_wildcard_part(const char *str, int lenstr,
 				s += clen;
 			}
 			else
+			{
+				/*
+				 * Back up endword to the escape character when stopping at
+				 * an escaped char, so that subsequent get_wildcard_part will
+				 * restart from the escape character.  We assume here that
+				 * escape chars are single-byte.
+				 */
+				endword--;
 				break;
+			}
+			in_escape = false;
 		}
 		else
 		{
@@ -357,7 +366,7 @@ get_wildcard_part(const char *str, int lenstr,
 				in_escape = true;
 			else if (ISWILDCARDCHAR(endword))
 			{
-				in_wildcard_meta = true;
+				in_trailing_wildcard_meta = true;
 				break;
 			}
 			else if (iswordchr(endword))
@@ -367,19 +376,16 @@ get_wildcard_part(const char *str, int lenstr,
 				s += clen;
 			}
 			else
-			{
-				in_wildcard_meta = false;
 				break;
-			}
 		}
 		endword += clen;
 	}
 
 	/*
-	 * Add right padding spaces if last character wasn't wildcard
+	 * Add right padding spaces if next character isn't wildcard
 	 * meta-character.
 	 */
-	if (!in_wildcard_meta)
+	if (!in_trailing_wildcard_meta)
 	{
 		if (RPADDING > 0)
 		{
author	Tom Lane <tgl@sss.pgh.pa.us>	2012-08-20 13:24:58 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2012-08-20 13:24:58 -0400
commit	e0badf67e9cd409ea35f2c2d5e3ca36ffecb47d7 (patch)
tree	98e16ba2f65c60b349289d02f6f4e91e6625c2c3 /contrib
parent	33f40976a716287ecddfb01f446797d8df215125 (diff)