Fix not-terribly-safe coding in NIImportOOAffixes() and NIImportAffixes().

There were two places in spell.c that supposed that they could search for a location in a string produced by lowerstr() and then transpose the offset into the original string. But this fails completely if lowerstr() transforms any characters into characters of different byte length, as can happen in Turkish UTF-8 for instance (there, the one-byte 'I' lowercases to the two-byte dotless 'ı', so every later byte offset shifts). We'd added some comments about this coding in commit 51e78ab4ff328296, but failed to realize that it was not merely confusing but wrong. Coverity complained about this code years ago, but in such an opaque fashion that nobody understood what it was on about. I'm not entirely sure that this issue *is* what it's on about, actually, but perhaps this patch will shut it up -- and in any case the problem is clear. Back-patch to all supported branches.
author: Tom Lane <tgl@sss.pgh.pa.us> 2016-03-06 19:21:03 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2016-03-06 19:21:03 -0500
commit: c2d61adc32fe0f91b735f85263ecdd82589dba80 (patch)
tree: cd3d58958e563088acdea47bc4da0865c9f9a5c4
parent: ababe02ac2fa814c4e076e2ff92cd84e4222e5c7 (diff)
1 files changed, 22 insertions, 13 deletions
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index 8c4d989474c..28fe987e103 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -169,6 +169,19 @@ findchar(char *str, int c)
 	return NULL;
 }
 
+static char *
+findchar2(char *str, int c1, int c2)
+{
+	while (*str)
+	{
+		if (t_iseq(str, c1) || t_iseq(str, c2))
+			return str;
+		str += pg_mblen(str);
+	}
+
+	return NULL;
+}
+
 
 /* backward string compare for suffix tree operations */
 static int
@@ -856,22 +869,20 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 
 			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
 				goto nextline;
-			prepl = lowerstr_ctx(Conf, repl);
-			/* Find position of '/' in lowercased string "prepl" */
-			if ((ptr = strchr(prepl, '/')) != NULL)
+			/* Get flags after '/' (flags are case sensitive) */
+			if ((ptr = strchr(repl, '/')) != NULL)
 			{
-				/*
-				 * Here we use non-lowercased string "repl". We need position
-				 * of '/' in "repl".
-				 */
-				*ptr = '\0';
-				ptr = repl + (ptr - prepl) + 1;
+				ptr++;
 				while (*ptr)
 				{
 					aflg |= Conf->flagval[*(unsigned char *) ptr];
 					ptr++;
 				}
 			}
+			/* Get lowercased version of string before '/' */
+			prepl = lowerstr_ctx(Conf, repl);
+			if ((ptr = strchr(prepl, '/')) != NULL)
+				*ptr = '\0';
 			pfind = lowerstr_ctx(Conf, find);
 			pmask = lowerstr_ctx(Conf, mask);
 			if (t_iseq(find, '0'))
@@ -939,12 +950,10 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 
 		if (STRNCMP(pstr, "compoundwords") == 0)
 		{
-			/* Find position in lowercased string "pstr" */
-			s = findchar(pstr, 'l');
+			/* Find case-insensitive L flag in non-lowercased string */
+			s = findchar2(recoded, 'l', 'L');
 			if (s)
 			{
-				/* Here we use non-lowercased string "recoded" */
-				s = recoded + (s - pstr);
 				while (*s && !t_isspace(s))
 					s += pg_mblen(s);
 				while (*s && t_isspace(s))
author	Tom Lane <tgl@sss.pgh.pa.us>	2016-03-06 19:21:03 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2016-03-06 19:21:03 -0500
commit	c2d61adc32fe0f91b735f85263ecdd82589dba80 (patch)
tree	cd3d58958e563088acdea47bc4da0865c9f9a5c4
parent	ababe02ac2fa814c4e076e2ff92cd84e4222e5c7 (diff)