diff options
| -rw-r--r-- | contrib/unaccent/unaccent.c | 86 | 
1 files changed, 67 insertions, 19 deletions
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index 8e012ac1725..396ea24700e 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -93,35 +93,83 @@ initSuffixTree(char *filename)  	do  	{ -		char		src[4096]; -		char		trg[4096]; -		int			srclen; -		int			trglen; -		char	   *line = NULL; - +		/* +		 * pg_do_encoding_conversion() (called by tsearch_readline()) will +		 * emit exception if it finds untranslatable characters in current +		 * locale. We just skip such lines, continuing with the next. +		 */  		skip = true;  		PG_TRY();  		{ -			/* -			 * pg_do_encoding_conversion() (called by tsearch_readline()) will -			 * emit exception if it finds untranslatable characters in current -			 * locale. We just skip such characters. -			 */ +			char	   *line; +  			while ((line = tsearch_readline(&trst)) != NULL)  			{ -				if (sscanf(line, "%s\t%s\n", src, trg) != 2) -					continue; +				/* +				 * The format of each line must be "src trg" where src and trg +				 * are sequences of one or more non-whitespace characters, +				 * separated by whitespace.  Whitespace at start or end of +				 * line is ignored. +				 */ +				int			state; +				char	   *ptr; +				char	   *src = NULL; +				char	   *trg = NULL; +				int			ptrlen; +				int			srclen = 0; +				int			trglen = 0; + +				state = 0; +				for (ptr = line; *ptr; ptr += ptrlen) +				{ +					ptrlen = pg_mblen(ptr); +					/* ignore whitespace, but end src or trg */ +					if (t_isspace(ptr)) +					{ +						if (state == 1) +							state = 2; +						else if (state == 3) +							state = 4; +						continue; +					} +					switch (state) +					{ +						case 0: +							/* start of src */ +							src = ptr; +							srclen = ptrlen; +							state = 1; +							break; +						case 1: +							/* continue src */ +							srclen += ptrlen; +							break; +						case 2: +							/* start of trg */ +							trg = ptr; +							trglen = ptrlen; +							state = 3; +							break; +						case 3: +							/* continue trg */ +							trglen += ptrlen; +							break; +						default: +							/* bogus line format */ +							state = -1; +							break; +					} +				} -				srclen = strlen(src); -				trglen = strlen(trg); +				if (state >= 3) +					rootSuffixTree = placeChar(rootSuffixTree, +											   (unsigned char *) src, srclen, +											   trg, trglen); -				rootSuffixTree = placeChar(rootSuffixTree, -										   (unsigned char *) src, srclen, -										   trg, trglen); -				skip = false;  				pfree(line);  			} +			skip = false;  		}  		PG_CATCH();  		{  | 
