diff options
Diffstat (limited to 'src/backend/utils/mb/wchar.c')
-rw-r--r-- | src/backend/utils/mb/wchar.c | 699 |
1 files changed, 556 insertions, 143 deletions
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 293cab42c37..4e40bc60f1d 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.47.2.1 2005/12/24 09:42:30 ishii Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.47.2.2 2006/05/21 20:05:48 tgl Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -96,7 +96,7 @@ static int pg_euc2wchar_with_len return (cnt); } -static int +static inline int pg_euc_mblen(const unsigned char *s) { int len; @@ -112,7 +112,7 @@ pg_euc_mblen(const unsigned char *s) return (len); } -static int +static inline int pg_euc_dsplen(const unsigned char *s) { int len; @@ -396,7 +396,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) } /* - * returns the byte length of a UTF8 word pointed to by s + * returns the byte length of a UTF8 character pointed to by s */ int pg_utf_mblen(const unsigned char *s) @@ -720,229 +720,642 @@ pg_gb18030_dsplen(const unsigned char *s) return (len); } +/* + *------------------------------------------------------------------- + * multibyte sequence validators + * + * These functions accept "s", a pointer to the first byte of a string, + * and "len", the remaining length of the string. If there is a validly + * encoded character beginning at *s, return its length in bytes; else + * return -1. + * + * The functions can assume that len > 0 and that *s != '\0', but they must + * test for and reject zeroes in any additional bytes of a multibyte character. + * + * Note that this definition allows the function for a single-byte + * encoding to be just "return 1". + *------------------------------------------------------------------- + */ -pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1}, /* 0; PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3}, /* 1; PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3}, /* 2; PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ - {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4}, /* 6; PG_UTF8 */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 11; PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 12; PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 13; PG_LATIN6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 14; PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 15; PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 16; PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 17; PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 18; PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 19; PG_WIN1258 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 20; PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 21; PG_KOI8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 22; PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 22; PG_WIN1252 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 23; PG_WIN866 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 24; ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 25; ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ - {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ -}; +static int +pg_ascii_verifier(const unsigned char *s, int len) +{ + return 1; +} -/* returns the byte length of a word for mule internal code */ -int -pg_mic_mblen(const unsigned char *mbstr) +#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) + +static int +pg_eucjp_verifier(const unsigned char *s, int len) { - return (pg_mule_mblen(mbstr)); + int l; + unsigned char c1, c2; + + c1 = *s++; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + l = 2; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xdf) + return -1; + break; + + case SS3: /* JIS X 0212 */ + l = 3; + if (l > len) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + + return l; } -/* - * Returns the byte length of a multibyte word. - */ -int -pg_encoding_mblen(int encoding, const char *mbstr) +static int +pg_euckr_verifier(const unsigned char *s, int len) { - Assert(PG_VALID_ENCODING(encoding)); + int l; + unsigned char c1, c2; - return ((encoding >= 0 && - encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr))); + c1 = *s++; + + if (IS_HIGHBIT_SET(c1)) + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + + return l; } -/* - * Returns the display length of a multibyte word. - */ -int -pg_encoding_dsplen(int encoding, const char *mbstr) +/* EUC-CN byte sequences are exactly same as EUC-KR */ +#define pg_euccn_verifier pg_euckr_verifier + +static int +pg_euctw_verifier(const unsigned char *s, int len) { - Assert(PG_VALID_ENCODING(encoding)); + int l; + unsigned char c1, c2; - return ((encoding >= 0 && - encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr))); + c1 = *s++; + + switch (c1) + { + case SS2: /* CNS 11643 Plane 1-7 */ + l = 4; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xa7) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + case SS3: /* unused */ + return -1; + + default: + if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */ + { + l = 2; + if (l > len) + return -1; + /* no further range check on c1? */ + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + return l; } -/* - * fetch maximum length of a char encoding - */ -int -pg_encoding_max_length(int encoding) +static int +pg_johab_verifier(const unsigned char *s, int len) { - Assert(PG_VALID_ENCODING(encoding)); + int l, mbl; + unsigned char c; - return pg_wchar_table[encoding].maxmblen; + l = mbl = pg_johab_mblen(s); + + if (len < l) + return -1; + + if (!IS_HIGHBIT_SET(*s)) + return mbl; + + while (--l > 0) + { + c = *++s; + if (!IS_EUC_RANGE_VALID(c)) + return -1; + } + return mbl; } -#ifndef FRONTEND +static int +pg_mule_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c; + + l = mbl = pg_mule_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + c = *++s; + if (!IS_HIGHBIT_SET(c)) + return -1; + } + return mbl; +} + +static int +pg_latin1_verifier(const unsigned char *s, int len) +{ + return 1; +} + +static int +pg_sjis_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c1, c2; + + l = mbl = pg_sjis_mblen(s); + + if (len < l) + return -1; + + if (l == 1) /* pg_sjis_mblen already verified it */ + return mbl; + + c1 = *s++; + c2 = *s; + if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) + return -1; + return mbl; +} + +static int +pg_big5_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_big5_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gbk_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gbk_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_uhc_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_uhc_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} +static int +pg_gb18030_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gb18030_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_utf8_verifier(const unsigned char *s, int len) +{ + int l = pg_utf_mblen(s); + + if (len < l) + return -1; + + if (!pg_utf8_islegal(s, l)) + return -1; + + return l; +} + +/* + * Check for validity of a single UTF-8 encoded character + * + * This directly implements the rules in RFC3629. The bizarre-looking + * restrictions on the second byte are meant to ensure that there isn't + * more than one encoding of a given Unicode character point; that is, + * you may not use a longer-than-necessary byte sequence with high order + * zero bits to represent a character that would fit in fewer bytes. + * To do otherwise is to create security hazards (eg, create an apparent + * non-ASCII character that decodes to plain ASCII). + * + * length is assumed to have been obtained by pg_utf_mblen(), and the + * caller must have checked that that many bytes are present in the buffer. + */ bool pg_utf8_islegal(const unsigned char *source, int length) { unsigned char a; - const unsigned char *srcptr = source + length; switch (length) { default: + /* reject lengths 5 and 6 for now */ return false; - /* Everything else falls through when "true"... */ case 4: - if ((a = (*--srcptr)) < 0x80 || a > 0xBF) + a = source[3]; + if (a < 0x80 || a > 0xBF) return false; + /* FALL THRU */ case 3: - if ((a = (*--srcptr)) < 0x80 || a > 0xBF) + a = source[2]; + if (a < 0x80 || a > 0xBF) return false; + /* FALL THRU */ case 2: - if ((a = (*--srcptr)) > 0xBF) - return false; + a = source[1]; switch (*source) { - /* no fall-through in this inner switch */ case 0xE0: - if (a < 0xA0) + if (a < 0xA0 || a > 0xBF) return false; break; case 0xED: - if (a > 0x9F) + if (a < 0x80 || a > 0x9F) return false; break; case 0xF0: - if (a < 0x90) + if (a < 0x90 || a > 0xBF) return false; break; case 0xF4: - if (a > 0x8F) + if (a < 0x80 || a > 0x8F) return false; break; default: - if (a < 0x80) + if (a < 0x80 || a > 0xBF) return false; + break; } - + /* FALL THRU */ case 1: - if (*source >= 0x80 && *source < 0xC2) + a = *source; + if (a >= 0x80 && a < 0xC2) + return false; + if (a > 0xF4) return false; + break; } - if (*source > 0xF4) - return false; return true; } +/* + *------------------------------------------------------------------- + * encoding info table + *------------------------------------------------------------------- + */ +pg_wchar_tbl pg_wchar_table[] = { + {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* 0; PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 1; PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3}, /* 2; PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* 3; PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3}, /* 4; PG_EUC_TW */ + {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* 5; PG_JOHAB */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* 6; PG_UTF8 */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 8; PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 9; PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 10; PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 11; PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 12; PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 13; PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 14; PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 15; PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 16; PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 17; PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 18; PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 19; PG_WIN1258 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 20; PG_WIN874 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 21; PG_KOI8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_WIN1252 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 23; PG_WIN866 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 24; ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 25; ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 26; ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 27; ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 28; PG_WIN1250 */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* 29; PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* 30; PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* 31; PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* 32; PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 33; PG_GB18030 */ +}; + +/* returns the byte length of a word for mule internal code */ +int +pg_mic_mblen(const unsigned char *mbstr) +{ + return pg_mule_mblen(mbstr); +} + +/* + * Returns the byte length of a multibyte character. + */ +int +pg_encoding_mblen(int encoding, const char *mbstr) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr))); +} + +/* + * Returns the display length of a multibyte character. + */ +int +pg_encoding_dsplen(int encoding, const char *mbstr) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr))); +} /* - * Verify mbstr to make sure that it has a valid character sequence. - * mbstr is not necessarily NULL terminated; length of mbstr is + * Verify the first multibyte character of the given string. + * Return its byte length if good, -1 if bad. (See comments above for + * full details of the mbverify API.) + */ +int +pg_encoding_verifymb(int encoding, const char *mbstr, int len) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) : + ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len))); +} + +/* + * fetch maximum length of a given encoding + */ +int +pg_encoding_max_length(int encoding) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return pg_wchar_table[encoding].maxmblen; +} + +#ifndef FRONTEND + +/* + * fetch maximum length of the encoding for the current database + */ +int +pg_database_encoding_max_length(void) +{ + return pg_wchar_table[GetDatabaseEncoding()].maxmblen; +} + +/* + * Verify mbstr to make sure that it is validly encoded in the current + * database encoding. Otherwise same as pg_verify_mbstr(). + */ +bool +pg_verifymbstr(const char *mbstr, int len, bool noError) +{ + return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError); +} + +/* + * Verify mbstr to make sure that it is validly encoded in the specified + * encoding. + * + * mbstr is not necessarily zero terminated; length of mbstr is * specified by len. * * If OK, return TRUE. If a problem is found, return FALSE when noError is * true; when noError is false, ereport() a descriptive message. */ bool -pg_verifymbstr(const char *mbstr, int len, bool noError) +pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) { - int l; - int i; - int encoding; + mbverifier mbverify; + + Assert(PG_VALID_ENCODING(encoding)); + + /* + * In single-byte encodings, we need only reject nulls (\0). + */ + if (pg_encoding_max_length(encoding) <= 1) + { + const char *nullpos = memchr(mbstr, 0, len); - /* we do not need any check in single-byte encodings */ - if (pg_database_encoding_max_length() <= 1) - return true; + if (nullpos == NULL) + return true; + if (noError) + return false; + report_invalid_encoding(encoding, nullpos, 1); + } - encoding = GetDatabaseEncoding(); + /* fetch function pointer just once */ + mbverify = pg_wchar_table[encoding].mbverify; - while (len > 0 && *mbstr) + while (len > 0) { - l = pg_mblen(mbstr); + int l; - /* special UTF-8 check */ - if (encoding == PG_UTF8) + /* fast path for ASCII-subset characters */ + if (!IS_HIGHBIT_SET(*mbstr)) { - if (!pg_utf8_islegal((const unsigned char *) mbstr, l)) + if (*mbstr != '\0') { - if (noError) - return false; - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid UTF-8 byte sequence detected near byte 0x%02x", - (unsigned char) *mbstr))); + mbstr++; + len--; + continue; } + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); } - else - { - for (i = 1; i < l; i++) - { - /* - * we expect that every multibyte char consists of bytes - * having the 8th bit set - */ - if (i >= len || (mbstr[i] & 0x80) == 0) - { - char buf[8 * 2 + 1]; - char *p = buf; - int j, - jlimit; - - if (noError) - return false; - - jlimit = Min(l, len); - jlimit = Min(jlimit, 8); /* prevent buffer overrun */ - for (j = 0; j < jlimit; j++) - p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + l = (*mbverify) ((const unsigned char *) mbstr, len); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"%s\": 0x%s", - GetDatabaseEncodingName(), buf))); - } - } + if (l < 0) + { + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); } - len -= l; + mbstr += l; + len -= l; } return true; } /* - * fetch maximum length of a char encoding for the current database + * report_invalid_encoding: complain about invalid multibyte character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. */ -int -pg_database_encoding_max_length(void) +void +report_invalid_encoding(int encoding, const char *mbstr, int len) { - return pg_wchar_table[GetDatabaseEncoding()].maxmblen; + int l = pg_encoding_mblen(encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte sequence for encoding \"%s\": 0x%s", + pg_enc2name_tbl[encoding].name, + buf))); +} + +/* + * report_untranslatable_char: complain about untranslatable character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. + */ +void +report_untranslatable_char(int src_encoding, int dest_encoding, + const char *mbstr, int len) +{ + int l = pg_encoding_mblen(src_encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), + errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"", + buf, + pg_enc2name_tbl[src_encoding].name, + pg_enc2name_tbl[dest_encoding].name))); } #endif |