diff options
author | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2017-03-13 20:46:39 +0200 |
---|---|---|
committer | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2017-03-13 20:46:39 +0200 |
commit | aeed17d00037950a16cc5ebad5b5592e5fa1ad0f (patch) | |
tree | 070aac060c2f923b5c636afbab51272bd2d04056 /src/backend/utils/mb/conv.c | |
parent | 84892692fdedb753cfdd9a63b318b47ec640915f (diff) |
Use radix tree for character encoding conversions.
Replace the mapping tables used to convert between UTF-8 and other
character encodings with new radix tree-based maps. Looking up an entry in
a radix tree is much faster than a binary search in the old maps. As a
bonus, the radix tree representation is also more compact, making the
binaries slightly smaller.
The "combined" maps work the same as before, with binary search. They are
much smaller than the main tables, so it doesn't matter so much. However,
the "combined" maps are now stored in the same .map files as the main
tables. This seems clearer, since they're always used together, and
generated from the same source files.
Patch by Kyotaro Horiguchi, with a lot of hacking by me at various stages.
Reviewed by Michael Paquier and Daniel Gustafsson.
Discussion: https://www.postgresql.org/message-id/20170306.171609.204324917.horiguchi.kyotaro%40lab.ntt.co.jp
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- | src/backend/utils/mb/conv.c | 251 |
1 files changed, 173 insertions, 78 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index 9014a5727cb..5ce5c9a9c25 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -284,36 +284,6 @@ mic2latin_with_table(const unsigned char *mic, /* * comparison routine for bsearch() - * this routine is intended for UTF8 -> local code - */ -static int -compare1(const void *p1, const void *p2) -{ - uint32 v1, - v2; - - v1 = *(const uint32 *) p1; - v2 = ((const pg_utf_to_local *) p2)->utf; - return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); -} - -/* - * comparison routine for bsearch() - * this routine is intended for local code -> UTF8 - */ -static int -compare2(const void *p1, const void *p2) -{ - uint32 v1, - v2; - - v1 = *(const uint32 *) p1; - v2 = ((const pg_local_to_utf *) p2)->code; - return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); -} - -/* - * comparison routine for bsearch() * this routine is intended for combined UTF8 -> local code */ static int @@ -364,6 +334,121 @@ store_coded_char(unsigned char *dest, uint32 code) } /* + * Convert a character using a conversion radix tree. + * + * 'l' is the length of the input character in bytes, and b1-b4 are + * the input character's bytes. 
+ */ +static inline uint32 +pg_mb_radix_conv(const pg_mb_radix_tree *rt, + int l, + unsigned char b1, + unsigned char b2, + unsigned char b3, + unsigned char b4) +{ + if (l == 4) + { + /* 4-byte code */ + + /* check code validity */ + if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || + b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || + b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || + b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) + return 0; + + /* perform lookup */ + if (rt->chars32) + { + uint32 idx = rt->b4root; + + idx = rt->chars32[b1 + idx - rt->b4_1_lower]; + idx = rt->chars32[b2 + idx - rt->b4_2_lower]; + idx = rt->chars32[b3 + idx - rt->b4_3_lower]; + return rt->chars32[b4 + idx - rt->b4_4_lower]; + } + else + { + uint16 idx = rt->b4root; + + idx = rt->chars16[b1 + idx - rt->b4_1_lower]; + idx = rt->chars16[b2 + idx - rt->b4_2_lower]; + idx = rt->chars16[b3 + idx - rt->b4_3_lower]; + return rt->chars16[b4 + idx - rt->b4_4_lower]; + } + } + else if (l == 3) + { + /* 3-byte code */ + + /* check code validity */ + if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || + b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || + b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) + return 0; + + /* perform lookup */ + if (rt->chars32) + { + uint32 idx = rt->b3root; + + idx = rt->chars32[b2 + idx - rt->b3_1_lower]; + idx = rt->chars32[b3 + idx - rt->b3_2_lower]; + return rt->chars32[b4 + idx - rt->b3_3_lower]; + } + else + { + uint16 idx = rt->b3root; + + idx = rt->chars16[b2 + idx - rt->b3_1_lower]; + idx = rt->chars16[b3 + idx - rt->b3_2_lower]; + return rt->chars16[b4 + idx - rt->b3_3_lower]; + } + } + else if (l == 2) + { + /* 2-byte code */ + + /* check code validity - first byte */ + if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || + b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) + return 0; + + /* perform lookup */ + if (rt->chars32) + { + uint32 idx = rt->b2root; + + idx = rt->chars32[b3 + idx - rt->b2_1_lower]; + return rt->chars32[b4 + idx - rt->b2_2_lower]; + } + else + { + uint16 
idx = rt->b2root; + + idx = rt->chars16[b3 + idx - rt->b2_1_lower]; + return rt->chars16[b4 + idx - rt->b2_2_lower]; + } + } + else if (l == 1) + { + /* 1-byte code */ + + /* check code validity - first byte */ + if (b4 < rt->b1_lower || b4 > rt->b1_upper) + return 0; + + /* perform lookup */ + if (rt->chars32) + return rt->chars32[b4 + rt->b1root - rt->b1_lower]; + else + return rt->chars16[b4 + rt->b1root - rt->b1_lower]; + } + return 0; /* shouldn't happen */ +} + +/* * UTF8 ---> local code * * utf: input string in UTF8 encoding (need not be null-terminated) @@ -371,7 +456,6 @@ store_coded_char(unsigned char *dest, uint32 code) * iso: pointer to the output area (must be large enough!) (output string will be null-terminated) * map: conversion map for single characters - * mapsize: number of entries in the conversion map * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters @@ -389,14 +473,13 @@ store_coded_char(unsigned char *dest, uint32 code) void UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, - const pg_utf_to_local *map, int mapsize, + const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding) { uint32 iutf; int l; - const pg_utf_to_local *p; const pg_utf_to_local_combined *cp; if (!PG_VALID_ENCODING(encoding)) @@ -406,6 +489,11 @@ UtfToLocal(const unsigned char *utf, int len, for (; len > 0; len -= l) { + unsigned char b1 = 0; + unsigned char b2 = 0; + unsigned char b3 = 0; + unsigned char b4 = 0; + /* "break" cases all represent errors */ if (*utf == '\0') break; @@ -427,27 +515,28 @@ UtfToLocal(const unsigned char *utf, int len, /* collect coded char of length l */ if (l == 2) { - iutf = *utf++ << 8; - iutf |= *utf++; + b3 = *utf++; + b4 = *utf++; } else if (l == 3) { - iutf = *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; + b2 = *utf++; + b3 = *utf++; + b4 = 
*utf++; } else if (l == 4) { - iutf = *utf++ << 24; - iutf |= *utf++ << 16; - iutf |= *utf++ << 8; - iutf |= *utf++; + b1 = *utf++; + b2 = *utf++; + b3 = *utf++; + b4 = *utf++; } else { elog(ERROR, "unsupported character length %d", l); iutf = 0; /* keep compiler quiet */ } + iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); /* First, try with combined map if possible */ if (cmap && len > l) @@ -516,13 +605,14 @@ UtfToLocal(const unsigned char *utf, int len, } /* Now check ordinary map */ - p = bsearch(&iutf, map, mapsize, - sizeof(pg_utf_to_local), compare1); - - if (p) + if (map) { - iso = store_coded_char(iso, p->code); - continue; + uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); + if (converted) + { + iso = store_coded_char(iso, converted); + continue; + } } /* if there's a conversion function, try that */ @@ -557,7 +647,6 @@ UtfToLocal(const unsigned char *utf, int len, * utf: pointer to the output area (must be large enough!) (output string will be null-terminated) * map: conversion map for single characters - * mapsize: number of entries in the conversion map * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters @@ -575,14 +664,13 @@ UtfToLocal(const unsigned char *utf, int len, void LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, - const pg_local_to_utf *map, int mapsize, + const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding) { uint32 iiso; int l; - const pg_local_to_utf *p; const pg_local_to_utf_combined *cp; if (!PG_VALID_ENCODING(encoding)) @@ -592,6 +680,11 @@ LocalToUtf(const unsigned char *iso, int len, for (; len > 0; len -= l) { + unsigned char b1 = 0; + unsigned char b2 = 0; + unsigned char b3 = 0; + unsigned char b4 = 0; + /* "break" cases all represent errors */ if (*iso == '\0') break; @@ -610,53 +703,55 @@ LocalToUtf(const unsigned char 
*iso, int len, /* collect coded char of length l */ if (l == 1) - iiso = *iso++; + b4 = *iso++; else if (l == 2) { - iiso = *iso++ << 8; - iiso |= *iso++; + b3 = *iso++; + b4 = *iso++; } else if (l == 3) { - iiso = *iso++ << 16; - iiso |= *iso++ << 8; - iiso |= *iso++; + b2 = *iso++; + b3 = *iso++; + b4 = *iso++; } else if (l == 4) { - iiso = *iso++ << 24; - iiso |= *iso++ << 16; - iiso |= *iso++ << 8; - iiso |= *iso++; + b1 = *iso++; + b2 = *iso++; + b3 = *iso++; + b4 = *iso++; } else { elog(ERROR, "unsupported character length %d", l); iiso = 0; /* keep compiler quiet */ } + iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4); - /* First check ordinary map */ - p = bsearch(&iiso, map, mapsize, - sizeof(pg_local_to_utf), compare2); - - if (p) + if (map) { - utf = store_coded_char(utf, p->utf); - continue; - } + uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); - /* If there's a combined character map, try that */ - if (cmap) - { - cp = bsearch(&iiso, cmap, cmapsize, - sizeof(pg_local_to_utf_combined), compare4); - - if (cp) + if (converted) { - utf = store_coded_char(utf, cp->utf1); - utf = store_coded_char(utf, cp->utf2); + utf = store_coded_char(utf, converted); continue; } + + /* If there's a combined character map, try that */ + if (cmap) + { + cp = bsearch(&iiso, cmap, cmapsize, + sizeof(pg_local_to_utf_combined), compare4); + + if (cp) + { + utf = store_coded_char(utf, cp->utf1); + utf = store_coded_char(utf, cp->utf2); + continue; + } + } } /* if there's a conversion function, try that */ |