From aeed17d00037950a16cc5ebad5b5592e5fa1ad0f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 Mar 2017 20:46:39 +0200 Subject: Use radix tree for character encoding conversions. Replace the mapping tables used to convert between UTF-8 and other character encodings with new radix tree-based maps. Looking up an entry in a radix tree is much faster than a binary search in the old maps. As a bonus, the radix tree representation is also more compact, making the binaries slightly smaller. The "combined" maps work the same as before, with binary search. They are much smaller than the main tables, so it doesn't matter so much. However, the "combined" maps are now stored in the same .map files as the main tables. This seems more clear, since they're always used together, and generated from the same source files. Patch by Kyotaro Horiguchi, with lot of hacking by me at various stages. Reviewed by Michael Paquier and Daniel Gustafsson. Discussion: https://www.postgresql.org/message-id/20170306.171609.204324917.horiguchi.kyotaro%40lab.ntt.co.jp --- src/backend/utils/mb/Unicode/convutils.pm | 806 ++++++++++++++++++++++++------ 1 file changed, 656 insertions(+), 150 deletions(-) (limited to 'src/backend/utils/mb/Unicode/convutils.pm') diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm index eb3c602c32d..6bd84712b05 100644 --- a/src/backend/utils/mb/Unicode/convutils.pm +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -3,44 +3,27 @@ # # src/backend/utils/mb/Unicode/convutils.pm +package convutils; + use strict; -####################################################################### -# convert UCS-4 to UTF-8 -# -sub ucs2utf -{ - my ($ucs) = @_; - my $utf; +use Exporter 'import'; - if ($ucs <= 0x007f) - { - $utf = $ucs; - } - elsif ($ucs > 0x007f && $ucs <= 0x07ff) - { - $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); - } - elsif ($ucs > 0x07ff && $ucs <= 0xffff) - { - $utf = - ((($ucs >> 12) | 0xe0) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - else - { - $utf = - ((($ucs >> 18) | 0xf0) << 24) | - (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - return ($utf); -} +our @EXPORT = qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); + +# Constants used in the 'direction' field of the character maps +use constant { + NONE => 0, + TO_UNICODE => 1, + FROM_UNICODE => 2, + BOTH => 3 +}; ####################################################################### # read_source - common routine to read source file # # fname ; input file name +# sub read_source { my ($fname) = @_; @@ -70,7 +53,9 @@ sub read_source code => hex($1), ucs => hex($2), comment => $4, - direction => "both" + direction => BOTH, + f => $fname, + l => $. }; # Ignore pure ASCII mappings. PostgreSQL character conversion code @@ -85,20 +70,18 @@ sub read_source } ################################################################## -# print_tables : output mapping tables +# print_conversion_tables - output mapping tables # -# Arguments: -# charset - string name of the character set. -# table - mapping table (see format below) -# verbose - if 1, output comment on each line, -# if 2, also output source file name and number +# print_conversion_tables($this_script, $csname, \%charset) # +# this_script - the name of the *caller script* of this feature +# csname - character set name other than ucs +# charset - ref to character set array # +# Input character set array format: # -# Mapping table format: -# -# Mapping table is a list of hashes. Each hash has the following fields: -# direction - Direction: 'both', 'from_unicode' or 'to_unicode' +# Each element in the character set array is a hash. Each hash has the following fields: +# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether) # ucs - Unicode code point # ucs_second - Second Unicode code point, if this is a "combined" character. # code - Byte sequence in the "other" character set, as an integer @@ -106,180 +89,703 @@ sub read_source # f - Source filename # l - Line number in source file # +sub print_conversion_tables +{ + my ($this_script, $csname, $charset) = @_; + + print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, $charset); + print_conversion_tables_direction($this_script, $csname, TO_UNICODE, $charset); +} + +############################################################################# +# INTERNAL ROUTINES + +####################################################################### +# print_conversion_tables_direction - write the whole content of C source of radix tree +# +# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth) +# +# this_script - the name of the *caller script* of this feature +# csname - character set name other than ucs +# direction - desired direction, TO_UNICODE or FROM_UNICODE +# charset - ref to character set array # -sub print_tables +sub print_conversion_tables_direction { - my ($charset, $table, $verbose) = @_; + my ($this_script, $csname, $direction, $charset) = @_; - # Build an array with only the to-UTF8 direction mappings - my @to_unicode; - my @to_unicode_combined; - my @from_unicode; - my @from_unicode_combined; + my $fname; + my $tblname; + if ($direction == TO_UNICODE) + { + $fname = lc("${csname}_to_utf8.map"); + $tblname = lc("${csname}_to_unicode_tree"); - foreach my $i (@$table) + print "- Writing ${csname}=>UTF8 conversion table: $fname\n"; + } + else { - if (defined $i->{ucs_second}) + $fname = lc("utf8_to_${csname}.map"); + $tblname = lc("${csname}_from_unicode_tree"); + + print "- Writing UTF8=>${csname} conversion table: $fname\n"; + } + + open(my $out, '>', $fname) || die("cannot open $fname"); + + print $out "/* src/backend/utils/mb/Unicode/$fname */\n"; + print $out "/* This file is generated by $this_script */\n\n"; + + # Collect regular, non-combined, mappings, and create the radix tree from them. + my $charmap = &make_charmap($out, $charset, $direction, 0); + print_radix_table($out, $tblname, $charmap); + + # Collect combined characters, and create combined character table (if any) + my $charmap_combined = &make_charmap_combined($charset, $direction); + + if (scalar @{$charmap_combined} > 0) + { + if ($direction == TO_UNICODE) { - my $entry = {utf8 => ucs2utf($i->{ucs}), - utf8_second => ucs2utf($i->{ucs_second}), - code => $i->{code}, - comment => $i->{comment}, - f => $i->{f}, l => $i->{l}}; - if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") - { - push @to_unicode_combined, $entry; - } - if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") - { - push @from_unicode_combined, $entry; - } + print_to_utf8_combined_map($out, $csname, + $charmap_combined, 1); } else { - my $entry = {utf8 => ucs2utf($i->{ucs}), - code => $i->{code}, - comment => $i->{comment}, - f => $i->{f}, l => $i->{l}}; - if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") - { - push @to_unicode, $entry; - } - if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") - { - push @from_unicode, $entry; - } + print_from_utf8_combined_map($out, $csname, + $charmap_combined, 1); } } - print_to_utf8_map($charset, \@to_unicode, $verbose); - print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0); - print_from_utf8_map($charset, \@from_unicode, $verbose); - print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0); + close($out); } -sub print_from_utf8_map +sub print_from_utf8_combined_map { - my ($charset, $table, $verbose) = @_; + my ($out, $charset, $table, $verbose) = @_; my $last_comment = ""; - my $fname = lc("utf8_to_${charset}.map"); - print "- Writing UTF8=>${charset} conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local ULmap${charset}[ %d ] = {", - scalar(@$table)); + printf $out "\n/* Combined character map */\n"; + printf $out "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + scalar(@$table); my $first = 1; foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); - printf($out "\n {0x%04x, 0x%04x}", $i->{utf8}, $i->{code}); + printf $out "\n {0x%08x, 0x%08x, 0x%04x}", + $i->{utf8}, $i->{utf8_second}, $i->{code}; if ($verbose >= 2) { - $last_comment = "$i->{f}:$i->{l} $i->{comment}"; + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); } - else + elsif ($verbose >= 1) { $last_comment = $i->{comment}; } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; - close($out); } -sub print_from_utf8_combined_map +sub print_to_utf8_combined_map { - my ($charset, $table, $verbose) = @_; + my ($out, $charset, $table, $verbose) = @_; my $last_comment = ""; - my $fname = lc("utf8_to_${charset}_combined.map"); - print "- Writing UTF8=>${charset} conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". - "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", - scalar(@$table)); + printf $out "\n/* Combined character map */\n"; + printf $out "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + scalar(@$table); + my $first = 1; - foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table) + foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) { print($out ",") if (!$first); $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); + + printf $out "\n {0x%04x, 0x%08x, 0x%08x}", + $i->{code}, $i->{utf8}, $i->{utf8_second}; - printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code}); - $last_comment = "$i->{comment}"; + if ($verbose >= 2) + { + $last_comment = + sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); + } + elsif ($verbose >= 1) + { + $last_comment = $i->{comment}; + } } - print($out "\t/* $last_comment */") if ($verbose); + print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); print $out "\n};\n"; - close($out); } -sub print_to_utf8_map +####################################################################### +# print_radix_table(, , ) +# +# Input: A hash, mapping an input character to an output character. +# +# Constructs a radix tree from the hash, and prints it out as a C-struct. +# +sub print_radix_table { - my ($charset, $table, $verbose) = @_; - - my $last_comment = ""; + my ($out, $tblname, $c) = @_; + + ### + ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each + ### radix tree is represented as a nested hash, each hash indexed by + ### input byte + ### + my %b1map; + my %b2map; + my %b3map; + my %b4map; + foreach my $in (keys %$c) + { + my $out = $c->{$in}; - my $fname = lc("${charset}_to_utf8.map"); + if ($in < 0x100) + { + $b1map{$in} = $out; + } + elsif ($in < 0x10000) + { + my $b1 = $in >> 8; + my $b2 = $in & 0xff; - print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; - open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; - printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". - "static const pg_local_to_utf LUmap${charset}[ %d ] = {", - scalar(@$table)); - my $first = 1; - foreach my $i (sort {$a->{code} <=> $b->{code}} @$table) - { - print($out ",") if (!$first); - $first = 0; - print($out "\t/* $last_comment */") if ($verbose); + $b2map{$b1}{$b2} = $out; + } + elsif ($in < 0x1000000) + { + my $b1 = $in >> 16; + my $b2 = ($in >> 8) & 0xff; + my $b3 = $in & 0xff; - printf($out "\n {0x%04x, 0x%x}", $i->{code}, $i->{utf8}); - if ($verbose >= 2) + $b3map{$b1}{$b2}{$b3} = $out; + } + elsif ($in < 0x100000000) { - $last_comment = "$i->{f}:$i->{l} $i->{comment}"; + my $b1 = $in >> 24; + my $b2 = ($in >> 16) & 0xff; + my $b3 = ($in >> 8) & 0xff; + my $b4 = $in & 0xff; + + $b4map{$b1}{$b2}{$b3}{$b4} = $out; } else { - $last_comment = $i->{comment}; + die sprintf("up to 4 byte code is supported: %x", $in); } } - print($out "\t/* $last_comment */") if ($verbose); - print $out "\n};\n"; - close($out); + + my @segments; + + ### + ### Build a linear list of "segments", from the nested hashes. + ### + ### Each segment is a lookup table, keyed by the next byte in the input. + ### The segments are written out physically to one big array in the final + ### step, but logically, they form a radix tree. Or rather, four radix + ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte + ### inputs, and 4-byte inputs. + ### + ### Each segment is represented by a hash with following fields: + ### + ### comment => + ### label =>