diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/utils/mb/Unicode/Makefile | 5 | ||||
-rwxr-xr-x | src/backend/utils/mb/Unicode/UCS_to_GB18030.pl | 28 | ||||
-rw-r--r-- | src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c | 7 |
3 files changed, 29 insertions, 11 deletions
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index ad789b31e54..27424b2a001 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -54,7 +54,7 @@ $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml)) $(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT)) $(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT)) $(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT)) -$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.xml)) +$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.ucm)) $(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT)) $(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt)) $(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt)) @@ -78,6 +78,9 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt: gb-18030-2000.xml windows-949-2000.xml: $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F) +gb-18030-2000.ucm: + $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F) + GB2312.TXT: $(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt' diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index ddcbd6ef0c4..084fdf66af1 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -5,13 +5,14 @@ # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # # Generate UTF-8 <--> GB18030 code conversion tables from -# "gb-18030-2000.xml", obtained from -# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ +# "gb-18030-2000.ucm", obtained from +# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm # # The lines we care about in the source file look like -# <a u="009A" b="81 30 83 36"/> -# where the "u" field is the Unicode code point in hex, -# and the "b" field is the hex byte sequence for GB18030 +# <UXXXX> \xYY[\xYY...] |n +# where XXXX is the Unicode code point in hex, +# and the \xYY... is the hex byte sequence for GB18030, +# and n is a flag indicating the type of mapping. use strict; use warnings FATAL => 'all'; @@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_GB18030.pl'; # Read the input -my $in_file = "gb-18030-2000.xml"; +my $in_file = "gb-18030-2000.ucm"; open(my $in, '<', $in_file) || die("cannot open $in_file"); @@ -30,9 +31,18 @@ my @mapping; while (<$in>) { - next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); - my ($u, $c) = ($1, $2); - $c =~ s/ //g; + # Mappings may have been removed by commenting out + next if /^#/; + + next if !/^<U([0-9A-Fa-f]+)>\s+ + ((?:\\x[0-9A-Fa-f]{2})+)\s+ + \|(\d+)/x; + my ($u, $c, $flag) = ($1, $2, $3); + $c =~ s/\\x//g; + + # We only want round-trip mappings + next if ($flag ne '0'); + my $ucs = hex($u); my $code = hex($c); if ($code >= 0x80 && $ucs >= 0x0080) diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index ffc9c58cd13..a512df93577 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -124,7 +124,12 @@ utf8word_to_unicode(uint32 c) /* * Perform mapping of GB18030 ranges to UTF8 * - * The ranges we need to convert are specified in gb-18030-2000.xml. + * General description, and the range we need to convert for U+10000 and up: + * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/gb18030.html + * + * Ranges up to U+FFFF: + * https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/ranges.txt + * * All are ranges of 4-byte GB18030 codes. */ static uint32 |