diff options
Diffstat (limited to 'src/common/unicode/generate-unicode_case_table.pl')
-rw-r--r-- | src/common/unicode/generate-unicode_case_table.pl | 161 |
1 files changed, 157 insertions, 4 deletions
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl index 61ac3c7d8e6..957ec14168c 100644 --- a/src/common/unicode/generate-unicode_case_table.pl +++ b/src/common/unicode/generate-unicode_case_table.pl @@ -3,7 +3,7 @@ # Generate Unicode character case mappings. Does not include tailoring # or locale-specific mappings. # -# Input: UnicodeData.txt +# Input: SpecialCasing.txt UnicodeData.txt # Output: unicode_case_table.h # # Copyright (c) 2000-2025, PostgreSQL Global Development Group @@ -21,6 +21,10 @@ GetOptions('outdir:s' => \$output_path); my $output_table_file = "$output_path/unicode_case_table.h"; +# The maximum number of codepoints that can result from case mapping +# of a single character. See Unicode section 5.18 "Case Mappings". +my $MAX_CASE_EXPANSION = 3; + my $FH; my %simple = (); @@ -51,6 +55,98 @@ while (my $line = <$FH>) } close $FH; +# Map for special casing rules that aren't represented in the simple +# mapping. Language-sensitive mappings are not supported. +# +# See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the +# SpecialCasing.txt file itself for details. + +# for now, only Final_Sigma is supported +my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA'); + +my %special = (); +open($FH, '<', "$output_path/SpecialCasing.txt") + or die "Could not open $output_path/SpecialCasing.txt: $!."; +while (my $line = <$FH>) +{ + # language-sensitive mappings not supported + last if $line =~ /\# Language-Sensitive Mappings/; + + # remove comments + $line =~ s/^(.*?)#.*$/$1/s; + + # ignore empty lines + next unless $line =~ /;/; + + my @elts = split /;/, $line; + my $code = hex($elts[0]); + + # Codepoint may map to multiple characters when converting + # case. Split each mapping on whitespace and extract the + # hexadecimal into an array of codepoints. + my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1])); + my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2])); + my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3])); + my @conditions = map { + # supporting negated conditions may require storing a + # mask of relevant conditions for a given rule to differentiate + # between lack of a condition and a negated condition + die "negated conditions not supported" if /^Not_/; + $condition_map{$_} || die "unrecognized condition: $_" + } (grep /\w+/, (split /\s+/, $elts[4])); + + my $cond_str = (join '|', @conditions) || '0'; + + # if empty, create a self-mapping + push @lower, $code if (scalar @lower == 0); + push @title, $code if (scalar @title == 0); + push @upper, $code if (scalar @upper == 0); + + # none should map to more than 3 codepoints + die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'" + if (scalar @lower) > $MAX_CASE_EXPANSION; + die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'" + if (scalar @title) > $MAX_CASE_EXPANSION; + die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'" + if (scalar @upper) > $MAX_CASE_EXPANSION; + + # pad arrays to a fixed length of 3 + while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 } + while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 } + while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 } + + # Characters with special mappings may not have simple mappings; + # ensure that an entry exists. + $simple{$code} ||= { + Simple_Lowercase => $code, + Simple_Titlecase => $code, + Simple_Uppercase => $code + }; + + # Multiple special case rules for a single codepoint could be + # supported by making several entries for each codepoint, and have + # the simple mapping point to the first entry. The caller could + # scan forward looking for an entry that matches the conditions, + # or fall back to the normal behavior. + die "multiple special case mappings not supported" + if defined $special{$code}; + + $special{$code} = { + Lowercase => \@lower, + Titlecase => \@title, + Uppercase => \@upper, + Conditions => $cond_str + }; +} +close $FH; + +# assign sequential array indexes to the special mappings +my $special_idx = 0; +foreach my $code (sort { $a <=> $b } (keys %special)) +{ + $special{$code}{Index} = $special_idx++; +} + # Start writing out the output files open my $OT, '>', $output_table_file or die "Could not open output file $output_table_file: $!\n"; @@ -63,6 +159,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple)) $num_simple++ unless $code < 0x80; } +my $num_special = scalar(keys %special) + 1; + print $OT <<"EOS"; /*------------------------------------------------------------------------- * @@ -86,6 +184,19 @@ print $OT <<"EOS"; #include "common/unicode_case.h" #include "mb/pg_wchar.h" +/* + * The maximum number of codepoints that can result from case mapping + * of a single character. See Unicode section 5.18 "Case Mappings". + */ +#define MAX_CASE_EXPANSION 3 + +/* + * Case mapping condition flags. For now, only Final_Sigma is supported. + * + * See Unicode Context Specification for Casing. + */ +#define PG_U_FINAL_SIGMA (1 << 0) + typedef enum { CaseLower = 0, @@ -97,10 +208,45 @@ typedef enum typedef struct { pg_wchar codepoint; /* Unicode codepoint */ + int16 conditions; + pg_wchar map[NCaseKind][MAX_CASE_EXPANSION]; +} pg_special_case; + +typedef struct +{ + pg_wchar codepoint; /* Unicode codepoint */ pg_wchar simplemap[NCaseKind]; + const pg_special_case *special_case; } pg_case_map; /* + * Special case mappings that aren't representable in the simple map. + * Entries are referenced from simple_case_map. + */ +static const pg_special_case special_case[$num_special] = +{ +EOS + +foreach my $code (sort { $a <=> $b } (keys %special)) +{ + die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION; + die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION; + die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION; + my $lower = join ", ", + (map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} }); + my $title = join ", ", + (map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} }); + my $upper = join ", ", + (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} }); + printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions}; + printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper; +} + +print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n"; +print $OT <<"EOS"; +}; + +/* * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup), * sparse for higher codepoints (requiring scan or binary search). */ @@ -114,8 +260,10 @@ for (my $code = 0; $code < 0x80; $code++) my $lc = ($simple{$code}{Simple_Lowercase} || $code); my $tc = ($simple{$code}{Simple_Titlecase} || $code); my $uc = ($simple{$code}{Simple_Uppercase} || $code); + die "unexpected special case for code $code" + if defined $special{$code}; printf $OT - "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n", + "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n", $code, $lc, $tc, $uc; } printf $OT "\n"; @@ -126,9 +274,14 @@ foreach my $code (sort { $a <=> $b } (keys %simple)) next unless $code >= 0x80; # already output above my $map = $simple{$code}; + my $special_case = "NULL"; + if (exists $special{$code}) + { + $special_case = sprintf "&special_case[%d]", $special{$code}{Index}; + } printf $OT - "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n", + "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n", $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase}, - $map->{Simple_Uppercase}; + $map->{Simple_Uppercase}, $special_case; } print $OT "};\n"; |