diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2017-08-16 16:51:56 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2017-08-16 16:51:56 -0400 |
commit | ec0a69e49bf41a37b5c2d6f6be66d8abae00ee05 (patch) | |
tree | 6da8c1a2d4d06d9c2e4af7d9a217101f4189a6ce /contrib/unaccent/generate_unaccent_rules.py | |
parent | 2b74303637edc09cf692fbfab3fd93a5e47ccabf (diff) |
Extend the default rules file for contrib/unaccent with Vietnamese letters.
Improve generate_unaccent_rules.py to handle composed characters whose base
is another composed character rather than a plain letter. The net effect
of this is to add a bunch of multi-accented Vietnamese characters to
unaccent.rules.
Original complaint from Kha Nguyen, diagnosis of the script's shortcoming
by Thomas Munro.
Dang Minh Huong and Michael Paquier
Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
Diffstat (limited to 'contrib/unaccent/generate_unaccent_rules.py')
-rw-r--r-- | contrib/unaccent/generate_unaccent_rules.py | 39 |
1 files changed, 31 insertions, 8 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index a5eb42f0b18..4b1b011861f 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -48,24 +48,47 @@ def is_mark(codepoint): return codepoint.general_category in ("Mn", "Me", "Mc") def is_letter_with_marks(codepoint, table): - """Returns true for plain letters combined with one or more marks.""" + """Returns true for letters combined with one or more marks.""" # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values - return len(codepoint.combining_ids) > 1 and \ - is_plain_letter(table[codepoint.combining_ids[0]]) and \ - all(is_mark(table[i]) for i in codepoint.combining_ids[1:]) + + # Letter may have no combining characters, in which case it has + # no marks. + if len(codepoint.combining_ids) == 1: + return False + + # A letter without diacritical marks has none of them. + if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False: + return False + + # Check if the base letter of this letter has marks. + codepoint_base = codepoint.combining_ids[0] + if (is_plain_letter(table[codepoint_base]) is False and \ + is_letter_with_marks(table[codepoint_base], table) is False): + return False + + return True def is_letter(codepoint, table): """Return true for letter with or without diacritical marks.""" return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table) def get_plain_letter(codepoint, table): - """Return the base codepoint without marks.""" + """Return the base codepoint without marks. If this codepoint has more + than one combining character, do a recursive lookup on the table to + find out its plain base letter.""" if is_letter_with_marks(codepoint, table): - return table[codepoint.combining_ids[0]] + if len(table[codepoint.combining_ids[0]].combining_ids) > 1: + return get_plain_letter(table[codepoint.combining_ids[0]], table) + elif is_plain_letter(table[codepoint.combining_ids[0]]): + return table[codepoint.combining_ids[0]] + + # Should not come here + assert(False) elif is_plain_letter(codepoint): return codepoint - else: - raise "mu" + + # Should not come here + assert(False) def is_ligature(codepoint, table): """Return true for letters combined with letters.""" |