summary | refs | log | tree | commit | diff
path: root/contrib/unaccent/generate_unaccent_rules.py
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2017-08-16 16:51:56 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2017-08-16 16:51:56 -0400
commit: ec0a69e49bf41a37b5c2d6f6be66d8abae00ee05 (patch)
tree: 6da8c1a2d4d06d9c2e4af7d9a217101f4189a6ce /contrib/unaccent/generate_unaccent_rules.py
parent: 2b74303637edc09cf692fbfab3fd93a5e47ccabf (diff)
Extend the default rules file for contrib/unaccent with Vietnamese letters.

Improve generate_unaccent_rules.py to handle composed characters whose base is another composed character rather than a plain letter. The net effect of this is to add a bunch of multi-accented Vietnamese characters to unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com

Diffstat (limited to 'contrib/unaccent/generate_unaccent_rules.py')
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py 39
1 files changed, 31 insertions, 8 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index a5eb42f0b18..4b1b011861f 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -48,24 +48,47 @@ def is_mark(codepoint):
return codepoint.general_category in ("Mn", "Me", "Mc")
def is_letter_with_marks(codepoint, table):
- """Returns true for plain letters combined with one or more marks."""
+ """Returns true for letters combined with one or more marks."""
# See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
- return len(codepoint.combining_ids) > 1 and \
- is_plain_letter(table[codepoint.combining_ids[0]]) and \
- all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+ # Letter may have no combining characters, in which case it has
+ # no marks.
+ if len(codepoint.combining_ids) == 1:
+ return False
+
+ # A letter without diacritical marks has none of them.
+ if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
+ return False
+
+ # Check if the base letter of this letter has marks.
+ codepoint_base = codepoint.combining_ids[0]
+ if (is_plain_letter(table[codepoint_base]) is False and \
+ is_letter_with_marks(table[codepoint_base], table) is False):
+ return False
+
+ return True
def is_letter(codepoint, table):
"""Return true for letter with or without diacritical marks."""
return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
def get_plain_letter(codepoint, table):
- """Return the base codepoint without marks."""
+ """Return the base codepoint without marks. If this codepoint has more
+ than one combining character, do a recursive lookup on the table to
+ find out its plain base letter."""
if is_letter_with_marks(codepoint, table):
- return table[codepoint.combining_ids[0]]
+ if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+ return get_plain_letter(table[codepoint.combining_ids[0]], table)
+ elif is_plain_letter(table[codepoint.combining_ids[0]]):
+ return table[codepoint.combining_ids[0]]
+
+ # Should not come here
+ assert(False)
elif is_plain_letter(codepoint):
return codepoint
- else:
- raise "mu"
+
+ # Should not come here
+ assert(False)
def is_ligature(codepoint, table):
"""Return true for letters combined with letters."""