summaryrefslogtreecommitdiff
path: root/contrib/unaccent/generate_unaccent_rules.py
diff options
context:
space:
mode:
author Michael Paquier <michael@paquier.xyz> 2022-07-05 16:17:51 +0900
committer Michael Paquier <michael@paquier.xyz> 2022-07-05 16:17:51 +0900
commit e3dd7c06e62774628e102c3cd47ee46e85519de7 (patch)
tree b729dea00a5341c81fe42bdb329109da7883b58c /contrib/unaccent/generate_unaccent_rules.py
parent 84ad713cf85aeffee5dd39f62d49a1b9e34632da (diff)
Slightly simplify the special rules used for generating unaccent.rules
As noted by Thomas Munro, CLDR 36 has added SOUND RECORDING COPYRIGHT (U+2117), and we use CLDR 41, so this can be removed from the set of special cases. The set of regression tests is expanded for degree signs, which are two of the special cases, and for a fancy case with U+210C in Latin-ASCII.xml that we discovered while looking into what could be done for Cyrillic characters (this last part is material for a future patch, not tackled yet). While at it, some of the assertions of generate_unaccent_rules.py are expanded to report the codepoint on which a failure is found, something useful for debugging. Extracted from a larger patch by the same author. Author: Przemysław Sztoch. Discussion: https://postgr.es/m/8478da0d-3b61-d24f-80b4-ce2f5e971c60@sztoch.pl
Diffstat (limited to 'contrib/unaccent/generate_unaccent_rules.py')
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py 5
1 files changed, 2 insertions, 3 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index c405e231b39..b4b4c38bebe 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -134,12 +134,12 @@ def get_plain_letter(codepoint, table):
return table[codepoint.combining_ids[0]]
# Should not come here
- assert(False)
+ assert False, 'Codepoint U+%0.2X' % codepoint.id
elif is_plain_letter(codepoint):
return codepoint
# Should not come here
- assert(False)
+ assert False, 'Codepoint U+%0.2X' % codepoint.id
def is_ligature(codepoint, table):
@@ -212,7 +212,6 @@ def special_cases():
# Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS
charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT
- charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
return charactersSet