summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jeff Epler <jepler@gmail.com> 2022-09-05 07:58:04 -0500
committer: Damien George <damien@micropython.org> 2022-09-06 17:08:18 +1000
commit: e90b85cc98a24003f2d673bab2c255ab3dce66e7 (patch)
tree: 9403eb1246766f0d081bb8a78a2515bc1f6f567a
parent: 719dbbf5639cdeff99bf629c45d66b18007e9958 (diff)
extmod/modure: Convert byte offsets to unicode indices when necessary.
And add a test.

Fixes issue #9202.

Signed-off-by: Jeff Epler <jepler@gmail.com>
-rw-r--r-- extmod/modure.c 16
-rw-r--r-- tests/unicode/unicode_ure.py 32
2 files changed, 48 insertions, 0 deletions
diff --git a/extmod/modure.c b/extmod/modure.c
index 799fef13b..a674d6649 100644
--- a/extmod/modure.c
+++ b/extmod/modure.c
@@ -33,6 +33,10 @@
#include "py/objstr.h"
#include "py/stackctrl.h"
+#if MICROPY_PY_BUILTINS_STR_UNICODE
+#include "py/unicode.h"
+#endif
+
#if MICROPY_PY_URE
#define re1_5_stack_chk() MP_STACK_CHECK()
@@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
e = self->caps[no * 2 + 1] - begin;
}
+ #if MICROPY_PY_BUILTINS_STR_UNICODE
+ if (mp_obj_get_type(self->str) == &mp_type_str) {
+ const byte *begin = (const byte *)mp_obj_str_get_str(self->str);
+ if (s != -1) {
+ s = utf8_ptr_to_index(begin, begin + s);
+ }
+ if (e != -1) {
+ e = utf8_ptr_to_index(begin, begin + e);
+ }
+ }
+ #endif
+
span[0] = mp_obj_new_int(s);
span[1] = mp_obj_new_int(e);
}
diff --git a/tests/unicode/unicode_ure.py b/tests/unicode/unicode_ure.py
new file mode 100644
index 000000000..5a5dc6005
--- /dev/null
+++ b/tests/unicode/unicode_ure.py
@@ -0,0 +1,32 @@
+# test match.span() for unicode strings
+
+try:
+ import ure as re
+except ImportError:
+ try:
+ import re
+ except ImportError:
+ print("SKIP")
+ raise SystemExit
+
+try:
+ m = re.match(".", "a")
+ m.span
+except AttributeError:
+ print("SKIP")
+ raise SystemExit
+
+
+def print_spans(match):
+ print("----")
+ try:
+ i = 0
+ while True:
+ print(match.span(i), match.start(i), match.end(i))
+ i += 1
+ except IndexError:
+ pass
+
+
+m = re.match(r"([0-9]*)(([a-z]*)([0-9]*))", "1234\u2764567")
+print_spans(m)