summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Damien George <damien.p.george@gmail.com> 2018-05-24 13:08:51 +1000
committer: Damien George <damien.p.george@gmail.com> 2018-07-02 14:55:02 +1000
commit: e30a5fc7bcd27900e0657db97ed54fc056d8f852 (patch)
tree: 83f40bfb3a4e5b27bf89f0fdd3fbaf67f795c76b
parent: 1e9b871d295ff3c8ab6d9cd0fafa94c52271820a (diff)
extmod/modure: Add ure.sub() function and method, and tests.
This feature is controlled at compile time by MICROPY_PY_URE_SUB, disabled by default. Thanks to @dmazzella for the original patch for this feature; see #3770.
-rw-r--r-- extmod/modure.c 128
-rw-r--r-- py/mpconfig.h 4
-rw-r--r-- tests/extmod/ure_sub.py 61
-rw-r--r-- tests/extmod/ure_sub_unmatched.py 19
-rw-r--r-- tests/extmod/ure_sub_unmatched.py.exp 1
5 files changed, 213 insertions, 0 deletions
diff --git a/extmod/modure.c b/extmod/modure.c
index a536f907f..0d5330cb5 100644
--- a/extmod/modure.c
+++ b/extmod/modure.c
@@ -249,10 +249,127 @@ STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
+#if MICROPY_PY_URE_SUB
+
+// Shared implementation of the module-level sub(pattern, repl, string[, count[, flags]])
+// and of the compiled-regex method sub(repl, string[, count[, flags]]).
+//
+// self_in: the compiled regex object used for matching.
+// n_args, args: the caller's argument array. args[1] is the replacement (a
+//   string, or a callable that is passed the match object and returns a
+//   string), args[2] is the subject string and the optional args[3] is the
+//   maximum number of substitutions (a value <= 0 means "no limit").
+//   args[0] is not read here and any flags argument is ignored.
+//
+// Returns args[2] itself when no substitution took place, otherwise a new
+// object of the same type as args[2] containing the substituted text.
+// Raises IndexError when the replacement refers to a group number that the
+// pattern does not define.
+//
+// In the replacement, "\N" and "\g<N>" insert the text of group N (nothing is
+// inserted for a group that did not participate in the match). A backslash
+// followed by anything else is dropped and the following byte is copied as is.
+// Scanning stops at the first empty match.
+STATIC mp_obj_t re_sub_helper(mp_obj_t self_in, size_t n_args, const mp_obj_t *args) {
+    mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
+    mp_obj_t replace = args[1];
+    mp_obj_t where = args[2];
+    mp_int_t count = 0;
+    if (n_args > 3) {
+        count = mp_obj_get_int(args[3]);
+        // Note: flags are currently ignored
+    }
+
+    size_t where_len;
+    const char *where_str = mp_obj_str_get_data(where, &where_len);
+    Subject subj;
+    subj.begin = where_str;
+    subj.end = subj.begin + where_len;
+    int caps_num = (self->re.sub + 1) * 2;
+
+    vstr_t vstr_return;
+    vstr_return.buf = NULL; // We'll init the vstr after the first match
+    mp_obj_match_t *match = mp_local_alloc(sizeof(mp_obj_match_t) + caps_num * sizeof(char*));
+    match->base.type = &match_type;
+    match->num_matches = caps_num / 2; // caps_num counts start and end pointers
+    match->str = where;
+
+    for (;;) {
+        // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
+        memset((char*)match->caps, 0, caps_num * sizeof(char*));
+        int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, false);
+
+        // If we didn't have a match, or had an empty match, it's time to stop
+        if (!res || match->caps[0] == match->caps[1]) {
+            break;
+        }
+
+        // Initialise the vstr if it's not already
+        if (vstr_return.buf == NULL) {
+            vstr_init(&vstr_return, match->caps[0] - subj.begin);
+        }
+
+        // Add pre-match string
+        vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);
+
+        // Get replacement string
+        const char *repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));
+
+        // Append replacement string to result, substituting any regex groups
+        while (*repl != '\0') {
+            if (*repl == '\\') {
+                ++repl;
+                bool is_g_format = false;
+                if (*repl == 'g' && repl[1] == '<') {
+                    // Group specified with syntax "\g<number>"
+                    repl += 2;
+                    is_g_format = true;
+                }
+
+                if ('0' <= *repl && *repl <= '9') {
+                    // Group specified with syntax "\g<number>" or "\number".
+                    // Every digit is consumed, but accumulation stops as soon
+                    // as the value is out of range.  Without this bound a long
+                    // digit run wraps around the unsigned int and can alias a
+                    // valid group number instead of raising IndexError.
+                    unsigned int num_groups = (unsigned int)match->num_matches;
+                    unsigned int match_no = 0;
+                    do {
+                        unsigned int digit = (unsigned int)(*repl++ - '0');
+                        if (match_no < num_groups) {
+                            match_no = match_no * 10 + digit;
+                        }
+                    } while ('0' <= *repl && *repl <= '9');
+                    if (is_g_format && *repl == '>') {
+                        ++repl;
+                    }
+
+                    if (match_no >= num_groups) {
+                        nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, MP_OBJ_NEW_SMALL_INT(match_no)));
+                    }
+
+                    const char *start_match = match->caps[match_no * 2];
+                    if (start_match != NULL) {
+                        // Add the substring matched by group
+                        const char *end_match = match->caps[match_no * 2 + 1];
+                        vstr_add_strn(&vstr_return, start_match, end_match - start_match);
+                    }
+                }
+            } else {
+                // Just add the current byte from the replacement string
+                vstr_add_byte(&vstr_return, *repl++);
+            }
+        }
+
+        // Move start pointer to end of last match
+        subj.begin = match->caps[1];
+
+        // Stop substitutions if count was given and gets to 0
+        if (count > 0 && --count == 0) {
+            break;
+        }
+    }
+
+    mp_local_free(match);
+
+    if (vstr_return.buf == NULL) {
+        // Optimisation for case of no substitutions
+        return where;
+    }
+
+    // Add post-match string
+    vstr_add_strn(&vstr_return, subj.begin, subj.end - subj.begin);
+
+    return mp_obj_new_str_from_vstr(mp_obj_get_type(where), &vstr_return);
+}
+
+// Method form of sub(): args[0] is the compiled regex object the method was
+// called on; the full argument array is forwarded unchanged to the helper.
+STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args) {
+    mp_obj_t compiled_re = args[0];
+    return re_sub_helper(compiled_re, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub);
+
+#endif
+
STATIC const mp_rom_map_elem_t re_locals_dict_table[] = { // name -> function entries backing re_locals_dict
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
{ MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
+ #if MICROPY_PY_URE_SUB // sub() entry is only compiled in when the feature is enabled
+ { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) },
+ #endif
};
STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
@@ -307,11 +424,22 @@ STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
+#if MICROPY_PY_URE_SUB
+// Module-level form of sub(): args[0] is the pattern, which is compiled first;
+// the full argument array is then forwarded unchanged to the helper.
+STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args) {
+    mp_obj_t compiled_re = mod_re_compile(1, args);
+    return re_sub_helper(compiled_re, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_sub_obj, 3, 5, mod_re_sub);
+#endif
+
STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = { // name -> object entries for the module's globals
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
{ MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
+ #if MICROPY_PY_URE_SUB // module-level sub() is only compiled in when the feature is enabled
+ { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
+ #endif
{ MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
};
diff --git a/py/mpconfig.h b/py/mpconfig.h
index 727375b12..8b0f291cb 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -1150,6 +1150,10 @@ typedef double mp_float_t;
#define MICROPY_PY_URE_MATCH_SPAN_START_END (0)
#endif
+#ifndef MICROPY_PY_URE_SUB
+#define MICROPY_PY_URE_SUB (0)
+#endif
+
#ifndef MICROPY_PY_UHEAPQ
#define MICROPY_PY_UHEAPQ (0)
#endif
diff --git a/tests/extmod/ure_sub.py b/tests/extmod/ure_sub.py
new file mode 100644
index 000000000..4aeb8650a
--- /dev/null
+++ b/tests/extmod/ure_sub.py
@@ -0,0 +1,61 @@
+try:
+ import ure as re
+except ImportError:
+ try:
+ import re
+ except ImportError:
+ print('SKIP')
+ raise SystemExit
+
+try:
+ re.sub
+except AttributeError:
+ print('SKIP')
+ raise SystemExit
+
+
+def multiply(m):  # replacement callback: m is the match object passed in by re.sub
+ return str(int(m.group(0)) * 2)  # double the matched integer and return it as a string
+
+print(re.sub("\d+", multiply, "10 20 30 40 50"))
+
+print(re.sub("\d+", lambda m: str(int(m.group(0)) // 2), "10 20 30 40 50"))
+
+def A():  # called once below to produce the plain-string replacement "A"
+ return "A"
+print(re.sub('a', A(), 'aBCBABCDabcda.'))
+
+print(
+ re.sub(
+ r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
+ 'static PyObject*\npy_\\1(void){\n return;\n}\n',
+ '\n\ndef myfunc():\n\ndef myfunc1():\n\ndef myfunc2():'
+ )
+)
+
+print(
+ re.compile(
+ '(calzino) (blu|bianco|verde) e (scarpa) (blu|bianco|verde)'
+ ).sub(
+ r'\g<1> colore \2 con \g<3> colore \4? ...',
+ 'calzino blu e scarpa verde'
+ )
+)
+
+# no matches at all
+print(re.sub('a', 'b', 'c'))
+
+# with maximum substitution count specified
+print(re.sub('a', 'b', '1a2a3a', 2))
+
+# invalid group
+try:
+ re.sub('(a)', 'b\\2', 'a')
+except:
+ print('invalid group')
+
+# invalid group with very large number (to test overflow in uPy)
+try:
+ re.sub('(a)', 'b\\199999999999999999999999999999999999999', 'a')
+except:
+ print('invalid group')
diff --git a/tests/extmod/ure_sub_unmatched.py b/tests/extmod/ure_sub_unmatched.py
new file mode 100644
index 000000000..4795b3196
--- /dev/null
+++ b/tests/extmod/ure_sub_unmatched.py
@@ -0,0 +1,19 @@
+# test re.sub with unmatched groups, behaviour changed in CPython 3.5
+
+try:
+ import ure as re
+except ImportError:
+ try:
+ import re
+ except ImportError:
+ print('SKIP')
+ raise SystemExit
+
+try:
+ re.sub
+except AttributeError:
+ print('SKIP')
+ raise SystemExit
+
+# first group matches, second optional group doesn't so is replaced with a blank
+print(re.sub(r'(a)(b)?', r'\2-\1', '1a2'))
diff --git a/tests/extmod/ure_sub_unmatched.py.exp b/tests/extmod/ure_sub_unmatched.py.exp
new file mode 100644
index 000000000..1e5f0fda0
--- /dev/null
+++ b/tests/extmod/ure_sub_unmatched.py.exp
@@ -0,0 +1 @@
+1-a2