diff options
| author | Jared Hancock <jared.hancock@centeredsolutions.com> | 2024-03-25 20:58:51 -0500 |
|---|---|---|
| committer | Damien George <damien@micropython.org> | 2025-08-11 14:11:56 +1000 |
| commit | 14ccdeb4d7b9b88ab012258e77d3070340fbc3da (patch) | |
| tree | 5dfddbc202170ecaa12f1d61746442afef938f5b | |
| parent | 485dac783b8ba7b88fdbf28fcdf54eb053cd8ef7 (diff) | |
extmod/modre: Add support for start- and endpos.
Pattern objects have two additional parameters for the ::search and ::match
methods to define the starting and ending position of the subject within
the string to be searched.
This allows for searching a sub-string without creating a slice. However,
one caveat of using the start-pos rather than a slice is that the start
anchor (`^`) remains anchored to the beginning of the text.
Signed-off-by: Jared Hancock <jared@greezybacon.me>
| -rw-r--r-- | docs/library/re.rst | 14 | ||||
| -rw-r--r-- | extmod/modre.c | 25 | ||||
| -rw-r--r-- | tests/extmod/re_start_end_pos.py | 78 |
3 files changed, 114 insertions, 3 deletions
diff --git a/docs/library/re.rst b/docs/library/re.rst index 19b15d2d2..b8aeefd90 100644 --- a/docs/library/re.rst +++ b/docs/library/re.rst @@ -154,8 +154,8 @@ Regex objects Compiled regular expression. Instances of this class are created using `re.compile()`. -.. method:: regex.match(string) - regex.search(string) +.. method:: regex.match(string, [pos, [endpos]]) + regex.search(string, [pos, [endpos]]) regex.sub(replace, string, count=0, flags=0, /) Similar to the module-level functions :meth:`match`, :meth:`search` @@ -163,6 +163,16 @@ Compiled regular expression. Instances of this class are created using Using methods is (much) more efficient if the same regex is applied to multiple strings. + The optional second parameter *pos* gives an index in the string where the + search is to start; it defaults to ``0``. This is not completely equivalent + to slicing the string; the ``'^'`` pattern character matches at the real + beginning of the string and at positions just after a newline, but not + necessarily at the index where the search is to start. + + The optional parameter *endpos* limits how far the string will be searched; + it will be as if the string is *endpos* characters long, so only the + characters from *pos* to ``endpos - 1`` will be searched for a match. + .. method:: regex.split(string, max_split=-1, /) Split a *string* using regex. If *max_split* is given, it specifies diff --git a/extmod/modre.c b/extmod/modre.c index d17ec68d5..85e5d1b0f 100644 --- a/extmod/modre.c +++ b/extmod/modre.c @@ -196,10 +196,11 @@ static void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t // Note: this function can't be named re_exec because it may clash with system headers, eg on FreeBSD static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *args) { - (void)n_args; mp_obj_re_t *self; + bool was_compiled = false; if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) { self = MP_OBJ_TO_PTR(args[0]); + was_compiled = true; } else { self = MP_OBJ_TO_PTR(mod_re_compile(1, args)); } @@ -207,6 +208,28 @@ static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *ar size_t len; subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len); subj.end = subj.begin + len; + + if (was_compiled && n_args > 2) { + // Arg #2 is starting-pos + mp_int_t startpos = mp_obj_get_int(args[2]); + if (startpos > (mp_int_t)len) { + startpos = len; + } else if (startpos < 0) { + startpos = 0; + } + subj.begin += startpos; + if (n_args > 3) { + // Arg #3 is ending-pos + mp_int_t endpos = mp_obj_get_int(args[3]); + if (endpos > (mp_int_t)len) { + endpos = len; + } else if (endpos < startpos) { + endpos = startpos; + } + subj.end = subj.begin_line + endpos; + } + } + int caps_num = (self->re.sub + 1) * 2; mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num); // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char diff --git a/tests/extmod/re_start_end_pos.py b/tests/extmod/re_start_end_pos.py new file mode 100644 index 000000000..bd1658437 --- /dev/null +++ b/tests/extmod/re_start_end_pos.py @@ -0,0 +1,78 @@ +# test start and end pos specification + +try: + import re +except ImportError: + print("SKIP") + raise SystemExit + + +def print_groups(match): + print("----") + try: + if match is not None: + i = 0 + while True: + print(match.group(i)) + i += 1 + except IndexError: + pass + + +p = re.compile(r"o") +m = p.match("dog") +print_groups(m) + +m = p.match("dog", 1) +print_groups(m) + +m = p.match("dog", 2) +print_groups(m) + +# No match past end of input +m = p.match("dog", 5) +print_groups(m) + +m = p.match("dog", 0, 1) +print_groups(m) + +# Caret only matches the actual beginning +p = re.compile(r"^o") +m = p.match("dog", 1) +print_groups(m) + +# End at beginning means searching empty string +p = re.compile(r"o") +m = p.match("dog", 1, 1) +print_groups(m) + +# End before the beginning doesn't match anything +m = p.match("dog", 2, 1) +print_groups(m) + +# Negative starting values don't crash +m = p.search("dog", -2) +print_groups(m) + +m = p.search("dog", -2, -5) +print_groups(m) + +# Search also works +print("--search") + +p = re.compile(r"o") +m = p.search("dog") +print_groups(m) + +m = p.search("dog", 1) +print_groups(m) + +m = p.search("dog", 2) +print_groups(m) + +# Negative starting values don't crash +m = p.search("dog", -2) +print_groups(m) + +m = p.search("dog", -2, -5) +print_groups(m) |
