summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jared Hancock <jared.hancock@centeredsolutions.com> 2024-03-25 20:58:51 -0500
committer: Damien George <damien@micropython.org> 2025-08-11 14:11:56 +1000
commit: 14ccdeb4d7b9b88ab012258e77d3070340fbc3da (patch)
tree: 5dfddbc202170ecaa12f1d61746442afef938f5b
parent: 485dac783b8ba7b88fdbf28fcdf54eb053cd8ef7 (diff)
extmod/modre: Add support for start- and endpos.
Compiled pattern objects now accept two additional optional parameters in their search() and match() methods, giving the start and end positions of the region of the string to be searched. This allows searching a sub-string without creating a slice. One caveat of using the start position rather than a slice is that the start anchor (`^`) remains anchored to the beginning of the full string. Signed-off-by: Jared Hancock <jared@greezybacon.me>
-rw-r--r-- docs/library/re.rst | 14
-rw-r--r-- extmod/modre.c | 25
-rw-r--r-- tests/extmod/re_start_end_pos.py | 78
3 files changed, 114 insertions, 3 deletions
diff --git a/docs/library/re.rst b/docs/library/re.rst
index 19b15d2d2..b8aeefd90 100644
--- a/docs/library/re.rst
+++ b/docs/library/re.rst
@@ -154,8 +154,8 @@ Regex objects
Compiled regular expression. Instances of this class are created using
`re.compile()`.
-.. method:: regex.match(string)
- regex.search(string)
+.. method:: regex.match(string, [pos, [endpos]])
+ regex.search(string, [pos, [endpos]])
regex.sub(replace, string, count=0, flags=0, /)
Similar to the module-level functions :meth:`match`, :meth:`search`
@@ -163,6 +163,16 @@ Compiled regular expression. Instances of this class are created using
Using methods is (much) more efficient if the same regex is applied to
multiple strings.
+ The optional second parameter *pos* gives an index in the string where the
+ search is to start; it defaults to ``0``. This is not completely equivalent
+ to slicing the string; the ``'^'`` pattern character matches at the real
+ beginning of the string and at positions just after a newline, but not
+ necessarily at the index where the search is to start.
+
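+ The optional parameter *endpos* limits how far the string will be searched;
+ it will be as if the string is *endpos* characters long, so only the
+ characters from *pos* to ``endpos - 1`` will be searched for a match.
+ Out-of-range values are clamped rather than rejected: *pos* is limited to
+ the range ``0`` to the length of the string, and *endpos* is limited to the
+ range *pos* to the length of the string.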
+
.. method:: regex.split(string, max_split=-1, /)
Split a *string* using regex. If *max_split* is given, it specifies
diff --git a/extmod/modre.c b/extmod/modre.c
index d17ec68d5..85e5d1b0f 100644
--- a/extmod/modre.c
+++ b/extmod/modre.c
@@ -196,10 +196,11 @@ static void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t
// Note: this function can't be named re_exec because it may clash with system headers, eg on FreeBSD
static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *args) {
- (void)n_args;
mp_obj_re_t *self;
+ bool was_compiled = false;
if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) {
self = MP_OBJ_TO_PTR(args[0]);
+ was_compiled = true;
} else {
self = MP_OBJ_TO_PTR(mod_re_compile(1, args));
}
@@ -207,6 +208,28 @@ static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *ar
size_t len;
subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
subj.end = subj.begin + len;
+
+ if (was_compiled && n_args > 2) {
+ // Arg #2 is starting-pos
+ mp_int_t startpos = mp_obj_get_int(args[2]);
+ if (startpos > (mp_int_t)len) {
+ startpos = len;
+ } else if (startpos < 0) {
+ startpos = 0;
+ }
+ subj.begin += startpos;
+ if (n_args > 3) {
+ // Arg #3 is ending-pos
+ mp_int_t endpos = mp_obj_get_int(args[3]);
+ if (endpos > (mp_int_t)len) {
+ endpos = len;
+ } else if (endpos < startpos) {
+ endpos = startpos;
+ }
+ subj.end = subj.begin_line + endpos;
+ }
+ }
+
int caps_num = (self->re.sub + 1) * 2;
mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num);
// cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
diff --git a/tests/extmod/re_start_end_pos.py b/tests/extmod/re_start_end_pos.py
new file mode 100644
index 000000000..bd1658437
--- /dev/null
+++ b/tests/extmod/re_start_end_pos.py
@@ -0,0 +1,78 @@
+# test start and end pos specification
+
+try:
+ import re
+except ImportError:
+ print("SKIP")
+ raise SystemExit
+
+
+def print_groups(match):
+ print("----")
+ try:
+ if match is not None:
+ i = 0
+ while True:
+ print(match.group(i))
+ i += 1
+ except IndexError:
+ pass
+
+
+p = re.compile(r"o")
+m = p.match("dog")
+print_groups(m)
+
+m = p.match("dog", 1)
+print_groups(m)
+
+m = p.match("dog", 2)
+print_groups(m)
+
+# No match past end of input
+m = p.match("dog", 5)
+print_groups(m)
+
+m = p.match("dog", 0, 1)
+print_groups(m)
+
+# Caret only matches the actual beginning
+p = re.compile(r"^o")
+m = p.match("dog", 1)
+print_groups(m)
+
+# End at beginning means searching empty string
+p = re.compile(r"o")
+m = p.match("dog", 1, 1)
+print_groups(m)
+
+# End before the beginning doesn't match anything
+m = p.match("dog", 2, 1)
+print_groups(m)
+
+# Negative starting values don't crash
+m = p.search("dog", -2)
+print_groups(m)
+
+m = p.search("dog", -2, -5)
+print_groups(m)
+
+# Search also works
+print("--search")
+
+p = re.compile(r"o")
+m = p.search("dog")
+print_groups(m)
+
+m = p.search("dog", 1)
+print_groups(m)
+
+m = p.search("dog", 2)
+print_groups(m)
+
+# Negative starting values don't crash
+m = p.search("dog", -2)
+print_groups(m)
+
+m = p.search("dog", -2, -5)
+print_groups(m)