summary | refs | log | tree | commit | diff
path: root/py
diff options
context:
space:
mode:
Diffstat (limited to 'py')
-rw-r--r-- py/mpconfig.h 5
-rw-r--r-- py/objstr.c 10
-rw-r--r-- py/unicode.c 28
-rw-r--r-- py/unicode.h 1
4 files changed, 44 insertions, 0 deletions
diff --git a/py/mpconfig.h b/py/mpconfig.h
index dac8a903c..38cf4b560 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -691,6 +691,11 @@ typedef double mp_float_t;
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
#endif
+// Whether to check for valid UTF-8 when converting bytes to str (invalid input raises UnicodeError); defaults to the value of MICROPY_PY_BUILTINS_STR_UNICODE
+#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE)
+#endif
+
// Whether str.center() method provided
#ifndef MICROPY_PY_BUILTINS_STR_CENTER
#define MICROPY_PY_BUILTINS_STR_CENTER (0)
diff --git a/py/objstr.c b/py/objstr.c
index 4c287af04..f6214f80c 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -161,6 +161,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
if (str_hash == 0) {
str_hash = qstr_compute_hash(str_data, str_len);
}
+ #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+ if (!utf8_check(str_data, str_len)) {
+ mp_raise_msg(&mp_type_UnicodeError, NULL);
+ }
+ #endif
mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
o->data = str_data;
o->hash = str_hash;
@@ -168,6 +173,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
} else {
mp_buffer_info_t bufinfo;
mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
+ #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+ if (!utf8_check(bufinfo.buf, bufinfo.len)) {
+ mp_raise_msg(&mp_type_UnicodeError, NULL);
+ }
+ #endif
return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
}
}
diff --git a/py/unicode.c b/py/unicode.c
index eddb007d5..140b7ba71 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
}
return n;
}
+
// Check that the len bytes starting at p are structurally well-formed UTF-8.
//
// Each lead byte must be followed by exactly the number of continuation bytes
// it announces (0xc0-0xdf: 1, 0xe0-0xef: 2, 0xf0-0xf7: 3), every continuation
// byte must have the form 10xxxxxx, and the buffer must not end in the middle
// of a sequence.  An empty buffer is valid.
//
// This is a structural check only: overlong encodings, surrogate code points
// and values above U+10FFFF are not rejected.
//
// Returns true if the buffer passes the check, false otherwise.
bool utf8_check(const byte *p, size_t len) {
    uint8_t need = 0; // continuation bytes still expected for the current sequence
    const byte *end = p + len;
    for (; p < end; p++) {
        byte c = *p;
        if (need) {
            // Only 0x80-0xbf (10xxxxxx) may continue a sequence.  A plain
            // "c >= 0x80" test would also accept lead bytes such as 0xc3 and
            // invalid bytes such as 0xff as continuation bytes.
            if ((c & 0xc0) != 0x80) {
                // mismatch
                return false;
            }
            need--;
        } else if (c >= 0xc0) {
            if (c >= 0xf8) {
                // 0xf8-0xff never start a sequence
                return false;
            }
            // Bits 4 and 5 of the lead byte select a 2-bit field of 0xe5
            // (binary 11 10 01 01): 0xc0-0xdf -> 1, 0xe0-0xef -> 2,
            // 0xf0-0xf7 -> 3 continuation bytes.
            need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
        } else if (c >= 0x80) {
            // continuation byte without a preceding lead byte
            return false;
        }
    }
    return need == 0; // no pending fragments allowed
}
diff --git a/py/unicode.h b/py/unicode.h
index 19487a65a..c1fb51789 100644
--- a/py/unicode.h
+++ b/py/unicode.h
@@ -30,5 +30,6 @@
#include "py/misc.h"
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
+bool utf8_check(const byte *p, size_t len);
#endif // MICROPY_INCLUDED_PY_UNICODE_H