summaryrefslogtreecommitdiff
path: root/py/unicode.c
diff options
context:
space:
mode:
author tll <1040424979@qq.com> 2017-06-24 08:38:32 +0800
committer Damien George <damien.p.george@gmail.com> 2017-09-06 16:43:09 +1000
commit 68c28174d0e0ec3f6b1461aea3a0b6a1b84610bb (patch)
tree 441a42ce59c5f965b66722bd6a5a5b24525c6bcf /py/unicode.c
parent 069fc48bf60b31fca4339d26cee7b4a415b185f9 (diff)
py/objstr: Add check for valid UTF-8 when making a str from bytes.
This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
Diffstat (limited to 'py/unicode.c')
-rw-r--r-- py/unicode.c 28
1 files changed, 28 insertions, 0 deletions
diff --git a/py/unicode.c b/py/unicode.c
index eddb007d5..140b7ba71 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
}
return n;
}
+
+bool utf8_check(const byte *p, size_t len) {
+ uint8_t need = 0;
+ const byte *end = p + len;
+ for (; p < end; p++) {
+ byte c = *p;
+ if (need) {
+ if (c >= 0x80) {
+ need--;
+ } else {
+ // mismatch
+ return 0;
+ }
+ } else {
+ if (c >= 0xc0) {
+ if (c >= 0xf8) {
+ // mismatch
+ return 0;
+ }
+ need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
+ } else if (c >= 0x80) {
+ // mismatch
+ return 0;
+ }
+ }
+ }
+ return need == 0; // no pending fragments allowed
+}