diff options
author | tll <1040424979@qq.com> | 2017-06-24 08:38:32 +0800 |
---|---|---|
committer | Damien George <damien.p.george@gmail.com> | 2017-09-06 16:43:09 +1000 |
commit | 68c28174d0e0ec3f6b1461aea3a0b6a1b84610bb (patch) | |
tree | 441a42ce59c5f965b66722bd6a5a5b24525c6bcf /py/unicode.c | |
parent | 069fc48bf60b31fca4339d26cee7b4a415b185f9 (diff) |
py/objstr: Add check for valid UTF-8 when making a str from bytes.
This patch adds a function utf8_check() to check for a valid UTF-8 encoded
string, and calls it when constructing a str from raw bytes. The feature
is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and
is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150
bytes on Xtensa and 170 bytes on x86-64.
Diffstat (limited to 'py/unicode.c')
-rw-r--r-- | py/unicode.c | 28 |
1 files changed, 28 insertions, 0 deletions
diff --git a/py/unicode.c b/py/unicode.c index eddb007d5..140b7ba71 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) { } return n; } + +bool utf8_check(const byte *p, size_t len) { + uint8_t need = 0; + const byte *end = p + len; + for (; p < end; p++) { + byte c = *p; + if (need) { + if (c >= 0x80) { + need--; + } else { + // mismatch + return 0; + } + } else { + if (c >= 0xc0) { + if (c >= 0xf8) { + // mismatch + return 0; + } + need = (0xe5 >> ((c >> 3) & 0x6)) & 3; + } else if (c >= 0x80) { + // mismatch + return 0; + } + } + } + return need == 0; // no pending fragments allowed +} |