diff options
Diffstat (limited to 'py/lexer.c')
| -rw-r--r-- | py/lexer.c | 355 | 
1 files changed, 198 insertions, 157 deletions
diff --git a/py/lexer.c b/py/lexer.c index ad4fe3fcb..329875ab0 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -63,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {      return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;  } -/*  STATIC bool is_char_following(mp_lexer_t *lex, byte c) {      return lex->chr1 == c;  } -*/  STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {      return lex->chr1 == c1 || lex->chr1 == c2; @@ -106,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {      return lex->chr1 >= '0' && lex->chr1 <= '7';  } +STATIC bool is_string_or_bytes(mp_lexer_t *lex) { +    return is_char_or(lex, '\'', '\"') +        || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) +        || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) +            && is_char_following_following_or(lex, '\'', '\"')); +} +  // to easily parse utf-8 identifiers we allow any raw byte with high bit set  STATIC bool is_head_of_identifier(mp_lexer_t *lex) {      return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80; @@ -272,14 +277,144 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {      return true;  } -void mp_lexer_to_next(mp_lexer_t *lex) { -    // start new token text -    vstr_reset(&lex->vstr); +STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { +    // get first quoting character +    char quote_char = '\''; +    if (is_char(lex, '\"')) { +        quote_char = '\"'; +    } +    next_char(lex); -    // skip white space and comments +    // work out if it's a single or triple quoted literal +    size_t num_quotes; +    if (is_char_and(lex, quote_char, quote_char)) { +        // triple quotes +        next_char(lex); +        next_char(lex); +        num_quotes = 3; +    } else { +        // single quotes +        num_quotes = 1; +    } + +    size_t n_closing = 0; +    while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { +        if (is_char(lex, quote_char)) { +            n_closing += 1; +            vstr_add_char(&lex->vstr, CUR_CHAR(lex)); +        } else { +            n_closing = 0; +            if (is_char(lex, '\\')) { +                next_char(lex); +                unichar c = CUR_CHAR(lex); +                if (is_raw) { +                    // raw strings allow escaping of quotes, but the backslash is also emitted +                    vstr_add_char(&lex->vstr, '\\'); +                } else { +                    switch (c) { +                        // note: "c" can never be MP_LEXER_EOF because next_char +                        // always inserts a newline at the end of the input stream +                        case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it +                        case '\\': break; +                        case '\'': break; +                        case '"': break; +                        case 'a': c = 0x07; break; +                        case 'b': c = 0x08; break; +                        case 't': c = 0x09; break; +                        case 'n': c = 0x0a; break; +                        case 'v': c = 0x0b; break; +                        case 'f': c = 0x0c; break; +                        case 'r': c = 0x0d; break; +                        case 'u': +                        case 'U': +                            if (lex->tok_kind == MP_TOKEN_BYTES) { +                                // b'\u1234' == b'\\u1234' +                                vstr_add_char(&lex->vstr, '\\'); +                                break; +                            } +                            // Otherwise fall through. +                        case 'x': +                        { +                            mp_uint_t num = 0; +                            if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) { +                                // not enough hex chars for escape sequence +                                lex->tok_kind = MP_TOKEN_INVALID; +                            } +                            c = num; +                            break; +                        } +                        case 'N': +                            // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the +                            // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly +                            // 3MB of text; even gzip-compressed and with minimal structure, it'll take +                            // roughly half a meg of storage. This form of Unicode escape may be added +                            // later on, but it's definitely not a priority right now. -- CJA 20140607 +                            mp_not_implemented("unicode name escapes"); +                            break; +                        default: +                            if (c >= '0' && c <= '7') { +                                // Octal sequence, 1-3 chars +                                mp_uint_t digits = 3; +                                mp_uint_t num = c - '0'; +                                while (is_following_odigit(lex) && --digits != 0) { +                                    next_char(lex); +                                    num = num * 8 + (CUR_CHAR(lex) - '0'); +                                } +                                c = num; +                            } else { +                                // unrecognised escape character; CPython lets this through verbatim as '\' and then the character +                                vstr_add_char(&lex->vstr, '\\'); +                            } +                            break; +                    } +                } +                if (c != MP_LEXER_EOF) { +                    if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) { +                        if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) { +                            vstr_add_char(&lex->vstr, c); +                        } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) { +                            vstr_add_byte(&lex->vstr, c); +                        } else { +                            // unicode character out of range +                            // this raises a generic SyntaxError; could provide more info +                            lex->tok_kind = MP_TOKEN_INVALID; +                        } +                    } else { +                        // without unicode everything is just added as an 8-bit byte +                        if (c < 0x100) { +                            vstr_add_byte(&lex->vstr, c); +                        } else { +                            // 8-bit character out of range +                            // this raises a generic SyntaxError; could provide more info +                            lex->tok_kind = MP_TOKEN_INVALID; +                        } +                    } +                } +            } else { +                // Add the "character" as a byte so that we remain 8-bit clean. +                // This way, strings are parsed correctly whether or not they contain utf-8 chars. +                vstr_add_byte(&lex->vstr, CUR_CHAR(lex)); +            } +        } +        next_char(lex); +    } + +    // check we got the required end quotes +    if (n_closing < num_quotes) { +        lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN; +    } + +    // cut off the end quotes from the token text +    vstr_cut_tail_bytes(&lex->vstr, n_closing); +} + +STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {      bool had_physical_newline = false;      while (!is_end(lex)) {          if (is_physical_newline(lex)) { +            if (stop_at_newline && lex->nested_bracket_level == 0) { +                break; +            }              had_physical_newline = true;              next_char(lex);          } else if (is_whitespace(lex)) { @@ -298,6 +433,15 @@ void mp_lexer_to_next(mp_lexer_t *lex) {              break;          }      } +    return had_physical_newline; +} + +void mp_lexer_to_next(mp_lexer_t *lex) { +    // start new token text +    vstr_reset(&lex->vstr); + +    // skip white space and comments +    bool had_physical_newline = skip_whitespace(lex, false);      // set token source information      lex->tok_line = lex->line; @@ -332,168 +476,65 @@ void mp_lexer_to_next(mp_lexer_t *lex) {      } else if (is_end(lex)) {          lex->tok_kind = MP_TOKEN_END; -    } else if (is_char_or(lex, '\'', '\"') -               || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) -               || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) { +    } else if (is_string_or_bytes(lex)) {          // a string or bytes literal -        // parse type codes -        bool is_raw = false; -        bool is_bytes = false; -        if (is_char(lex, 'u')) { -            next_char(lex); -        } else if (is_char(lex, 'b')) { -            is_bytes = true; -            next_char(lex); -            if (is_char(lex, 'r')) { -                is_raw = true; -                next_char(lex); -            } -        } else if (is_char(lex, 'r')) { -            is_raw = true; -            next_char(lex); -            if (is_char(lex, 'b')) { -                is_bytes = true; -                next_char(lex); -            } -        } +        // Python requires adjacent string/bytes literals to be automatically +        // concatenated.  We do it here in the tokeniser to make efficient use of RAM, +        // because then the lexer's vstr can be used to accumulate the string literal, +        // in contrast to creating a parse tree of strings and then joining them later +        // in the compiler.  It's also more compact in code size to do it here. -        // set token kind -        if (is_bytes) { -            lex->tok_kind = MP_TOKEN_BYTES; -        } else { -            lex->tok_kind = MP_TOKEN_STRING; -        } +        // MP_TOKEN_END is used to indicate that this is the first string token +        lex->tok_kind = MP_TOKEN_END; -        // get first quoting character -        char quote_char = '\''; -        if (is_char(lex, '\"')) { -            quote_char = '\"'; -        } -        next_char(lex); +        // Loop to accumulate string/bytes literals +        do { +            // parse type codes +            bool is_raw = false; +            mp_token_kind_t kind = MP_TOKEN_STRING; +            int n_char = 0; +            if (is_char(lex, 'u')) { +                n_char = 1; +            } else if (is_char(lex, 'b')) { +                kind = MP_TOKEN_BYTES; +                n_char = 1; +                if (is_char_following(lex, 'r')) { +                    is_raw = true; +                    n_char = 2; +                } +            } else if (is_char(lex, 'r')) { +                is_raw = true; +                n_char = 1; +                if (is_char_following(lex, 'b')) { +                    kind = MP_TOKEN_BYTES; +                    n_char = 2; +                } +            } -        // work out if it's a single or triple quoted literal -        mp_uint_t num_quotes; -        if (is_char_and(lex, quote_char, quote_char)) { -            // triple quotes -            next_char(lex); -            next_char(lex); -            num_quotes = 3; -        } else { -            // single quotes -            num_quotes = 1; -        } +            // Set or check token kind +            if (lex->tok_kind == MP_TOKEN_END) { +                lex->tok_kind = kind; +            } else if (lex->tok_kind != kind) { +                // Can't concatenate string with bytes +                break; +            } -        // parse the literal -        mp_uint_t n_closing = 0; -        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { -            if (is_char(lex, quote_char)) { -                n_closing += 1; -                vstr_add_char(&lex->vstr, CUR_CHAR(lex)); -            } else { -                n_closing = 0; -                if (is_char(lex, '\\')) { +            // Skip any type code characters +            if (n_char != 0) { +                next_char(lex); +                if (n_char == 2) {                      next_char(lex); -                    unichar c = CUR_CHAR(lex); -                    if (is_raw) { -                        // raw strings allow escaping of quotes, but the backslash is also emitted -                        vstr_add_char(&lex->vstr, '\\'); -                    } else { -                        switch (c) { -                            // note: "c" can never be MP_LEXER_EOF because next_char -                            // always inserts a newline at the end of the input stream -                            case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it -                            case '\\': break; -                            case '\'': break; -                            case '"': break; -                            case 'a': c = 0x07; break; -                            case 'b': c = 0x08; break; -                            case 't': c = 0x09; break; -                            case 'n': c = 0x0a; break; -                            case 'v': c = 0x0b; break; -                            case 'f': c = 0x0c; break; -                            case 'r': c = 0x0d; break; -                            case 'u': -                            case 'U': -                                if (is_bytes) { -                                    // b'\u1234' == b'\\u1234' -                                    vstr_add_char(&lex->vstr, '\\'); -                                    break; -                                } -                                // Otherwise fall through. -                            case 'x': -                            { -                                mp_uint_t num = 0; -                                if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) { -                                    // not enough hex chars for escape sequence -                                    lex->tok_kind = MP_TOKEN_INVALID; -                                } -                                c = num; -                                break; -                            } -                            case 'N': -                                // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the -                                // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly -                                // 3MB of text; even gzip-compressed and with minimal structure, it'll take -                                // roughly half a meg of storage. This form of Unicode escape may be added -                                // later on, but it's definitely not a priority right now. -- CJA 20140607 -                                mp_not_implemented("unicode name escapes"); -                                break; -                            default: -                                if (c >= '0' && c <= '7') { -                                    // Octal sequence, 1-3 chars -                                    mp_uint_t digits = 3; -                                    mp_uint_t num = c - '0'; -                                    while (is_following_odigit(lex) && --digits != 0) { -                                        next_char(lex); -                                        num = num * 8 + (CUR_CHAR(lex) - '0'); -                                    } -                                    c = num; -                                } else { -                                    // unrecognised escape character; CPython lets this through verbatim as '\' and then the character -                                    vstr_add_char(&lex->vstr, '\\'); -                                } -                                break; -                        } -                    } -                    if (c != MP_LEXER_EOF) { -                        if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) { -                            if (c < 0x110000 && !is_bytes) { -                                vstr_add_char(&lex->vstr, c); -                            } else if (c < 0x100 && is_bytes) { -                                vstr_add_byte(&lex->vstr, c); -                            } else { -                                // unicode character out of range -                                // this raises a generic SyntaxError; could provide more info -                                lex->tok_kind = MP_TOKEN_INVALID; -                            } -                        } else { -                            // without unicode everything is just added as an 8-bit byte -                            if (c < 0x100) { -                                vstr_add_byte(&lex->vstr, c); -                            } else { -                                // 8-bit character out of range -                                // this raises a generic SyntaxError; could provide more info -                                lex->tok_kind = MP_TOKEN_INVALID; -                            } -                        } -                    } -                } else { -                    // Add the "character" as a byte so that we remain 8-bit clean. -                    // This way, strings are parsed correctly whether or not they contain utf-8 chars. -                    vstr_add_byte(&lex->vstr, CUR_CHAR(lex));                  }              } -            next_char(lex); -        } -        // check we got the required end quotes -        if (n_closing < num_quotes) { -            lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN; -        } +            // Parse the literal +            parse_string_literal(lex, is_raw); + +            // Skip whitespace so we can check if there's another string following +            skip_whitespace(lex, true); -        // cut off the end quotes from the token text -        vstr_cut_tail_bytes(&lex->vstr, n_closing); +        } while (is_string_or_bytes(lex));      } else if (is_head_of_identifier(lex)) {          lex->tok_kind = MP_TOKEN_NAME;  | 
