diff options
Diffstat (limited to 'py')
-rw-r--r-- | py/lexer.c | 128 | ||||
-rw-r--r-- | py/lexer.h | 11 | ||||
-rw-r--r-- | py/mpconfig.h | 5 | ||||
-rw-r--r-- | py/parse.c | 8 |
4 files changed, 149 insertions, 3 deletions
diff --git a/py/lexer.c b/py/lexer.c index 07ea2b96a..ba118c9d2 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -62,6 +62,12 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) { return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3; } +#if MICROPY_PY_FSTRINGS +STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) { + return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4; +} +#endif + STATIC bool is_char_following(mp_lexer_t *lex, byte c) { return lex->chr1 == c; } @@ -105,7 +111,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) { STATIC bool is_string_or_bytes(mp_lexer_t *lex) { return is_char_or(lex, '\'', '\"') + #if MICROPY_PY_FSTRINGS + || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"')) + || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r')) + && is_char_following_following_or(lex, '\'', '\"'))) + #else || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) + #endif || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"')); } @@ -132,9 +144,35 @@ STATIC void next_char(mp_lexer_t *lex) { ++lex->column; } + // shift the input queue forward lex->chr0 = lex->chr1; lex->chr1 = lex->chr2; - lex->chr2 = lex->reader.readbyte(lex->reader.data); + + // and add the next byte from either the fstring args or the reader + #if MICROPY_PY_FSTRINGS + if (lex->fstring_args_idx) { + // if there are saved chars, then we're currently injecting fstring args + if (lex->fstring_args_idx < lex->fstring_args.len) { + lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++]; + } else { + // no more fstring arg bytes + lex->chr2 = '\0'; + } + + if (lex->chr0 == '\0') { + // consumed all fstring data, restore saved input queue + lex->chr0 = lex->chr0_saved; + lex->chr1 = lex->chr1_saved; + lex->chr2 = lex->chr2_saved; + // stop consuming fstring arg data + vstr_reset(&lex->fstring_args); + lex->fstring_args_idx = 0; + } + } else + #endif + { + lex->chr2 = lex->reader.readbyte(lex->reader.data); + } if (lex->chr1 == '\r') { // CR is a new line, converted to LF @@ -272,7 +310,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) { return true; } -STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { +STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) { // get first quoting character char quote_char = '\''; if (is_char(lex, '\"')) { @@ -293,12 +331,57 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) { } size_t n_closing = 0; + #if MICROPY_PY_FSTRINGS + if (is_fstring) { + // assume there's going to be interpolation, so prep the injection data + // fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args. + // only when fstring_args_idx>0 will we consume the arg data + // note: lex->fstring_args will be empty already (it's reset when finished) + vstr_add_str(&lex->fstring_args, ".format("); + } + #endif + while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) { if (is_char(lex, quote_char)) { n_closing += 1; vstr_add_char(&lex->vstr, CUR_CHAR(lex)); } else { n_closing = 0; + + #if MICROPY_PY_FSTRINGS + while (is_fstring && is_char(lex, '{')) { + next_char(lex); + if (is_char(lex, '{')) { + // "{{" is passed through unchanged to be handled by str.format + vstr_add_byte(&lex->vstr, '{'); + next_char(lex); + } else { + // remember the start of this argument (if we need it for f'{a=}'). + size_t i = lex->fstring_args.len; + // extract characters inside the { until we reach the + // format specifier or closing }. + // (MicroPython limitation) note: this is completely unaware of + // Python syntax and will not handle any expression containing '}' or ':'. + // e.g. f'{"}"}' or f'{foo({})}'. + while (!is_end(lex) && !is_char_or(lex, ':', '}')) { + // like the default case at the end of this function, stay 8-bit clean + vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex)); + next_char(lex); + } + if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') { + // if the last character of the arg was '=', then inject "arg=" before the '{'. + // f'{a=}' --> 'a={}'.format(a) + vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i); + // remove the trailing '=' + lex->fstring_args.len--; + } + // comma-separate args + vstr_add_byte(&lex->fstring_args, ','); + } + vstr_add_byte(&lex->vstr, '{'); + } + #endif + if (is_char(lex, '\\')) { next_char(lex); unichar c = CUR_CHAR(lex); @@ -451,6 +534,23 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { } void mp_lexer_to_next(mp_lexer_t *lex) { + #if MICROPY_PY_FSTRINGS + if (lex->fstring_args.len && lex->fstring_args_idx == 0) { + // moving onto the next token means the literal string is complete. + // switch into injecting the format args. + vstr_add_byte(&lex->fstring_args, ')'); + lex->chr0_saved = lex->chr0; + lex->chr1_saved = lex->chr1; + lex->chr2_saved = lex->chr2; + lex->chr0 = lex->fstring_args.buf[0]; + lex->chr1 = lex->fstring_args.buf[1]; + lex->chr2 = lex->fstring_args.buf[2]; + // we've already extracted 3 chars, but setting this non-zero also + // means we'll start consuming the fstring data + lex->fstring_args_idx = 3; + } + #endif + // start new token text vstr_reset(&lex->vstr); @@ -506,6 +606,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) { do { // parse type codes bool is_raw = false; + bool is_fstring = false; mp_token_kind_t kind = MP_TOKEN_STRING; int n_char = 0; if (is_char(lex, 'u')) { @@ -524,7 +625,25 @@ void mp_lexer_to_next(mp_lexer_t *lex) { kind = MP_TOKEN_BYTES; n_char = 2; } + #if MICROPY_PY_FSTRINGS + if (is_char_following(lex, 'f')) { + // raw-f-strings unsupported, immediately return (invalid) token. + lex->tok_kind = MP_TOKEN_FSTRING_RAW; + break; + } + #endif + } + #if MICROPY_PY_FSTRINGS + else if (is_char(lex, 'f')) { + if (is_char_following(lex, 'r')) { + // raw-f-strings unsupported, immediately return (invalid) token. + lex->tok_kind = MP_TOKEN_FSTRING_RAW; + break; + } + n_char = 1; + is_fstring = true; } + #endif // Set or check token kind if (lex->tok_kind == MP_TOKEN_END) { @@ -543,7 +662,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) { } // Parse the literal - parse_string_literal(lex, is_raw); + parse_string_literal(lex, is_raw, is_fstring); // Skip whitespace so we can check if there's another string following skip_whitespace(lex, true); @@ -703,6 +822,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) { lex->num_indent_level = 1; lex->indent_level = m_new(uint16_t, lex->alloc_indent_level); vstr_init(&lex->vstr, 32); + #if MICROPY_PY_FSTRINGS + vstr_init(&lex->fstring_args, 0); + #endif // store sentinel for first indentation level lex->indent_level[0] = 0; diff --git a/py/lexer.h b/py/lexer.h index 91767a44b..e16b9a8ce 100644 --- a/py/lexer.h +++ b/py/lexer.h @@ -44,6 +44,10 @@ typedef enum _mp_token_kind_t { MP_TOKEN_INVALID, MP_TOKEN_DEDENT_MISMATCH, MP_TOKEN_LONELY_STRING_OPEN, + #if MICROPY_PY_FSTRINGS + MP_TOKEN_MALFORMED_FSTRING, + MP_TOKEN_FSTRING_RAW, + #endif MP_TOKEN_NEWLINE, MP_TOKEN_INDENT, @@ -158,6 +162,9 @@ typedef struct _mp_lexer_t { mp_reader_t reader; // stream source unichar chr0, chr1, chr2; // current cached characters from source + #if MICROPY_PY_FSTRINGS + unichar chr0_saved, chr1_saved, chr2_saved; // current cached characters from alt source + #endif size_t line; // current source line size_t column; // current source column @@ -173,6 +180,10 @@ typedef struct _mp_lexer_t { size_t tok_column; // token source column mp_token_kind_t tok_kind; // token kind vstr_t vstr; // token data + #if MICROPY_PY_FSTRINGS + vstr_t fstring_args; // extracted arguments to pass to .format() + size_t fstring_args_idx; // how many bytes of fstring_args have been read + #endif } mp_lexer_t; mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader); diff --git a/py/mpconfig.h b/py/mpconfig.h index a91c39b01..a5d639efe 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -875,6 +875,11 @@ typedef double mp_float_t; #define MICROPY_PY_ASYNC_AWAIT (1) #endif +// Support for literal string interpolation, f-strings (see PEP 498, Python 3.6+) +#ifndef MICROPY_PY_FSTRINGS +#define MICROPY_PY_FSTRINGS (0) +#endif + // Support for assignment expressions with := (see PEP 572, Python 3.8+) #ifndef MICROPY_PY_ASSIGN_EXPR #define MICROPY_PY_ASSIGN_EXPR (1) diff --git a/py/parse.c b/py/parse.c index da2f5e796..ae3fa8ea6 100644 --- a/py/parse.c +++ b/py/parse.c @@ -1152,6 +1152,14 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) { } else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) { exc = mp_obj_new_exception_msg(&mp_type_IndentationError, MP_ERROR_TEXT("unindent doesn't match any outer indent level")); + #if MICROPY_PY_FSTRINGS + } else if (lex->tok_kind == MP_TOKEN_MALFORMED_FSTRING) { + exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, + MP_ERROR_TEXT("malformed f-string")); + } else if (lex->tok_kind == MP_TOKEN_FSTRING_RAW) { + exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, + MP_ERROR_TEXT("raw f-strings are not supported")); + #endif } else { exc = mp_obj_new_exception_msg(&mp_type_SyntaxError, MP_ERROR_TEXT("invalid syntax")); |