py: Implement partial PEP-498 (f-string) support.

This implements (most of) the PEP-498 spec for f-strings and is based on
https://github.com/micropython/micropython/pull/4998 by @klardotsh.

It is implemented in the lexer as a syntax translation to `str.format`:

    f"{a}" --> "{}".format(a)

It also supports:

    f"{a=}" --> "a={}".format(a)

This is done by extracting the arguments into a temporary vstr buffer,
then after the string has been tokenized, the lexer input queue is saved
and the contents of the temporary vstr buffer are injected into the lexer
instead.

There are four main limitations:

- raw f-strings (`fr` or `rf` prefixes) are not supported and will raise
  `SyntaxError: raw f-strings are not supported`.

- literal concatenation of f-strings with adjacent strings will fail:

      "{}" f"{a}" --> "{}{}".format(a)
          (str.format will incorrectly use the braces from the non-f-string)
      f"{a}" f"{a}" --> "{}".format(a) "{}".format(a)
          (cannot concatenate)

- PEP-498 requires the full parser to understand the interpolated
  argument, however because this entirely runs in the lexer it cannot
  resolve nested braces in expressions like:

      f"{'}'}"

- The !r, !s, and !a conversions are not supported.

Includes tests and cpydiffs.

Signed-off-by: Jim Mussared <jim.mussared@gmail.com>
author: Jim Mussared <jim.mussared@gmail.com> 2021-08-13 01:44:08 +1000
committer: Damien George <damien@micropython.org> 2021-08-14 16:58:40 +1000
commit: 692d36d779192f32371f7f9daa845b566f26968d (patch)
tree: c3bfe2b4a90df72aad6b6eaac8bb6dac398516d9 /py/lexer.c
parent: 162bf3c5d8055a9e9a17461878c9d058066283a5 (diff)
1 files changed, 125 insertions, 3 deletions
diff --git a/py/lexer.c b/py/lexer.c
index 07ea2b96a..ba118c9d2 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -62,6 +62,12 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
     return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
 }
 
+#if MICROPY_PY_FSTRINGS
+STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
+    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
+}
+#endif
+
 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
     return lex->chr1 == c;
 }
@@ -105,7 +111,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
 
 STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
     return is_char_or(lex, '\'', '\"')
+           #if MICROPY_PY_FSTRINGS
+           || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
+           || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
+               && is_char_following_following_or(lex, '\'', '\"')))
+           #else
            || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+           #endif
            || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
                && is_char_following_following_or(lex, '\'', '\"'));
 }
@@ -132,9 +144,35 @@ STATIC void next_char(mp_lexer_t *lex) {
         ++lex->column;
     }
 
+    // shift the input queue forward
     lex->chr0 = lex->chr1;
     lex->chr1 = lex->chr2;
-    lex->chr2 = lex->reader.readbyte(lex->reader.data);
+
+    // and add the next byte from either the fstring args or the reader
+    #if MICROPY_PY_FSTRINGS
+    if (lex->fstring_args_idx) {
+        // if there are saved chars, then we're currently injecting fstring args
+        if (lex->fstring_args_idx < lex->fstring_args.len) {
+            lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
+        } else {
+            // no more fstring arg bytes
+            lex->chr2 = '\0';
+        }
+
+        if (lex->chr0 == '\0') {
+            // consumed all fstring data, restore saved input queue
+            lex->chr0 = lex->chr0_saved;
+            lex->chr1 = lex->chr1_saved;
+            lex->chr2 = lex->chr2_saved;
+            // stop consuming fstring arg data
+            vstr_reset(&lex->fstring_args);
+            lex->fstring_args_idx = 0;
+        }
+    } else
+    #endif
+    {
+        lex->chr2 = lex->reader.readbyte(lex->reader.data);
+    }
 
     if (lex->chr1 == '\r') {
         // CR is a new line, converted to LF
@@ -272,7 +310,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
     return true;
 }
 
-STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
     // get first quoting character
     char quote_char = '\'';
     if (is_char(lex, '\"')) {
@@ -293,12 +331,57 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
     }
 
     size_t n_closing = 0;
+    #if MICROPY_PY_FSTRINGS
+    if (is_fstring) {
+        // assume there's going to be interpolation, so prep the injection data
+        // fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
+        // only when fstring_args_idx>0 will we consume the arg data
+        // note: lex->fstring_args will be empty already (it's reset when finished)
+        vstr_add_str(&lex->fstring_args, ".format(");
+    }
+    #endif
+
     while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
         if (is_char(lex, quote_char)) {
             n_closing += 1;
             vstr_add_char(&lex->vstr, CUR_CHAR(lex));
         } else {
             n_closing = 0;
+
+            #if MICROPY_PY_FSTRINGS
+            while (is_fstring && is_char(lex, '{')) {
+                next_char(lex);
+                if (is_char(lex, '{')) {
+                    // "{{" is passed through unchanged to be handled by str.format
+                    vstr_add_byte(&lex->vstr, '{');
+                    next_char(lex);
+                } else {
+                    // remember the start of this argument (if we need it for f'{a=}').
+                    size_t i = lex->fstring_args.len;
+                    // extract characters inside the { until we reach the
+                    // format specifier or closing }.
+                    // (MicroPython limitation) note: this is completely unaware of
+                    // Python syntax and will not handle any expression containing '}' or ':'.
+                    // e.g. f'{"}"}' or f'{foo({})}'.
+                    while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
+                        // like the default case at the end of this function, stay 8-bit clean
+                        vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
+                        next_char(lex);
+                    }
+                    if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
+                        // if the last character of the arg was '=', then inject "arg=" before the '{'.
+                        // f'{a=}' --> 'a={}'.format(a)
+                        vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
+                        // remove the trailing '='
+                        lex->fstring_args.len--;
+                    }
+                    // comma-separate args
+                    vstr_add_byte(&lex->fstring_args, ',');
+                }
+                vstr_add_byte(&lex->vstr, '{');
+            }
+            #endif
+
             if (is_char(lex, '\\')) {
                 next_char(lex);
                 unichar c = CUR_CHAR(lex);
@@ -451,6 +534,23 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
 }
 
 void mp_lexer_to_next(mp_lexer_t *lex) {
+    #if MICROPY_PY_FSTRINGS
+    if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
+        // moving onto the next token means the literal string is complete.
+        // switch into injecting the format args.
+        vstr_add_byte(&lex->fstring_args, ')');
+        lex->chr0_saved = lex->chr0;
+        lex->chr1_saved = lex->chr1;
+        lex->chr2_saved = lex->chr2;
+        lex->chr0 = lex->fstring_args.buf[0];
+        lex->chr1 = lex->fstring_args.buf[1];
+        lex->chr2 = lex->fstring_args.buf[2];
+        // we've already extracted 3 chars, but setting this non-zero also
+        // means we'll start consuming the fstring data
+        lex->fstring_args_idx = 3;
+    }
+    #endif
+
     // start new token text
     vstr_reset(&lex->vstr);
 
@@ -506,6 +606,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
         do {
             // parse type codes
             bool is_raw = false;
+            bool is_fstring = false;
             mp_token_kind_t kind = MP_TOKEN_STRING;
             int n_char = 0;
             if (is_char(lex, 'u')) {
@@ -524,7 +625,25 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
                     kind = MP_TOKEN_BYTES;
                     n_char = 2;
                 }
+                #if MICROPY_PY_FSTRINGS
+                if (is_char_following(lex, 'f')) {
+                    // raw-f-strings unsupported, immediately return (invalid) token.
+                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
+                    break;
+                }
+                #endif
+            }
+            #if MICROPY_PY_FSTRINGS
+            else if (is_char(lex, 'f')) {
+                if (is_char_following(lex, 'r')) {
+                    // raw-f-strings unsupported, immediately return (invalid) token.
+                    lex->tok_kind = MP_TOKEN_FSTRING_RAW;
+                    break;
+                }
+                n_char = 1;
+                is_fstring = true;
             }
+            #endif
 
             // Set or check token kind
             if (lex->tok_kind == MP_TOKEN_END) {
@@ -543,7 +662,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
             }
 
             // Parse the literal
-            parse_string_literal(lex, is_raw);
+            parse_string_literal(lex, is_raw, is_fstring);
 
             // Skip whitespace so we can check if there's another string following
             skip_whitespace(lex, true);
@@ -703,6 +822,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
     lex->num_indent_level = 1;
     lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
     vstr_init(&lex->vstr, 32);
+    #if MICROPY_PY_FSTRINGS
+    vstr_init(&lex->fstring_args, 0);
+    #endif
 
     // store sentinel for first indentation level
     lex->indent_level[0] = 0;
author	Jim Mussared <jim.mussared@gmail.com>	2021-08-13 01:44:08 +1000
committer	Damien George <damien@micropython.org>	2021-08-14 16:58:40 +1000
commit	692d36d779192f32371f7f9daa845b566f26968d (patch)
tree	c3bfe2b4a90df72aad6b6eaac8bb6dac398516d9 /py/lexer.c
parent	162bf3c5d8055a9e9a17461878c9d058066283a5 (diff)