diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py
index 05d0cbd2445c4c..1a98db8c615eac 100644
--- a/Lib/test/test_fstring.py
+++ b/Lib/test/test_fstring.py
@@ -1866,6 +1866,65 @@ def test_gh139516(self):
                 f.write('''def f(a): pass\nf"{f(a=lambda: 'à'\n)}"'''.encode())
             assert_python_ok(script)
 
+    def test_gh141271(self):
+        self.assertEqual(f'''{""" " # not comment """=}''', '""" " # not comment """=\' " # not comment \'')
+
+        self.assertEqual(
+f'''{""" " # not comment
+"""=}''',
+'''""" " # not comment
+"""=\' " # not comment\\n\'''')
+
+        self.assertEqual(
+f'''{"\"" # comment
+=}''',
+'"\\"" \n=\'"\'')
+
+        self.assertEqual(
+f'{ # comment A
+(f'''
+# not comment B
+{ # comment C '
+3 # comment D
+* 2}''', '\n# not comment E\n6')=}',
+" \n(f'''\n# not comment B\n{ \n3 \n* 2}''', '\\n# not comment E\\n6')=('\\n# not comment B\\n6', '\\n# not comment E\\n6')")
+
+        self.assertEqual(
+f'{
+f'{# 1 '
+f"{# 2 "
+None
+=}"
+=}'
+}',
+'''
+f"{
+None
+=}"
+=\'\\nNone\\n=None\'''')
+
+        self.assertEqual(
+f'{
+f'{# 1 '
+f"{# 2 "
+f'''{# 3 '
+f"""{# 4 "
+None
+=}"""
+=}'''
+=}"
+=}'
+}',
+'''
+f"{
+f\'\'\'{
+f"""{
+None
+=}"""
+=}\'\'\'
+=}"
+=\'\\nf\\\'\\\'\\\'{\\nf"""{\\nNone\\n=}"""\\n=}\\\'\\\'\\\'\\n=\\\'\\\\nf"""{\\\\nNone\\\\n=}"""\\\\n=\\\\\\\'\\\\\\\\nNone\\\\\\\\n=None\\\\\\\'\\\'\'''')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-08-22-05-29.gh-issue-141271.E7drWa.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-08-22-05-29.gh-issue-141271.E7drWa.rst
new file mode 100644
index 00000000000000..6ee60f352d3b09
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-08-22-05-29.gh-issue-141271.E7drWa.rst
@@ -0,0 +1 @@
+Fix handling of quotes and nested comments in complex f-string and t-string expressions.
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302c22..3f48f689ba3ad5 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -110,103 +110,161 @@ tok_backup(struct tok_state *tok, int c)
     }
 }
 
-static int
-set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
-    assert(token != NULL);
-    assert(c == '}' || c == ':' || c == '!');
-    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
-
-    if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
-        return 0;
-    }
-    PyObject *res = NULL;
-
-    // Look for a # character outside of string literals
-    int hash_detected = 0;
-    int in_string = 0;
-    char quote_char = 0;
+static Py_ssize_t
+recurse_set_ftstring_expr(tokenizer_mode *tok_mode, char *result, Py_ssize_t *in_pos, Py_ssize_t out_pos)
+{
+    Py_ssize_t i = *in_pos;
+    Py_ssize_t j = out_pos;
+    Py_ssize_t in_pos_start = *in_pos;
+    Py_ssize_t in_pos_end = tok_mode->last_expr_size - tok_mode->last_expr_end;
+    char *last_expr_buffer = tok_mode->last_expr_buffer;
+
+    int curly_depth = 1;    // count braces inside expressions because of sets and dicts
+    int in_string = 0;      // inside a string literal (plain, f- or t-string)
+    int is_string_ft;       // the string we are inside is an f or t-string
+    char quote_char;
+    int is_triple_quote;
+
+    // Process each character
+    while (i < in_pos_end) {
+        char ch = last_expr_buffer[i++];
+
+        if (in_string) {
+            // Skip escaped characters (this also harmlessly skips line continuations)
+            if (ch == '\\') {
+                result[j++] = '\\';
+                if (i < in_pos_end) {
+                    result[j++] = last_expr_buffer[i++];
+                }
+                continue;
+            }
 
-    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
-        char ch = tok_mode->last_expr_buffer[i];
+            // Check for string end quotes
+            if (ch == quote_char) {
+                if (!is_triple_quote || i + 1 >= in_pos_end) {
+                    in_string = 0;
+                }
+                else if (last_expr_buffer[i] == ch && last_expr_buffer[i + 1] == ch) {
+                    in_string = 0;
+                    result[j++] = ch;
+                    result[j++] = ch;
+                    i += 2;
+                }
+                result[j++] = ch;
+                continue;
+            }
 
-        // Skip escaped characters
-        if (ch == '\\') {
-            i++;
-            continue;
+            // If inside an f or t-string then check for expressions
+            if (ch == '{') {
+                result[j++] = '{';
+                if (is_string_ft && i < in_pos_end) {
+                    // A double '{{' doesn't start an expression
+                    if (last_expr_buffer[i] == '{') {
+                        result[j++] = '{';
+                        i++;
+                    }
+                    else {
+                        j = recurse_set_ftstring_expr(tok_mode, result, &i, j);
+                    }
+                }
+                continue;
+            }
         }
+        // In the f/t-string expression, outside of an actual string part
+        else {
+            // Skip comments
+            if (ch == '#') {
+                while (i < in_pos_end) {
+                    if (last_expr_buffer[i++] == '\n') {
+                        result[j++] = '\n';
+                        break;
+                    }
+                }
+                continue;
+            }
 
-        // Handle quotes
-        if (ch == '"' || ch == '\'') {
-            // The following if/else block works because there is an odd number
-            // of quotes in STRING tokens and the lexer only ever reaches this
-            // function with valid STRING tokens.
-            // For example: """hello"""
-            // First quote: in_string = 1
-            // Second quote: in_string = 0
-            // Third quote: in_string = 1
-            if (!in_string) {
-                in_string = 1;
+            // Handle string start quotes
+            if (ch == '"' || ch == '\'') {
                 quote_char = ch;
+                in_string = 1;
+                is_string_ft = 0;
+                is_triple_quote = 0;
+
+                // Check preceding characters to see if this is an f or t-string
+                if ((i - 2) >= in_pos_start) {
+                    char ch_prev = last_expr_buffer[i - 2];
+
+                    if (ch_prev == 'f' || ch_prev == 't') {
+                        is_string_ft = 1;
+                    }
+                    // It may be a raw f or t-string, so check the character before that
+                    else if (ch_prev == 'r' && (i - 3) >= in_pos_start) {
+                        char ch_prev_prev = last_expr_buffer[i - 3];
+
+                        if (ch_prev_prev == 'f' || ch_prev_prev == 't') {
+                            is_string_ft = 1;
+                        }
+                    }
+                }
+
+                // Check for triple quotes
+                if (i + 2 <= in_pos_end && last_expr_buffer[i] == ch && last_expr_buffer[i + 1] == ch) {
+                    is_triple_quote = 1;
+                    result[j++] = ch;
+                    result[j++] = ch;
+                    i += 2;
+                }
+                result[j++] = ch;
+
+                continue;
             }
-            else if (ch == quote_char) {
-                in_string = 0;
+
+            // Count nested curlies
+            if (ch == '{') {
+                curly_depth++;
+            }
+            // Check for end of expression curlies
+            else if (ch == '}') {
+                if (!--curly_depth) {
+                    result[j++] = '}';
+                    break;
+                }
             }
-            continue;
         }
 
-        // Check for # outside strings
-        if (ch == '#' && !in_string) {
-            hash_detected = 1;
-            break;
-        }
+        // Copy other chars
+        result[j++] = ch;
+    }
+
+    *in_pos = i;
+    return j;
+}
+
+static int
+set_ftstring_expr(struct tok_state* tok, struct token *token, char c)
+{
+    assert(token != NULL);
+    assert(c == '}' || c == ':' || c == '!');
+    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
+
+    if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
+        return 0;
     }
-    // If we found a # character in the expression, we need to handle comments
-    if (hash_detected) {
+
+    PyObject *res = NULL;
+
+    // If there is a '#' character in the expression, we need to handle possible comments
+    if (memchr(tok_mode->last_expr_buffer, '#', tok_mode->last_expr_size - tok_mode->last_expr_end) != NULL) {
         // Allocate buffer for processed result
         char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char));
         if (!result) {
             return -1;
         }
 
-        Py_ssize_t i = 0;   // Input position
-        Py_ssize_t j = 0;   // Output position
-        in_string = 0;      // Whether we're in a string
-        quote_char = 0;     // Current string quote char
-
-        // Process each character
-        while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
-            char ch = tok_mode->last_expr_buffer[i];
-
-            // Handle string quotes
-            if (ch == '"' || ch == '\'') {
-                // See comment above to understand this part
-                if (!in_string) {
-                    in_string = 1;
-                    quote_char = ch;
-                } else if (ch == quote_char) {
-                    in_string = 0;
-                }
-                result[j++] = ch;
-            }
-            // Skip comments
-            else if (ch == '#' && !in_string) {
-                while (i < tok_mode->last_expr_size - tok_mode->last_expr_end &&
-                       tok_mode->last_expr_buffer[i] != '\n') {
-                    i++;
-                }
-                if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
-                    result[j++] = '\n';
-                }
-            }
-            // Copy other chars
-            else {
-                result[j++] = ch;
-            }
-            i++;
-        }
+        Py_ssize_t in_pos = 0;
+        Py_ssize_t out_pos = recurse_set_ftstring_expr(tok_mode, result, &in_pos, 0);
 
-        result[j] = '\0';  // Null-terminate the result string
-        res = PyUnicode_DecodeUTF8(result, j, NULL);
+        result[out_pos] = '\0';  // Null-terminate the result string
+        res = PyUnicode_DecodeUTF8(result, out_pos, NULL);
         PyMem_Free(result);
     } else {
         res = PyUnicode_DecodeUTF8(