From 70b9712056a4086a6968577c86c4afdfb326458a Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 02:32:01 +0300
Subject: [PATCH 1/7] bpo-40334: Improve various PEG-Parser related stuff

These are all related to @vstinner's review of the PEP 617 PR.

The most important changes include moving `pegen_interface.h` into
`Include/internal` and refactoring some code in `Parser/pegen/pegen.c`.
---
 Include/{ => internal}/pegen_interface.h |  2 -
 Lib/test/test_sys.py                     |  8 +--
 Makefile.pre.in                          |  2 +-
 Modules/_peg_parser.c                    |  2 +-
 PCbuild/pythoncore.vcxproj               |  2 +-
 Parser/pegen/peg_api.c                   |  2 +-
 Parser/pegen/pegen.c                     | 69 ++++++++++++++----------
 Python/pythonrun.c                       |  2 +-
 Python/sysmodule.c                       |  2 +-
 9 files changed, 52 insertions(+), 39 deletions(-)
 rename Include/{ => internal}/pegen_interface.h (96%)

diff --git a/Include/pegen_interface.h b/Include/internal/pegen_interface.h
similarity index 96%
rename from Include/pegen_interface.h
rename to Include/internal/pegen_interface.h
index bf5b29634ac332..baf27e0f7cecc3 100644
--- a/Include/pegen_interface.h
+++ b/Include/internal/pegen_interface.h
@@ -1,4 +1,3 @@
-#ifndef Py_LIMITED_API
 #ifndef Py_PEGENINTERFACE
 #define Py_PEGENINTERFACE
 #ifdef __cplusplus
@@ -29,4 +28,3 @@ PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFileObject(FILE *, PyObject *fi
 }
 #endif
 #endif /* !Py_PEGENINTERFACE*/
-#endif /* !Py_LIMITED_API */
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index bd4ea4794426ca..2a473eb703b51e 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -545,10 +545,10 @@ def __hash__(self):
     def test_sys_flags(self):
         self.assertTrue(sys.flags)
         attrs = ("debug",
-                 "inspect", "interactive", "optimize", "use_peg",
-                 "dont_write_bytecode", "no_user_site", "no_site",
-                 "ignore_environment", "verbose", "bytes_warning", "quiet",
-                 "hash_randomization", "isolated", "dev_mode", "utf8_mode")
+                 "inspect", "interactive", "optimize", "dont_write_bytecode",
+                 "no_user_site", "no_site", "ignore_environment", "verbose",
+                 "bytes_warning", "quiet", "hash_randomization", "isolated",
+                 "dev_mode", "utf8_mode", "use_peg")
         for attr in attrs:
             self.assertTrue(hasattr(sys.flags, attr), attr)
             attr_type = bool if attr == "dev_mode" else int
diff --git a/Makefile.pre.in b/Makefile.pre.in
index b34fa64ff4bdfa..b7181170f60029 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -304,7 +304,7 @@ PEGEN_OBJS= \
 
 
 PEGEN_HEADERS= \
-	$(srcdir)/Include/pegen_interface.h \
+	$(srcdir)/Include/internal/pegen_interface.h \
 	$(srcdir)/Parser/pegen/pegen.h \
 	$(srcdir)/Parser/pegen/parse_string.h
 
diff --git a/Modules/_peg_parser.c b/Modules/_peg_parser.c
index 0a84edcfc00827..cb5f9aa63aea39 100644
--- a/Modules/_peg_parser.c
+++ b/Modules/_peg_parser.c
@@ -1,5 +1,5 @@
 #include <Python.h>
-#include <pegen_interface.h>
+#include "pegen_interface.h"
 
 PyObject *
 _Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds)
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index d795c4d5a7d005..3484f44e961eaf 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -161,6 +161,7 @@
+    <ClInclude Include="..\Include\internal\pegen_interface.h" />
@@ -213,7 +214,6 @@
-    <ClInclude Include="..\Include\pegen_interface.h" />
diff --git a/Parser/pegen/peg_api.c b/Parser/pegen/peg_api.c
index 7c6903cdd93343..c42aa680c8602d 100644
--- a/Parser/pegen/peg_api.c
+++ b/Parser/pegen/peg_api.c
@@ -1,4 +1,4 @@
-#include <pegen_interface.h>
+#include "pegen_interface.h"
 
 #include "../tokenizer.h"
 #include "pegen.h"
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index 47b712f262c46e..ddef37e7eb377e 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -8,6 +8,9 @@
 static int
 init_normalization(Parser *p)
 {
+    if (p->normalize) {
+        return 0;
+    }
     PyObject *m = PyImport_ImportModuleNoBlock("unicodedata");
     if (!m)
     {
@@ -36,7 +39,7 @@ _PyPegen_new_identifier(Parser *p, char *n)
     if (!PyUnicode_IS_ASCII(id)) {
         PyObject *id2;
-        if (!p->normalize && !init_normalization(p))
+        if (!init_normalization(p))
         {
             Py_DECREF(id);
             goto error;
@@ -88,6 +91,9 @@ static inline Py_ssize_t
 byte_offset_to_character_offset(PyObject *line, int col_offset)
 {
     const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return 0;
+    }
     PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL);
     if (!text) {
         return 0;
     }
@@ -171,9 +177,10 @@ _PyPegen_get_expr_name(expr_ty e)
     }
 }
 
-static void
+static int
 raise_decode_error(Parser *p)
 {
+    assert(PyErr_Occurred());
     const char *errtype = NULL;
     if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
         errtype = "unicode error";
@@ -197,6 +204,8 @@ raise_decode_error(Parser *p)
         Py_XDECREF(value);
         Py_XDECREF(tback);
     }
+
+    return -1;
 }
 
 static void
@@ -211,8 +220,7 @@ raise_tokenizer_init_error(PyObject *filename)
     PyErr_Fetch(&type, &value, &tback);
     errstr = PyObject_Str(value);
 
-    Py_INCREF(Py_None);
-    PyObject *tmp = Py_BuildValue("(OiiN)", filename, 0, -1, Py_None);
+    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
     if (!tmp) {
         goto error;
     }
@@ -337,9 +345,6 @@ tokenizer_error(Parser *p)
             errtype = PyExc_IndentationError;
             msg = "too many levels of indentation";
             break;
-        case E_DECODE:
-            raise_decode_error(p);
-            return -1;
         case E_LINECONT:
            msg = "unexpected character after line continuation character";
            break;
@@ -513,7 +518,12 @@ _PyPegen_fill_token(Parser *p)
     const char *start, *end;
     int type = PyTokenizer_Get(p->tok, &start, &end);
     if (type == ERRORTOKEN) {
-        return tokenizer_error(p);
+        if (p->tok->done == E_DECODE) {
+            return raise_decode_error(p);
+        }
+        else {
+            return tokenizer_error(p);
+        }
     }
     if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
         type = NEWLINE; /* Add an extra newline */
@@ -530,13 +540,21 @@ _PyPegen_fill_token(Parser *p)
 
     if (p->fill == p->size) {
         int newsize = p->size * 2;
-        p->tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
-        if (p->tokens == NULL) {
-            PyErr_Format(PyExc_MemoryError, "Realloc tokens failed");
+        Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
+        if (new_tokens == NULL) {
+            PyMem_Free(p->tokens);
+            p->tokens = NULL;
             return -1;
         }
+        else {
+            p->tokens = new_tokens;
+        }
         for (int i = p->size; i < newsize; i++) {
             p->tokens[i] = PyMem_Malloc(sizeof(Token));
+            if (p->tokens[i] == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             memset(p->tokens[i], '\0', sizeof(Token));
         }
         p->size = newsize;
@@ -566,8 +584,6 @@ _PyPegen_fill_token(Parser *p)
     t->end_lineno = p->starting_lineno + end_lineno;
     t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;
 
-    // if (p->fill % 100 == 0) fprintf(stderr, "Filled at %d: %s \"%s\"\n", p->fill,
-    // token_name(type), PyBytes_AsString(t->bytes));
     p->fill += 1;
     return 0;
 }
@@ -632,11 +648,9 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres)
             }
             p->mark = m->mark;
             *(void **)(pres) = m->node;
-            // fprintf(stderr, "%d < %d: memoized!\n", p->mark, p->fill);
             return 1;
         }
     }
-    // fprintf(stderr, "%d < %d: not memoized\n", p->mark, p->fill);
     return 0;
 }
@@ -678,13 +692,9 @@ _PyPegen_expect_token(Parser *p, int type)
     }
     Token *t = p->tokens[p->mark];
     if (t->type != type) {
-        // fprintf(stderr, "No %s at %d\n", token_name(type), p->mark);
         return NULL;
     }
     p->mark += 1;
-    // fprintf(stderr, "Got %s at %d: %s\n", token_name(type), p->mark,
-    //     PyBytes_AsString(t->bytes));
-
     return t;
 }
@@ -866,10 +876,12 @@
 void
 _PyPegen_Parser_Free(Parser *p)
 {
     Py_XDECREF(p->normalize);
-    for (int i = 0; i < p->size; i++) {
-        PyMem_Free(p->tokens[i]);
+    if (p->tokens) {
+        for (int i = 0; i < p->size; i++) {
+            PyMem_Free(p->tokens[i]);
+        }
+        PyMem_Free(p->tokens);
     }
-    PyMem_Free(p->tokens);
     PyMem_Free(p);
 }
@@ -878,8 +890,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
 {
     Parser *p = PyMem_Malloc(sizeof(Parser));
     if (p == NULL) {
-        PyErr_Format(PyExc_MemoryError, "Out of memory for Parser");
-        return NULL;
+        return (void *) PyErr_NoMemory();
     }
     assert(tok != NULL);
     p->tok = tok;
@@ -888,10 +899,14 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
     p->tokens = PyMem_Malloc(sizeof(Token *));
     if (!p->tokens) {
         PyMem_Free(p);
-        PyErr_Format(PyExc_MemoryError, "Out of memory for tokens");
-        return NULL;
+        return (void *) PyErr_NoMemory();
     }
     p->tokens[0] = PyMem_Malloc(sizeof(Token));
+    if (!p->tokens) {
+        PyMem_Free(p->tokens);
+        PyMem_Free(p);
+        return (void *) PyErr_NoMemory();
+    }
     memset(p->tokens[0], '\0', sizeof(Token));
     p->mark = 0;
     p->fill = 0;
@@ -1177,7 +1192,7 @@ _PyPegen_seq_count_dots(asdl_seq *seq)
                 number_of_dots += 1;
                 break;
             default:
-                assert(current_expr->type == ELLIPSIS || current_expr->type == DOT);
+                Py_UNREACHABLE();
         }
     }
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index 6199f0c66104a3..b5bba07b7487cc 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -29,7 +29,7 @@
 #include "ast.h"               // PyAST_FromNodeObject()
 #include "marshal.h"           // PyMarshal_ReadLongFromFile()
 
-#include <pegen_interface.h>   // PyPegen_ASTFrom*
+#include "pegen_interface.h"   // PyPegen_ASTFrom*
 
 #ifdef MS_WINDOWS
 #  include "malloc.h"          // alloca()
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index cf3ddff44d19e8..e5f20a70413bca 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2427,7 +2427,6 @@ static PyStructSequence_Field flags_fields[] = {
     {"inspect", "-i"},
     {"interactive", "-i"},
     {"optimize", "-O or -OO"},
-    {"use_peg", "-p old or -p new"},
     {"dont_write_bytecode", "-B"},
     {"no_user_site", "-s"},
     {"no_site", "-S"},
@@ -2441,6 +2440,7 @@ static PyStructSequence_Field flags_fields[] = {
     {"isolated", "-I"},
     {"dev_mode", "-X dev"},
     {"utf8_mode", "-X utf8"},
+    {"use_peg", "-X oldparser"},
     {0}
 };

From 68a81f1a500178de0602eec789cfe988db1e2e22 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 03:32:26 +0300
Subject: [PATCH 2/7] Revert changes that conflict with another PR

---
 Lib/test/test_sys.py | 8 ++++----
 Python/sysmodule.c   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index 2a473eb703b51e..bd4ea4794426ca 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -545,10 +545,10 @@ def __hash__(self):
     def test_sys_flags(self):
         self.assertTrue(sys.flags)
         attrs = ("debug",
-                 "inspect", "interactive", "optimize", "dont_write_bytecode",
-                 "no_user_site", "no_site", "ignore_environment", "verbose",
-                 "bytes_warning", "quiet", "hash_randomization", "isolated",
-                 "dev_mode", "utf8_mode", "use_peg")
+                 "inspect", "interactive", "optimize", "use_peg",
+                 "dont_write_bytecode", "no_user_site", "no_site",
+                 "ignore_environment", "verbose", "bytes_warning", "quiet",
+                 "hash_randomization", "isolated", "dev_mode", "utf8_mode")
         for attr in attrs:
             self.assertTrue(hasattr(sys.flags, attr), attr)
             attr_type = bool if attr == "dev_mode" else int
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index e5f20a70413bca..cf3ddff44d19e8 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2427,6 +2427,7 @@ static PyStructSequence_Field flags_fields[] = {
     {"inspect", "-i"},
     {"interactive", "-i"},
     {"optimize", "-O or -OO"},
+    {"use_peg", "-p old or -p new"},
    {"dont_write_bytecode", "-B"},
    {"no_user_site", "-s"},
    {"no_site", "-S"},
@@ -2440,7 +2441,6 @@ static PyStructSequence_Field flags_fields[] = {
     {"isolated", "-I"},
     {"dev_mode", "-X dev"},
     {"utf8_mode", "-X utf8"},
-    {"use_peg", "-X oldparser"},
     {0}
 };

From f020c0ae67542f11e525c84bb51c48e815ae9b7d Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 03:44:55 +0300
Subject: [PATCH 3/7] Address feedback

---
 Include/internal/pegen_interface.h |  4 ++++
 Parser/pegen/pegen.c               | 13 ++++++-------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Include/internal/pegen_interface.h b/Include/internal/pegen_interface.h
index baf27e0f7cecc3..d8621c1a889274 100644
--- a/Include/internal/pegen_interface.h
+++ b/Include/internal/pegen_interface.h
@@ -4,6 +4,10 @@ extern "C" {
 #endif
 
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
 #include "Python.h"
 #include "Python-ast.h"
 
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index ddef37e7eb377e..0970529f025424 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -542,8 +542,7 @@ _PyPegen_fill_token(Parser *p)
         int newsize = p->size * 2;
         Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
         if (new_tokens == NULL) {
-            PyMem_Free(p->tokens);
-            p->tokens = NULL;
+            PyErr_NoMemory();
             return -1;
         }
         else {
             p->tokens = new_tokens;
@@ -630,6 +629,7 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres)
 {
     if (p->mark == p->fill) {
         if (_PyPegen_fill_token(p) < 0) {
+            p->error_indicator = 1;
             return -1;
         }
     }
@@ -687,6 +687,7 @@ _PyPegen_expect_token(Parser *p, int type)
 {
     if (p->mark == p->fill) {
         if (_PyPegen_fill_token(p) < 0) {
+            p->error_indicator = 1;
             return NULL;
         }
     }
@@ -876,12 +877,10 @@
 void
 _PyPegen_Parser_Free(Parser *p)
 {
     Py_XDECREF(p->normalize);
-    if (p->tokens) {
-        for (int i = 0; i < p->size; i++) {
-            PyMem_Free(p->tokens[i]);
-        }
-        PyMem_Free(p->tokens);
+    for (int i = 0; i < p->size; i++) {
+        PyMem_Free(p->tokens[i]);
     }
+    PyMem_Free(p->tokens);
     PyMem_Free(p);
 }

From cb42515360bbc13ec914f95b153b9d1bb41b3255 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 03:48:34 +0300
Subject: [PATCH 4/7] Set p->size when memory allocation for a Token fails

---
 Parser/pegen/pegen.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index 0970529f025424..0a0f2a74e31108 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -551,6 +551,7 @@ _PyPegen_fill_token(Parser *p)
         for (int i = p->size; i < newsize; i++) {
             p->tokens[i] = PyMem_Malloc(sizeof(Token));
             if (p->tokens[i] == NULL) {
+                p->size = i - 1; // Needed, in order to cleanup correctly after parser fails
                 PyErr_NoMemory();
                 return -1;
             }

From 39cf86b7baf7730bd503c21224ced3aeb3725bb2 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 04:06:47 +0300
Subject: [PATCH 5/7] Fix init_normalization

---
 Parser/pegen/pegen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index 0a0f2a74e31108..c12b8969f281f2 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -9,7 +9,7 @@ static int
 init_normalization(Parser *p)
 {
     if (p->normalize) {
-        return 0;
+        return 1;
     }
     PyObject *m = PyImport_ImportModuleNoBlock("unicodedata");
     if (!m)

From 62ae3f9b24f3279ade99e4a1ba6df5b9fbf1dd14 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 14:20:54 +0300
Subject: [PATCH 6/7] Fix refcount issue in raise_tokenizer_init_error

---
 Parser/pegen/pegen.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index c12b8969f281f2..be91c39236f6e6 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -216,26 +216,33 @@ raise_tokenizer_init_error(PyObject *filename)
         || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
         return;
     }
-    PyObject *type, *value, *tback, *errstr;
+    PyObject *errstr = NULL;
+    PyObject *tuple = NULL;
+    PyObject *type, *value, *tback;
     PyErr_Fetch(&type, &value, &tback);
     errstr = PyObject_Str(value);
+    if (!errstr) {
+        goto error;
+    }
 
     PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
     if (!tmp) {
         goto error;
     }
 
-    value = PyTuple_Pack(2, errstr, tmp);
+    tuple = PyTuple_Pack(2, errstr, tmp);
     Py_DECREF(tmp);
     if (!value) {
         goto error;
     }
-    PyErr_SetObject(PyExc_SyntaxError, value);
+    PyErr_SetObject(PyExc_SyntaxError, tuple);
 
 error:
     Py_XDECREF(type);
     Py_XDECREF(value);
     Py_XDECREF(tback);
+    Py_XDECREF(errstr);
+    Py_XDECREF(tuple);
 }

From 94c456c57215d03de28d49f131b842a6afa7e244 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 23 Apr 2020 18:15:00 +0300
Subject: [PATCH 7/7] Respond to latest feedback

---
 Parser/pegen/pegen.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index be91c39236f6e6..84ed851c6be484 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -558,7 +558,7 @@ _PyPegen_fill_token(Parser *p)
         for (int i = p->size; i < newsize; i++) {
             p->tokens[i] = PyMem_Malloc(sizeof(Token));
             if (p->tokens[i] == NULL) {
-                p->size = i - 1; // Needed, in order to cleanup correctly after parser fails
+                p->size = i; // Needed, in order to cleanup correctly after parser fails
                 PyErr_NoMemory();
                 return -1;
             }
@@ -897,7 +897,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
 {
     Parser *p = PyMem_Malloc(sizeof(Parser));
     if (p == NULL) {
-        return (void *) PyErr_NoMemory();
+        return (Parser *) PyErr_NoMemory();
     }
     assert(tok != NULL);
     p->tok = tok;
@@ -906,13 +906,13 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
     p->tokens = PyMem_Malloc(sizeof(Token *));
     if (!p->tokens) {
         PyMem_Free(p);
-        return (void *) PyErr_NoMemory();
+        return (Parser *) PyErr_NoMemory();
     }
     p->tokens[0] = PyMem_Malloc(sizeof(Token));
     if (!p->tokens) {
         PyMem_Free(p->tokens);
         PyMem_Free(p);
-        return (void *) PyErr_NoMemory();
+        return (Parser *) PyErr_NoMemory();
     }
     memset(p->tokens[0], '\0', sizeof(Token));
     p->mark = 0;