
bpo-40334: Improve various PEG-Parser related stuff #19669


Merged (7 commits, Apr 23, 2020)
Include/pegen_interface.h → Include/internal/pegen_interface.h
@@ -1,10 +1,13 @@
#ifndef Py_LIMITED_API
#ifndef Py_PEGENINTERFACE
#define Py_PEGENINTERFACE
#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

#include "Python.h"
#include "Python-ast.h"

@@ -29,4 +32,3 @@ PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFileObject(FILE *, PyObject *fi
}
#endif
#endif /* !Py_PEGENINTERFACE*/
#endif /* !Py_LIMITED_API */
2 changes: 1 addition & 1 deletion Makefile.pre.in
@@ -304,7 +304,7 @@ PEGEN_OBJS= \


PEGEN_HEADERS= \
$(srcdir)/Include/pegen_interface.h \
$(srcdir)/Include/internal/pegen_interface.h \
$(srcdir)/Parser/pegen/pegen.h \
$(srcdir)/Parser/pegen/parse_string.h

2 changes: 1 addition & 1 deletion Modules/_peg_parser.c
@@ -1,5 +1,5 @@
#include <Python.h>
#include <pegen_interface.h>
#include "pegen_interface.h"

PyObject *
_Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds)
2 changes: 1 addition & 1 deletion PCbuild/pythoncore.vcxproj
@@ -161,6 +161,7 @@
<ClInclude Include="..\Include\graminit.h" />
<ClInclude Include="..\Include\grammar.h" />
<ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pegen_interface.h" />
<ClInclude Include="..\Include\internal\pycore_abstract.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
@@ -213,7 +214,6 @@
<ClInclude Include="..\Include\parsetok.h" />
<ClInclude Include="..\Include\patchlevel.h" />
<ClInclude Include="..\Include\picklebufobject.h" />
<ClInclude Include="..\Include\pegen_interface.h" />
<ClInclude Include="..\Include\pyhash.h" />
<ClInclude Include="..\Include\pyhash.h" />
<ClInclude Include="..\Include\py_curses.h" />
2 changes: 1 addition & 1 deletion Parser/pegen/peg_api.c
@@ -1,4 +1,4 @@
#include <pegen_interface.h>
#include "pegen_interface.h"

#include "../tokenizer.h"
#include "pegen.h"
76 changes: 49 additions & 27 deletions Parser/pegen/pegen.c
@@ -8,6 +8,9 @@
static int
init_normalization(Parser *p)
{
if (p->normalize) {
return 1;
}
PyObject *m = PyImport_ImportModuleNoBlock("unicodedata");
if (!m)
{
@@ -36,7 +39,7 @@ _PyPegen_new_identifier(Parser *p, char *n)
if (!PyUnicode_IS_ASCII(id))
{
PyObject *id2;
if (!p->normalize && !init_normalization(p))
if (!init_normalization(p))
{
Py_DECREF(id);
goto error;
@@ -88,6 +91,9 @@ static inline Py_ssize_t
byte_offset_to_character_offset(PyObject *line, int col_offset)
{
const char *str = PyUnicode_AsUTF8(line);
if (!str) {
return 0;
}
PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL);
if (!text) {
return 0;
@@ -171,9 +177,10 @@ _PyPegen_get_expr_name(expr_ty e)
}
}

static void
static int
raise_decode_error(Parser *p)
{
assert(PyErr_Occurred());
const char *errtype = NULL;
if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
errtype = "unicode error";
@@ -197,6 +204,8 @@ raise_decode_error(Parser *p)
Py_XDECREF(value);
Py_XDECREF(tback);
}

return -1;
}

static void
@@ -207,27 +216,33 @@ raise_tokenizer_init_error(PyObject *filename)
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
return;
}
PyObject *type, *value, *tback, *errstr;
PyObject *errstr = NULL;
PyObject *tuple = NULL;
PyObject *type, *value, *tback;
PyErr_Fetch(&type, &value, &tback);
errstr = PyObject_Str(value);
if (!errstr) {
goto error;
}

Py_INCREF(Py_None);
PyObject *tmp = Py_BuildValue("(OiiN)", filename, 0, -1, Py_None);
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
if (!tmp) {
goto error;
}

value = PyTuple_Pack(2, errstr, tmp);
tuple = PyTuple_Pack(2, errstr, tmp);
Py_DECREF(tmp);
if (!value) {
goto error;
}
PyErr_SetObject(PyExc_SyntaxError, value);
PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
Py_XDECREF(type);
Py_XDECREF(value);
Py_XDECREF(tback);
Py_XDECREF(errstr);
Py_XDECREF(tuple);
}

static inline PyObject *
@@ -337,9 +352,6 @@ tokenizer_error(Parser *p)
errtype = PyExc_IndentationError;
msg = "too many levels of indentation";
break;
case E_DECODE:
raise_decode_error(p);
return -1;
case E_LINECONT:
msg = "unexpected character after line continuation character";
break;
@@ -513,7 +525,12 @@ _PyPegen_fill_token(Parser *p)
const char *start, *end;
int type = PyTokenizer_Get(p->tok, &start, &end);
if (type == ERRORTOKEN) {
return tokenizer_error(p);
if (p->tok->done == E_DECODE) {
return raise_decode_error(p);
}
else {
return tokenizer_error(p);
}
}
if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
type = NEWLINE; /* Add an extra newline */
@@ -530,13 +547,21 @@

if (p->fill == p->size) {
int newsize = p->size * 2;
p->tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
if (p->tokens == NULL) {
PyErr_Format(PyExc_MemoryError, "Realloc tokens failed");
Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
if (new_tokens == NULL) {
PyErr_NoMemory();
return -1;
}
else {
p->tokens = new_tokens;
}
for (int i = p->size; i < newsize; i++) {
p->tokens[i] = PyMem_Malloc(sizeof(Token));
if (p->tokens[i] == NULL) {
p->size = i; // Needed, in order to cleanup correctly after parser fails
PyErr_NoMemory();
return -1;
}
memset(p->tokens[i], '\0', sizeof(Token));
}
p->size = newsize;
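
The reallocation above assigns the result of PyMem_Realloc to a temporary before overwriting p->tokens: if the call fails, the original buffer is still reachable and is not leaked, and p->size records how many slots were actually allocated so cleanup stays correct. A small sketch of the same pattern in isolation (the function and variable names are illustrative, not part of the parser):

    #include <Python.h>

    /* Double the capacity of an array of pointers without leaking the
       old buffer when the reallocation fails. */
    static int
    grow_array(void ***items, Py_ssize_t *capacity)
    {
        Py_ssize_t newsize = *capacity * 2;
        void **tmp = PyMem_Realloc(*items, newsize * sizeof(void *));
        if (tmp == NULL) {
            /* *items still points at the old, valid buffer. */
            PyErr_NoMemory();
            return -1;
        }
        *items = tmp;
        *capacity = newsize;
        return 0;
    }
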
@@ -566,8 +591,6 @@ _PyPegen_fill_token(Parser *p)
t->end_lineno = p->starting_lineno + end_lineno;
t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;

// if (p->fill % 100 == 0) fprintf(stderr, "Filled at %d: %s \"%s\"\n", p->fill,
// token_name(type), PyBytes_AsString(t->bytes));
p->fill += 1;
return 0;
}
@@ -614,6 +637,7 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres)
{
if (p->mark == p->fill) {
if (_PyPegen_fill_token(p) < 0) {
p->error_indicator = 1;
return -1;
}
}
@@ -632,11 +656,9 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres)
}
p->mark = m->mark;
*(void **)(pres) = m->node;
// fprintf(stderr, "%d < %d: memoized!\n", p->mark, p->fill);
return 1;
}
}
// fprintf(stderr, "%d < %d: not memoized\n", p->mark, p->fill);
return 0;
}

@@ -673,18 +695,15 @@ _PyPegen_expect_token(Parser *p, int type)
{
if (p->mark == p->fill) {
if (_PyPegen_fill_token(p) < 0) {
p->error_indicator = 1;
return NULL;
}
}
Token *t = p->tokens[p->mark];
if (t->type != type) {
// fprintf(stderr, "No %s at %d\n", token_name(type), p->mark);
return NULL;
}
p->mark += 1;
// fprintf(stderr, "Got %s at %d: %s\n", token_name(type), p->mark,
// PyBytes_AsString(t->bytes));

return t;
}

@@ -878,8 +897,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
{
Parser *p = PyMem_Malloc(sizeof(Parser));
if (p == NULL) {
PyErr_Format(PyExc_MemoryError, "Out of memory for Parser");
return NULL;
return (Parser *) PyErr_NoMemory();
}
assert(tok != NULL);
p->tok = tok;
Expand All @@ -888,10 +906,14 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
p->tokens = PyMem_Malloc(sizeof(Token *));
if (!p->tokens) {
PyMem_Free(p);
PyErr_Format(PyExc_MemoryError, "Out of memory for tokens");
return NULL;
return (Parser *) PyErr_NoMemory();
}
p->tokens[0] = PyMem_Malloc(sizeof(Token));
if (!p->tokens[0]) {
PyMem_Free(p->tokens);
PyMem_Free(p);
return (Parser *) PyErr_NoMemory();
}
memset(p->tokens[0], '\0', sizeof(Token));
p->mark = 0;
p->fill = 0;
@@ -1177,7 +1199,7 @@ _PyPegen_seq_count_dots(asdl_seq *seq)
number_of_dots += 1;
break;
default:
assert(current_expr->type == ELLIPSIS || current_expr->type == DOT);
Py_UNREACHABLE();
}
}

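
Several allocation failures in this file now report the error with PyErr_NoMemory() instead of a hand-rolled PyErr_Format(PyExc_MemoryError, ...). PyErr_NoMemory() sets MemoryError and returns NULL, so the call can be cast and returned in a single statement. A hedged sketch of the idiom using a made-up struct type:

    #include <Python.h>

    typedef struct { int field; } Example;   /* hypothetical type */

    static Example *
    example_new(void)
    {
        Example *e = PyMem_Malloc(sizeof(Example));
        if (e == NULL) {
            /* Sets MemoryError and returns NULL in one step. */
            return (Example *) PyErr_NoMemory();
        }
        e->field = 0;
        return e;
    }
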
2 changes: 1 addition & 1 deletion Python/pythonrun.c
@@ -29,7 +29,7 @@
#include "ast.h" // PyAST_FromNodeObject()
#include "marshal.h" // PyMarshal_ReadLongFromFile()

#include <pegen_interface.h> // PyPegen_ASTFrom*
#include "pegen_interface.h" // PyPegen_ASTFrom*

#ifdef MS_WINDOWS
# include "malloc.h" // alloca()