Skip to content

Commit 74ea6b5

Browse files
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
1 parent f3a5b7a commit 74ea6b5

File tree

10 files changed

+90
-43
lines changed

10 files changed

+90
-43
lines changed

Include/cpython/unicodeobject.h

+2
Original file line number | Diff line number | Diff line change
@@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
12221222
and where the hash values are equal (i.e. a very probable match) */
12231223
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
12241224

1225+
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
1226+
12251227
#ifdef __cplusplus
12261228
}
12271229
#endif

Include/errcode.h

-1
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,6 @@ extern "C" {
2929
#define E_EOFS 23 /* EOF in triple-quoted string */
3030
#define E_EOLS 24 /* EOL in single-quoted string */
3131
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
32-
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
3332
#define E_BADSINGLE 27 /* Ill-formed single statement input */
3433

3534
#ifdef __cplusplus

Lib/test/test_fstring.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -583,7 +583,7 @@ def test_missing_expression(self):
583583
])
584584

585585
# Different error message is raised for other whitespace characters.
586-
self.assertAllRaise(SyntaxError, 'invalid character in identifier',
586+
self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
587587
["f'''{\xa0}'''",
588588
"\xa0",
589589
])

Lib/test/test_source_encoding.py

+3
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,9 @@ def test_issue7820(self):
5757
# one byte in common with the UTF-16-LE BOM
5858
self.assertRaises(SyntaxError, eval, b'\xff\x20')
5959

60+
# one byte in common with the UTF-8 BOM
61+
self.assertRaises(SyntaxError, eval, b'\xef\x20')
62+
6063
# two bytes in common with the UTF-8 BOM
6164
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
6265

Lib/test/test_unicode_identifiers.py

+5-3
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,11 @@ def test_non_bmp_normalized(self):
2020
def test_invalid(self):
2121
try:
2222
from test import badsyntax_3131
23-
except SyntaxError as s:
24-
self.assertEqual(str(s),
25-
"invalid character in identifier (badsyntax_3131.py, line 2)")
23+
except SyntaxError as err:
24+
self.assertEqual(str(err),
25+
"invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
26+
self.assertEqual(err.lineno, 2)
27+
self.assertEqual(err.offset, 1)
2628
else:
2729
self.fail("expected exception didn't occur")
2830

Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Improved syntax errors for invalid characters in source code.

Objects/unicodeobject.c

+41-23
Original file line number | Diff line number | Diff line change
@@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
1230912309
Py_RETURN_TRUE;
1231012310
}
1231112311

12312-
int
12313-
PyUnicode_IsIdentifier(PyObject *self)
12312+
Py_ssize_t
12313+
_PyUnicode_ScanIdentifier(PyObject *self)
1231412314
{
1231512315
Py_ssize_t i;
12316-
int ready = PyUnicode_IS_READY(self);
12316+
if (PyUnicode_READY(self) == -1)
12317+
return -1;
1231712318

12318-
Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12319+
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
1231912320
if (len == 0) {
1232012321
/* an empty string is not a valid identifier */
1232112322
return 0;
1232212323
}
1232312324

12324-
int kind = 0;
12325-
const void *data = NULL;
12326-
const wchar_t *wstr = NULL;
12327-
Py_UCS4 ch;
12328-
if (ready) {
12329-
kind = PyUnicode_KIND(self);
12330-
data = PyUnicode_DATA(self);
12331-
ch = PyUnicode_READ(kind, data, 0);
12332-
}
12333-
else {
12334-
wstr = _PyUnicode_WSTR(self);
12335-
ch = wstr[0];
12336-
}
12325+
int kind = PyUnicode_KIND(self);
12326+
const void *data = PyUnicode_DATA(self);
12327+
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
1233712328
/* PEP 3131 says that the first character must be in
1233812329
XID_Start and subsequent characters in XID_Continue,
1233912330
and for the ASCII range, the 2.x rules apply (i.e
@@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
1234712338
}
1234812339

1234912340
for (i = 1; i < len; i++) {
12350-
if (ready) {
12351-
ch = PyUnicode_READ(kind, data, i);
12341+
ch = PyUnicode_READ(kind, data, i);
12342+
if (!_PyUnicode_IsXidContinue(ch)) {
12343+
return i;
1235212344
}
12353-
else {
12354-
ch = wstr[i];
12345+
}
12346+
return i;
12347+
}
12348+
12349+
int
12350+
PyUnicode_IsIdentifier(PyObject *self)
12351+
{
12352+
if (PyUnicode_IS_READY(self)) {
12353+
Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12354+
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12355+
/* an empty string is not a valid identifier */
12356+
return len && i == len;
12357+
}
12358+
else {
12359+
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
12360+
if (len == 0) {
12361+
/* an empty string is not a valid identifier */
12362+
return 0;
1235512363
}
12356-
if (!_PyUnicode_IsXidContinue(ch)) {
12364+
12365+
const wchar_t *wstr = _PyUnicode_WSTR(self);
12366+
Py_UCS4 ch = wstr[0];
12367+
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
1235712368
return 0;
1235812369
}
12370+
12371+
for (i = 1; i < len; i++) {
12372+
ch = wstr[i];
12373+
if (!_PyUnicode_IsXidContinue(ch)) {
12374+
return 0;
12375+
}
12376+
}
12377+
return 1;
1235912378
}
12360-
return 1;
1236112379
}
1236212380

1236312381
/*[clinic input]

Parser/pegen/pegen.c

-3
Original file line number | Diff line number | Diff line change
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
337337
case E_TOKEN:
338338
msg = "invalid token";
339339
break;
340-
case E_IDENTIFIER:
341-
msg = "invalid character in identifier";
342-
break;
343340
case E_EOFS:
344341
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
345342
return -1;

Parser/tokenizer.c

+37-9
Original file line number | Diff line number | Diff line change
@@ -1101,25 +1101,53 @@ static int
11011101
verify_identifier(struct tok_state *tok)
11021102
{
11031103
PyObject *s;
1104-
int result;
11051104
if (tok->decoding_erred)
11061105
return 0;
11071106
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
11081107
if (s == NULL) {
11091108
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1110-
PyErr_Clear();
1111-
tok->done = E_IDENTIFIER;
1112-
} else {
1109+
tok->done = E_DECODE;
1110+
}
1111+
else {
11131112
tok->done = E_ERROR;
11141113
}
11151114
return 0;
11161115
}
1117-
result = PyUnicode_IsIdentifier(s);
1118-
Py_DECREF(s);
1119-
if (result == 0) {
1120-
tok->done = E_IDENTIFIER;
1116+
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1117+
if (invalid < 0) {
1118+
Py_DECREF(s);
1119+
tok->done = E_ERROR;
1120+
return 0;
11211121
}
1122-
return result;
1122+
assert(PyUnicode_GET_LENGTH(s) > 0);
1123+
if (invalid < PyUnicode_GET_LENGTH(s)) {
1124+
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1125+
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1126+
/* Determine the offset in UTF-8 encoded input */
1127+
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1128+
if (s != NULL) {
1129+
Py_SETREF(s, PyUnicode_AsUTF8String(s));
1130+
}
1131+
if (s == NULL) {
1132+
tok->done = E_ERROR;
1133+
return 0;
1134+
}
1135+
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1136+
}
1137+
Py_DECREF(s);
1138+
// PyUnicode_FromFormatV() does not support %X
1139+
char hex[9];
1140+
snprintf(hex, sizeof(hex), "%04X", ch);
1141+
if (Py_UNICODE_ISPRINTABLE(ch)) {
1142+
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1143+
}
1144+
else {
1145+
syntaxerror(tok, "invalid non-printable character U+%s", hex);
1146+
}
1147+
return 0;
1148+
}
1149+
Py_DECREF(s);
1150+
return 1;
11231151
}
11241152

11251153
static int

Python/pythonrun.c

-3
Original file line number | Diff line number | Diff line change
@@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
16031603
msg = "unexpected character after line continuation character";
16041604
break;
16051605

1606-
case E_IDENTIFIER:
1607-
msg = "invalid character in identifier";
1608-
break;
16091606
case E_BADSINGLE:
16101607
msg = "multiple statements found while compiling a single statement";
16111608
break;

0 commit comments

Comments
 (0)