Skip to content

Commit dbc9efb

Browse files
Handle lines with multi-byte unicode characters properly (#12)
* Handle lines with multi-byte unicode characters properly
* Use Py_XDECREF instead of Py_DECREF

Co-authored-by: Batuhan Taskaya <[email protected]>
Co-authored-by: Batuhan Taskaya <[email protected]>
1 parent 67438ed commit dbc9efb

File tree

7 files changed

+67
-12
lines changed

7 files changed

+67
-12
lines changed

Include/cpython/traceback.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ typedef struct _traceback {
1010
int tb_lineno;
1111
} PyTracebackObject;
1212

13-
PyAPI_FUNC(int) _Py_DisplaySourceLine(PyObject *, PyObject *, int, int, int *);
13+
PyAPI_FUNC(int) _Py_DisplaySourceLine(PyObject *, PyObject *, int, int, int *, PyObject **);
1414
PyAPI_FUNC(void) _PyTraceback_Add(const char *, const char *, int);

Lib/idlelib/idle_test/test_run.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ def __eq__(self, other):
3333
run.print_exception()
3434

3535
tb = output.getvalue().strip().splitlines()
36-
self.assertEqual(11, len(tb))
37-
self.assertIn('UnhashableException: ex2', tb[3])
38-
self.assertIn('UnhashableException: ex1', tb[10])
36+
self.assertEqual(13, len(tb))
37+
self.assertIn('UnhashableException: ex2', tb[4])
38+
self.assertIn('UnhashableException: ex1', tb[12])
3939

4040
data = (('1/0', ZeroDivisionError, "division by zero\n"),
4141
('abc', NameError, "name 'abc' is not defined. "

Lib/test/test_doctest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2835,6 +2835,7 @@ def test_unicode(): """
28352835
exec(compile(example.source, filename, "single",
28362836
File "<doctest foo-bär@baz[0]>", line 1, in <module>
28372837
raise Exception('clé')
2838+
^^^^^^^^^^^^^^^^^^^^^^
28382839
Exception: clé
28392840
TestResults(failed=1, attempted=1)
28402841
"""

Lib/test/test_zipimport.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,10 @@ def doTraceback(self, module):
716716

717717
s = io.StringIO()
718718
print_tb(tb, 1, s)
719-
self.assertTrue(s.getvalue().endswith(raise_src))
719+
self.assertTrue(s.getvalue().endswith(
720+
' def do_raise(): raise TypeError\n'
721+
' ^^^^^^^^^^^^^^^\n'
722+
))
720723
else:
721724
raise AssertionError("This ought to be impossible")
722725

Lib/traceback.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,13 @@ def _walk_tb_with_full_positions(tb):
342342
# Internal version of walk_tb that yields full code positions including
343343
# end line and column information.
344344
while tb is not None:
345-
yield tb.tb_frame, _get_code_position(tb.tb_frame.f_code, tb.tb_lasti)
345+
# TODO: In some situations `tb_lasti` is not set properly in the
346+
# traceback, this works around the problem currently but we should
347+
# look more closely at it.
348+
if tb.tb_lasti < 0:
349+
yield tb.tb_frame, (tb.tb_lineno, None, None, None)
350+
else:
351+
yield tb.tb_frame, _get_code_position(tb.tb_frame.f_code, tb.tb_lasti)
346352
tb = tb.tb_next
347353

348354

@@ -481,9 +487,12 @@ def format(self):
481487

482488
stripped_characters = len(frame._original_line) - len(frame.line.lstrip())
483489
if frame.end_lineno == frame.lineno and frame.end_colno != 0:
490+
colno = _byte_offset_to_character_offset(frame._original_line, frame.colno)
491+
end_colno = _byte_offset_to_character_offset(frame._original_line, frame.end_colno)
492+
484493
row.append(' ')
485-
row.append(' ' * (frame.colno - stripped_characters))
486-
row.append('^' * (frame.end_colno - frame.colno))
494+
row.append(' ' * (colno - stripped_characters))
495+
row.append('^' * (end_colno - colno))
487496
row.append('\n')
488497

489498
if frame.locals:
@@ -499,6 +508,14 @@ def format(self):
499508
return result
500509

501510

511+
def _byte_offset_to_character_offset(str, offset):
512+
as_utf8 = str.encode('utf-8')
513+
if offset > len(as_utf8):
514+
offset = len(as_utf8)
515+
516+
return len(as_utf8[:offset].decode("utf-8"))
517+
518+
502519
class TracebackException:
503520
"""An exception ready for rendering.
504521

Python/_warnings.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ show_warning(PyObject *filename, int lineno, PyObject *text,
543543
PyFile_WriteString("\n", f_stderr);
544544
}
545545
else {
546-
_Py_DisplaySourceLine(f_stderr, filename, lineno, 2, NULL);
546+
_Py_DisplaySourceLine(f_stderr, filename, lineno, 2, NULL, NULL);
547547
}
548548

549549
error:

Python/traceback.c

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ _Py_FindSourceFile(PyObject *filename, char* namebuf, size_t namelen, PyObject *
369369
}
370370

371371
int
372-
_Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, int *truncation)
372+
_Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, int *truncation, PyObject **line)
373373
{
374374
int err = 0;
375375
int fd;
@@ -460,6 +460,11 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, i
460460
return err;
461461
}
462462

463+
if (line) {
464+
Py_INCREF(lineobj);
465+
*line = lineobj;
466+
}
467+
463468
/* remove the indentation of the line */
464469
kind = PyUnicode_KIND(lineobj);
465470
data = PyUnicode_DATA(lineobj);
@@ -504,6 +509,29 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, i
504509
return err;
505510
}
506511

512+
/* Convert a UTF-8 byte offset within `line` into a character offset.
 *
 * `line` is the source line as a str object (may be NULL when no line was
 * captured); `offset` is a byte offset into its UTF-8 encoding.
 *
 * Returns `offset` unchanged when it is <= 0, otherwise the number of
 * characters covered by the first `offset` bytes (clamped to the line
 * length).  On any failure 0 is returned and the pending exception is
 * cleared: the return value cannot signal an error, so an exception must
 * not be left set behind a normal-looking result.
 */
static int byte_to_character_offset_in_line(PyObject* line, int offset) {
    // Adapted from pegen.c.
    if (offset <= 0) {
        return offset;
    }
    if (line == NULL) {
        // No source line available; nothing to measure against.
        return 0;
    }
    Py_ssize_t len;
    const char *str = PyUnicode_AsUTF8AndSize(line, &len);
    if (!str) {
        PyErr_Clear();
        return 0;
    }
    if (offset > len) {
        // len < offset <= INT_MAX here, so the narrowing is safe.
        offset = (int)len;
    }

    // "replace" keeps a byte offset that splits a multi-byte sequence from
    // failing the decode.
    PyObject *text = PyUnicode_DecodeUTF8(str, offset, "replace");
    if (!text) {
        PyErr_Clear();
        return 0;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
    Py_DECREF(text);
    // The decoded prefix has at most `offset` characters, so it fits in int.
    return (int)size;
}
534+
507535
#define _TRACEBACK_SOURCE_LINE_INDENT 4
508536

509537
// TODO: Pick up filename and other stuff from the tb argument
@@ -525,15 +553,21 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
525553
if (err != 0)
526554
return err;
527555
int truncation = _TRACEBACK_SOURCE_LINE_INDENT;
556+
PyObject* source_line = NULL;
528557
/* ignore errors since we can't report them, can we? */
529-
if (!_Py_DisplaySourceLine(f, filename, lineno, _TRACEBACK_SOURCE_LINE_INDENT, &truncation)) {
558+
if (!_Py_DisplaySourceLine(f, filename, lineno, _TRACEBACK_SOURCE_LINE_INDENT, &truncation, &source_line)) {
530559
int code_offset = tb->tb_lasti;
531560
if (PyCode_Addr2Line(frame->f_code, code_offset) != PyCode_Addr2EndLine(frame->f_code, code_offset)) {
532561
goto done;
533562
}
534563

535564
int start_offset = PyCode_Addr2Offset(frame->f_code, code_offset);
536565
int end_offset = PyCode_Addr2EndOffset(frame->f_code, code_offset);
566+
567+
start_offset = byte_to_character_offset_in_line(source_line, start_offset);
568+
// Not sure why a `+ 1` is needed here for the end_offset.
569+
end_offset = byte_to_character_offset_in_line(source_line, end_offset) + 1;
570+
537571
if (start_offset <= 0 || end_offset < 0) {
538572
goto done;
539573
}
@@ -563,6 +597,7 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
563597
}
564598

565599
done:
600+
Py_XDECREF(source_line);
566601
return err;
567602
}
568603

@@ -966,4 +1001,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,
9661001

9671002
return NULL;
9681003
}
969-

0 commit comments

Comments
 (0)