Handle lines with multi-byte unicode characters properly (#12)

ammaraskar · isidentical · web-flow · commit dbc9efb93b3a · 2021-06-29T15:46:57.000+03:00
* Handle lines with multi-byte unicode characters properly

* Use Py_XDECREF instead of Py_DECREF

Co-authored-by: Batuhan Taskaya <isidentical@gmail.com>

Co-authored-by: Batuhan Taskaya <isidentical@gmail.com>
diff --git a/Include/cpython/traceback.h b/Include/cpython/traceback.h
@@ -10,5 +10,5 @@ typedef struct _traceback {
     int tb_lineno;
 } PyTracebackObject;
 
-PyAPI_FUNC(int) _Py_DisplaySourceLine(PyObject *, PyObject *, int, int, int *);
+PyAPI_FUNC(int) _Py_DisplaySourceLine(PyObject *, PyObject *, int, int, int *, PyObject **);
 PyAPI_FUNC(void) _PyTraceback_Add(const char *, const char *, int);
diff --git a/Lib/idlelib/idle_test/test_run.py b/Lib/idlelib/idle_test/test_run.py
@@ -33,9 +33,9 @@ def __eq__(self, other):
                         run.print_exception()
 
         tb = output.getvalue().strip().splitlines()
-        self.assertEqual(11, len(tb))
-        self.assertIn('UnhashableException: ex2', tb[3])
-        self.assertIn('UnhashableException: ex1', tb[10])
+        self.assertEqual(13, len(tb))
+        self.assertIn('UnhashableException: ex2', tb[4])
+        self.assertIn('UnhashableException: ex1', tb[12])
 
     data = (('1/0', ZeroDivisionError, "division by zero\n"),
             ('abc', NameError, "name 'abc' is not defined. "
diff --git a/Lib/test/test_doctest.py b/Lib/test/test_doctest.py
@@ -2835,6 +2835,7 @@ def test_unicode(): """
             exec(compile(example.source, filename, "single",
           File "<doctest foo-bär@baz[0]>", line 1, in <module>
             raise Exception('clé')
+            ^^^^^^^^^^^^^^^^^^^^^^
         Exception: clé
     TestResults(failed=1, attempted=1)
     """
diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py
@@ -716,7 +716,10 @@ def doTraceback(self, module):
 
             s = io.StringIO()
             print_tb(tb, 1, s)
-            self.assertTrue(s.getvalue().endswith(raise_src))
+            self.assertTrue(s.getvalue().endswith(
+                '    def do_raise(): raise TypeError\n'
+                '                    ^^^^^^^^^^^^^^^\n'
+            ))
         else:
             raise AssertionError("This ought to be impossible")
 
diff --git a/Lib/traceback.py b/Lib/traceback.py
@@ -342,7 +342,13 @@ def _walk_tb_with_full_positions(tb):
     # Internal version of walk_tb that yields full code positions including
     # end line and column information.
     while tb is not None:
-        yield tb.tb_frame, _get_code_position(tb.tb_frame.f_code, tb.tb_lasti)
+        # TODO: In some situations `tb_lasti` is not set properly in the
+        # traceback, this works around the problem currently but we should
+        # look more closely at it.
+        if tb.tb_lasti < 0:
+            yield tb.tb_frame, (tb.tb_lineno, None, None, None)
+        else:
+            yield tb.tb_frame, _get_code_position(tb.tb_frame.f_code, tb.tb_lasti)
         tb = tb.tb_next
 
 
@@ -481,9 +487,12 @@ def format(self):
 
                 stripped_characters = len(frame._original_line) - len(frame.line.lstrip())
                 if frame.end_lineno == frame.lineno and frame.end_colno != 0:
+                    colno = _byte_offset_to_character_offset(frame._original_line, frame.colno)
+                    end_colno = _byte_offset_to_character_offset(frame._original_line, frame.end_colno)
+
                     row.append('    ')
-                    row.append(' ' * (frame.colno - stripped_characters))
-                    row.append('^' * (frame.end_colno - frame.colno))
+                    row.append(' ' * (colno - stripped_characters))
+                    row.append('^' * (end_colno - colno))
                     row.append('\n')
 
             if frame.locals:
@@ -499,6 +508,14 @@ def format(self):
         return result
 
 
+def _byte_offset_to_character_offset(str, offset):
+    as_utf8 = str.encode('utf-8')
+    if offset > len(as_utf8):
+        offset = len(as_utf8)
+
+    return len(as_utf8[:offset].decode("utf-8"))
+
+
 class TracebackException:
     """An exception ready for rendering.
 
diff --git a/Python/_warnings.c b/Python/_warnings.c
@@ -543,7 +543,7 @@ show_warning(PyObject *filename, int lineno, PyObject *text,
         PyFile_WriteString("\n", f_stderr);
     }
     else {
-        _Py_DisplaySourceLine(f_stderr, filename, lineno, 2, NULL);
+        _Py_DisplaySourceLine(f_stderr, filename, lineno, 2, NULL, NULL);
     }
 
 error:
diff --git a/Python/traceback.c b/Python/traceback.c
@@ -369,7 +369,7 @@ _Py_FindSourceFile(PyObject *filename, char* namebuf, size_t namelen, PyObject *
 }
 
 int
-_Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, int *truncation)
+_Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, int *truncation, PyObject **line)
 {
     int err = 0;
     int fd;
@@ -460,6 +460,11 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, i
         return err;
     }
 
+    if (line) {
+        Py_INCREF(lineobj);
+        *line = lineobj;
+    }
+
     /* remove the indentation of the line */
     kind = PyUnicode_KIND(lineobj);
     data = PyUnicode_DATA(lineobj);
@@ -504,6 +509,29 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent, i
     return err;
 }
 
+static int byte_to_character_offset_in_line(PyObject* line, int offset) {
+    // Taken pretty much exactly from pegen.c for now.
+    if (offset <= 0) {
+        return offset;
+    }
+    const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return 0;
+    }
+    Py_ssize_t len = strlen(str);
+    if (offset > len) {
+        offset = len;
+    }
+
+    PyObject *text = PyUnicode_DecodeUTF8(str, offset, "replace");
+    if (!text) {
+        return 0;
+    }
+    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
+    Py_DECREF(text);
+    return size;
+}
+
 #define _TRACEBACK_SOURCE_LINE_INDENT 4
 
 // TODO: Pick up filename and other stuff from the tb argument
@@ -525,15 +553,21 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
     if (err != 0)
         return err;
     int truncation = _TRACEBACK_SOURCE_LINE_INDENT;
+    PyObject* source_line = NULL;
     /* ignore errors since we can't report them, can we? */
-    if (!_Py_DisplaySourceLine(f, filename, lineno, _TRACEBACK_SOURCE_LINE_INDENT, &truncation)) {
+    if (!_Py_DisplaySourceLine(f, filename, lineno, _TRACEBACK_SOURCE_LINE_INDENT, &truncation, &source_line)) {
         int code_offset = tb->tb_lasti;
         if (PyCode_Addr2Line(frame->f_code, code_offset) != PyCode_Addr2EndLine(frame->f_code, code_offset)) {
             goto done;
         }
 
         int start_offset = PyCode_Addr2Offset(frame->f_code, code_offset);
         int end_offset = PyCode_Addr2EndOffset(frame->f_code, code_offset);
+
+        start_offset = byte_to_character_offset_in_line(source_line, start_offset);
+        // Not sure why a `+ 1` is needed here for the end_offset.
+        end_offset = byte_to_character_offset_in_line(source_line, end_offset) + 1;
+
         if (start_offset <= 0 || end_offset < 0) {
             goto done;
         }
@@ -563,6 +597,7 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
     }
     
 done:
+    Py_XDECREF(source_line);
     return err;
 }
 
@@ -966,4 +1001,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,
 
     return NULL;
 }
-

Original file line number	Diff line number	Diff line change
`@@ -543,7 +543,7 @@ show_warning(PyObject *filename, int lineno, PyObject *text,`
`543`	`543`	`PyFile_WriteString("\n", f_stderr);`
`544`	`544`	`}`
`545`	`545`	`else {`
`546`		`- _Py_DisplaySourceLine(f_stderr, filename, lineno, 2, NULL);`
	`546`	`+ _Py_DisplaySourceLine(f_stderr, filename, lineno, 2, NULL, NULL);`
`547`	`547`	`}`
`548`	`548`
`549`	`549`	`error:`