From 0fb80398998e9840de04e927b26ba6660a2974dd Mon Sep 17 00:00:00 2001 From: Andre Caron Date: Fri, 1 Apr 2022 17:37:09 -0400 Subject: [PATCH 1/2] Fix encoding used to read our own log files When running Tox on a French Canadian Windows computer under an account with a username that contains diacritics, Tox crashes with a `UnicodeDecodeError` because it tries to read its log files with a hard-coded encoding of `UTF-8`. These log files contain output from `python -m virtualenv` and `python -m pip`, whose output contains references to the username (such as the contents of the `APPDATA` environment variable). The problem is that the subprocesses open log files without any explicit encoding. In that case, the built-in `open()` uses `locale.getpreferredencoding(False)`, which is `"cp1252"` on such a system. A username containing a "Latin Small Letter E with Acute" will be encoded as `\xe9`, which is not valid UTF-8 (the valid UTF-8 sequence is `\xc3\xa9`). One workaround is to run Tox with Python in "UTF-8 Mode". This can be achieved by setting the `PYTHONUTF8=1` environment variable or by invoking Tox as `python -Xutf8 -m tox`. Unfortunately, this is not in the Tox documentation and can be quite confusing. I spent more than two hours troubleshooting this (I develop with Python full time and I'd never heard of the "UTF-8 Mode" before). Of course, we could document the workaround, but it seems better to re-open the log files using the same encoding they were opened with in the first place. With this change, Tox works nicely with or without the "UTF-8 Mode". 
--- src/tox/action.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tox/action.py b/src/tox/action.py index e7f9b77bb..4b8631495 100644 --- a/src/tox/action.py +++ b/src/tox/action.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, unicode_literals +import locale import os import pipes import signal @@ -125,7 +126,13 @@ def popen( exit_code = process.returncode finally: if out_path is not None and out_path.exists(): - lines = out_path.read_text("UTF-8").split("\n") + # Log files of Python sub-processes like `python + # -m virtualenv` are opened as text files without + # specifying an explicit encoding, which means + # they use the locale's preferred encoding. They + # cannot be assumed to be UTF-8. + encoding = locale.getpreferredencoding(False) + lines = out_path.read_text(encoding).split("\n") # first three lines are the action, cwd, and cmd - remove it output = "\n".join(lines[3:]) try: From cd507d5860ef09da463f48b79d1c3e05c0f0ea1a Mon Sep 17 00:00:00 2001 From: Andre Caron Date: Sat, 2 Apr 2022 01:39:47 -0400 Subject: [PATCH 2/2] Only use current code page as a fallback --- src/tox/action.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/tox/action.py b/src/tox/action.py index 4b8631495..2215e4968 100644 --- a/src/tox/action.py +++ b/src/tox/action.py @@ -126,13 +126,17 @@ def popen( exit_code = process.returncode finally: if out_path is not None and out_path.exists(): - # Log files of Python sub-processes like `python - # -m virtualenv` are opened as text files without - # specifying an explicit encoding, which means - # they use the locale's preferred encoding. They - # cannot be assumed to be UTF-8. - encoding = locale.getpreferredencoding(False) - lines = out_path.read_text(encoding).split("\n") + # Output of Python sub-processes like `python -m + # virtualenv` may use some other system-dependent + # encoding when redirected to a file. 
Let's try + # UTF-8 (the common case) and fall back to the + # system encoding if that fails. + try: + encoding = "UTF-8" + lines = out_path.read_text(encoding).split("\n") + except UnicodeDecodeError: + encoding = locale.getpreferredencoding(False) + lines = out_path.read_text(encoding).split("\n") # first three lines are the action, cwd, and cmd - remove it output = "\n".join(lines[3:]) try: