Skip to content

Commit 80a8708

Browse files
authored
BUG: validation - during tokenize, use UTF8 encoding on all platforms (#510)
* during tokenize, use UTF8 encoding on all platforms
* add test
* better test
* fix typo in pytest param ID
1 parent b4135ce commit 80a8708

File tree

2 files changed

+48
-2
lines changed

2 files changed

+48
-2
lines changed

numpydoc/tests/test_validate.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import pytest
22
import sys
33
import warnings
4-
from functools import cached_property
4+
from contextlib import nullcontext
5+
from functools import cached_property, partial
56
from inspect import getsourcelines, getsourcefile
67

78
from numpydoc import validate
@@ -85,6 +86,50 @@ def test_extract_ignore_validation_comments(tmp_path, file_contents, expected):
8586
assert validate.extract_ignore_validation_comments(filepath) == expected
8687

8788

89+
@pytest.mark.parametrize(
90+
"assumed_encoding",
91+
(
92+
pytest.param("utf-8", id="utf8_codec"),
93+
pytest.param("cp1252", id="cp1252_codec"),
94+
),
95+
)
96+
@pytest.mark.parametrize(
97+
("classname", "actual_encoding"),
98+
(
99+
pytest.param("MÿClass", "cp1252", id="cp1252_file"),
100+
pytest.param("My\u0081Class", "utf-8", id="utf8_file"),
101+
),
102+
)
103+
def test_encodings(tmp_path, classname, actual_encoding, assumed_encoding):
104+
"""Test handling of different source file encodings."""
105+
# write file as bytes with `actual_encoding`
106+
filepath = tmp_path / "ignore_comments.py"
107+
file_contents = f"class {classname}:\n pass"
108+
with open(filepath, "wb") as file:
109+
file.write(file_contents.encode(actual_encoding))
110+
# this should fail on the ÿ in MÿClass. It represents the (presumed rare) case where
111+
# a user's editor saved the source file in cp1252 (or anything other than utf-8).
112+
if actual_encoding == "cp1252" and assumed_encoding == "utf-8":
113+
context = partial(
114+
pytest.raises,
115+
UnicodeDecodeError,
116+
match="can't decode byte 0xff in position 7: invalid start byte",
117+
)
118+
# this is the more likely case: file was utf-8 encoded, but Python on Windows uses
119+
# the system codepage to read the file. This case is fixed by numpy/numpydoc#510
120+
elif actual_encoding == "utf-8" and assumed_encoding == "cp1252":
121+
context = partial(
122+
pytest.raises,
123+
UnicodeDecodeError,
124+
match="can't decode byte 0x81 in position 9: character maps to <undefined>",
125+
)
126+
else:
127+
context = nullcontext
128+
with context():
129+
result = validate.extract_ignore_validation_comments(filepath, assumed_encoding)
130+
assert result == {}
131+
132+
88133
class GoodDocStrings:
89134
"""
90135
Collection of good doc strings.

numpydoc/validate.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
@functools.lru_cache(maxsize=2000)
121121
def extract_ignore_validation_comments(
122122
filepath: Optional[os.PathLike],
123+
encoding: str = "utf-8",
123124
) -> Dict[int, List[str]]:
124125
"""
125126
Extract inline comments indicating certain validation checks should be ignored.
@@ -136,7 +137,7 @@ def extract_ignore_validation_comments(
136137
"""
137138
numpydoc_ignore_comments = {}
138139
try:
139-
file = open(filepath)
140+
file = open(filepath, encoding=encoding)
140141
except (OSError, TypeError): # can be None, nonexistent, or unreadable
141142
return numpydoc_ignore_comments
142143
with file:

0 commit comments

Comments
 (0)