Commit a4ac87b

Merge branch 'source-encodings'

Closes #522.

2 parents de732b9 + 39a2944

7 files changed: +136 −73 lines changed


mypy/build.py

Lines changed: 6 additions & 7 deletions
@@ -83,7 +83,7 @@ def __init__(self, files: Dict[str, MypyFile],
 def build(program_path: str,
           target: int,
           module: str = None,
-          program_text: str = None,
+          program_text: Union[str, bytes] = None,
           alt_lib_path: str = None,
           bin_dir: str = None,
           output_dir: str = None,
@@ -236,11 +236,10 @@ def lookup_program(module: str, lib_path: List[str]) -> str:
             "mypy: can't find module '{}'".format(module)])


-def read_program(path: str) -> str:
+def read_program(path: str) -> bytes:
     try:
-        f = open(path)
-        text = f.read()
-        f.close()
+        with open(path, 'rb') as file:
+            text = file.read()
     except IOError as ioerr:
         raise CompileError([
             "mypy: can't read file '{}': {}".format(path, ioerr.strerror)])
@@ -642,7 +641,7 @@ def fail(self, path: str, line: int, msg: str, blocker: bool = True) -> None:


 class UnprocessedFile(State):
-    def __init__(self, info: StateInfo, program_text: str) -> None:
+    def __init__(self, info: StateInfo, program_text: Union[str, bytes]) -> None:
         super().__init__(info)
         self.program_text = program_text
         trace('waiting {}'.format(info.path))
@@ -728,7 +727,7 @@ def import_module(self, id: str) -> bool:
         else:
             return False

-    def parse(self, source_text: str, fnam: str) -> MypyFile:
+    def parse(self, source_text: Union[str, bytes], fnam: str) -> MypyFile:
        """Parse the source of a file with the given name.

        Raise CompileError if there is a parse error.
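
Editorial note, with a minimal runnable sketch (the temp file and names below are illustrative, not part of the commit): reading the source in binary mode defers decoding to the lexer, which can then honor the encoding declared inside the file (PEP 263) instead of the platform default.

    # Illustrative sketch only, not part of the diff.
    # Text mode would decode with the platform default encoding, which can
    # disagree with the '# coding:' declaration inside the file.
    import os
    import tempfile

    source = '# coding: latin1\nname = "\xbb"\n'.encode('latin1')
    with tempfile.NamedTemporaryFile(suffix='.py', delete=False) as f:
        f.write(source)
        path = f.name

    with open(path, 'rb') as f:    # binary mode, as in the new read_program()
        raw = f.read()
    assert isinstance(raw, bytes)  # decoding now happens in mypy's lexer
    os.unlink(path)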

mypy/lex.py

Lines changed: 72 additions & 48 deletions
@@ -9,7 +9,7 @@
 import re

 from mypy.util import short_type
-from typing import List, Undefined, Callable, Dict, Any, Match, Pattern, Set
+from typing import List, Undefined, Callable, Dict, Any, Match, Pattern, Set, Union, Tuple


 class Token:
@@ -132,31 +132,32 @@ class Bom(Token):
 class LexError(Token):
     """Lexer error token"""

-    def __init__(self, string: str, type: int) -> None:
+    def __init__(self, string: str, type: int, message: str = None) -> None:
         """Initialize token.

         The type argument is one of the error types below.
         """
         super().__init__(string)
         self.type = type
+        self.message = message
+
+    def __str__(self):
+        if self.message:
+            return 'LexError(%s)' % self.message
+        else:
+            return super().__str__()


 # Lexer error types
 NUMERIC_LITERAL_ERROR = 0
 UNTERMINATED_STRING_LITERAL = 1
 INVALID_CHARACTER = 2
-NON_ASCII_CHARACTER_IN_COMMENT = 3
-NON_ASCII_CHARACTER_IN_STRING = 4
-INVALID_UTF8_SEQUENCE = 5
-INVALID_BACKSLASH = 6
-INVALID_DEDENT = 7
-
-# Encoding contexts
-STR_CONTEXT = 1
-COMMENT_CONTEXT = 2
+DECODE_ERROR = 3
+INVALID_BACKSLASH = 4
+INVALID_DEDENT = 5


-def lex(string: str, first_line: int = 1, pyversion: int = 3) -> List[Token]:
+def lex(string: Union[str, bytes], first_line: int = 1, pyversion: int = 3) -> List[Token]:
     """Analyze string and return an array of token objects.

     The last token is always Eof.
@@ -198,13 +199,6 @@ def lex(string: str, first_line: int = 1, pyversion: int = 3) -> List[Token]:
              re.compile('([-+*/%&|^]|\\*\\*|//|<<|>>)=')]


-# Source file encodings
-DEFAULT_ENCODING = 0
-ASCII_ENCODING = 1
-LATIN1_ENCODING = 2
-UTF8_ENCODING = 3
-
-
 # Map single-character string escape sequences to corresponding characters.
 escape_map = {'a': '\x07',
               'b': '\x08',
@@ -279,7 +273,7 @@ class Lexer:
     s = ''               # The string being analyzed
     line = 0             # Current line number
     pre_whitespace = ''  # Whitespace and comments before the next token
-    enc = DEFAULT_ENCODING  # Encoding TODO implement properly
+    enc = ''             # Encoding

     # Generated tokens
     tok = Undefined(List[Token])
@@ -326,14 +320,30 @@ def __init__(self, pyversion: int = 3) -> None:
         if pyversion == 3:
             self.keywords = keywords_common | keywords3

-    def lex(self, s: str, first_line: int) -> None:
+    def lex(self, text: Union[str, bytes], first_line: int) -> None:
         """Lexically analyze a string, storing the tokens at the tok list."""
-        self.s = s
         self.i = 0
         self.line = first_line

-        if s.startswith('\xef\xbb\xbf'):
-            self.add_token(Bom(s[0:3]))
+        if isinstance(text, bytes):
+            if text.startswith(b'\xef\xbb\xbf'):
+                self.enc = 'utf8'
+                bom = True
+            else:
+                self.enc, enc_line = self.find_encoding(text)
+                bom = False
+            try:
+                decoded_text = text.decode(self.enc)
+            except UnicodeDecodeError as err:
+                self.report_unicode_decode_error(err, text)
+                return
+            except LookupError:
+                self.report_unknown_encoding(enc_line)
+                return
+            text = decoded_text
+            if bom:
+                self.add_token(Bom(text[0]))
+        self.s = text

         # Parse initial indent; otherwise first-line indent would not generate
         # an error.
@@ -343,9 +353,9 @@ def lex(self, s: str, first_line: int) -> None:
         map = self.map

         # Lex the file. Repeatedly call the lexer method for the current char.
-        while self.i < len(s):
+        while self.i < len(text):
             # Get the character code of the next character to lex.
-            c = ord(s[self.i])
+            c = ord(text[self.i])
             # Dispatch to the relevant lexer method. This will consume some
             # characters in the text, add a token to self.tok and increment
             # self.i.
@@ -367,6 +377,41 @@ def lex(self, s: str, first_line: int) -> None:

         self.add_token(Eof(''))

+    def find_encoding(self, text: bytes) -> Tuple[str, int]:
+        result = re.match(br'(\s*#.*(\r\n?|\n))?\s*#.*coding[:=]\s*([-\w.]+)', text)
+        if result:
+            line = 2 if result.group(1) else 1
+            return result.group(3).decode('ascii'), line
+        else:
+            default_encoding = 'utf8' if self.pyversion >= 3 else 'ascii'
+            return default_encoding, -1
+
+    def report_unicode_decode_error(self, exc: UnicodeDecodeError, text: bytes) -> None:
+        lines = text.splitlines()
+        for line in lines:
+            try:
+                line.decode(self.enc)
+            except UnicodeDecodeError as new_exc:
+                exc = new_exc
+                break
+            self.line += 1
+        else:
+            self.line = 1
+        self.add_token(
+            LexError('', DECODE_ERROR,
+                     "%r codec can't decode byte %d in column %d" % (
+                         self.enc, line[exc.start], exc.start + 1)))
+        self.add_token(Break(''))
+        self.add_token(Eof(''))
+
+    def report_unknown_encoding(self, encoding_line: int) -> None:
+        self.line = encoding_line
+        self.add_token(
+            LexError('', DECODE_ERROR,
+                     "Unknown encoding %r" % self.enc))
+        self.add_token(Break(''))
+        self.add_token(Eof(''))
+
     def lex_number_or_dot(self) -> None:
         """Analyse a token starting with a dot.

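
Editorial note: the regex added in find_encoding() above implements the PEP 263 rule that a coding declaration is only honored on line 1 or 2. A standalone rerun of that exact pattern (the sniff_encoding wrapper name is hypothetical):

    # Standalone sketch of find_encoding(): declaration on line 1 or 2 only;
    # otherwise the Python-version default applies.
    import re

    def sniff_encoding(text: bytes, pyversion: int = 3):
        m = re.match(br'(\s*#.*(\r\n?|\n))?\s*#.*coding[:=]\s*([-\w.]+)', text)
        if m:
            return m.group(3).decode('ascii'), 2 if m.group(1) else 1
        return ('utf8' if pyversion >= 3 else 'ascii'), -1

    print(sniff_encoding(b'#!/usr/bin/env python\n# coding: latin1\nx = 1\n'))
    # ('latin1', 2)
    print(sniff_encoding(b'x = 1\n'))  # ('utf8', -1): no declaration, Py3 default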
@@ -404,7 +449,7 @@ def is_at_ellipsis(self) -> bool:
         r'([0-9]*\.[0-9]*([eE][-+]?[0-9]+)?|[0-9]+([eE][-+]?[0-9]+)?)[jJ]')
     # These characters must not appear after a number literal.
     name_char_exp = re.compile('[a-zA-Z0-9_]')
-    octal_int = re.compile('0[0-9]')
+    octal_int = re.compile('0+[1-9]')

     def lex_number(self) -> None:
         """Analyse an int or float literal.
@@ -541,7 +586,6 @@ def lex_str(self, regex: Pattern[str], re2: Pattern[str],
         if s.endswith('\n') or s.endswith('\r'):
             self.lex_multiline_string_literal(re2, s)
         else:
-            self.verify_encoding(s, STR_CONTEXT)
             if 'b' in prefix:
                 self.add_token(BytesLit(s))
             elif 'u' in prefix:
@@ -605,7 +649,6 @@ def lex_multiline_string_literal(self, re_end: Pattern[str],
     def lex_comment(self) -> None:
         """Analyze a comment."""
         s = self.match(self.comment_exp)
-        self.verify_encoding(s, COMMENT_CONTEXT)
         self.add_pre_whitespace(s)

     backslash_exp = re.compile(r'\\(\n|\r\n?)')
@@ -808,25 +851,6 @@ def ignore_break(self) -> bool:
         t = self.tok[-1]
         return isinstance(t, Break) or isinstance(t, Dedent)

-    def verify_encoding(self, string: str, context: int) -> None:
-        """Verify that token is encoded correctly (using the file encoding)."""
-        codec = None  # type: str
-        if self.enc == ASCII_ENCODING:
-            codec = 'ascii'
-        elif self.enc in [UTF8_ENCODING, DEFAULT_ENCODING]:
-            codec = 'utf8'
-        if codec is not None:
-            try:
-                pass # FIX string.decode(codec)
-            except UnicodeDecodeError:
-                type = INVALID_UTF8_SEQUENCE
-                if self.enc == ASCII_ENCODING:
-                    if context == STR_CONTEXT:
-                        type = NON_ASCII_CHARACTER_IN_STRING
-                    else:
-                        type = NON_ASCII_CHARACTER_IN_COMMENT
-                self.add_token(LexError('', type))
-

 if __name__ == '__main__':
     # Lexically analyze a file and dump the tokens to stdout.
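
Editorial note: report_unicode_decode_error() rescans line by line to pin down the failing line, then reports a 1-based column derived from exc.start. A small standalone illustration, assuming UTF-8 and mirroring the testlex.py expectations further below:

    # How UnicodeDecodeError attributes become the "can't decode byte ...
    # in column ..." message. exc.start is the 0-based offset of the first
    # bad byte within the failing line.
    bad_line = b'"abcde\xbc"'   # second line of the input b'\n"abcde\xbc"'
    try:
        bad_line.decode('utf8')
    except UnicodeDecodeError as exc:
        print("%r codec can't decode byte %d in column %d"
              % ('utf8', bad_line[exc.start], exc.start + 1))
    # 'utf8' codec can't decode byte 188 in column 7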

mypy/parse.py

Lines changed: 7 additions & 11 deletions
@@ -66,7 +66,7 @@
 none = Token('')  # Empty token


-def parse(s: str, fnam: str = None, errors: Errors = None,
+def parse(source: Union[str, bytes], fnam: str = None, errors: Errors = None,
           pyversion: int = 3, custom_typing_module: str = None) -> MypyFile:
     """Parse a source file, without doing any semantic analysis.

@@ -77,7 +77,7 @@ def parse(s: str, fnam: str = None, errors: Errors = None,
       3 for 3.x).
     """
     parser = Parser(fnam, errors, pyversion, custom_typing_module)
-    tree = parser.parse(s)
+    tree = parser.parse(source)
     tree.path = fnam
     return tree

@@ -109,7 +109,7 @@ def __init__(self, fnam: str, errors: Errors, pyversion: int,
         else:
             self.errors.set_file('<input>')

-    def parse(self, s: str) -> MypyFile:
+    def parse(self, s: Union[str, bytes]) -> MypyFile:
         self.tok = lex.lex(s, pyversion=self.pyversion)
         self.ind = 0
         self.imports = []
@@ -1275,7 +1275,7 @@ def parse_name_expr(self) -> NameExpr:
         node.set_line(tok)
         return node

-    octal_int = re.compile('0[0-9]')
+    octal_int = re.compile('0+[1-9]')

     def parse_int_expr(self) -> IntExpr:
         tok = self.expect_type(IntLit)
@@ -1687,14 +1687,10 @@ def token_repr(tok: Token) -> str:
         if ord(tok.string) in range(33, 127):
             msg += ' ' + tok.string
         return msg
-    elif t == lex.INVALID_UTF8_SEQUENCE:
-        return 'invalid UTF-8 sequence'
-    elif t == lex.NON_ASCII_CHARACTER_IN_COMMENT:
-        return 'non-ASCII character in comment'
-    elif t == lex.NON_ASCII_CHARACTER_IN_STRING:
-        return 'non-ASCII character in string'
     elif t == lex.INVALID_DEDENT:
         return 'inconsistent indentation'
+    elif t == lex.DECODE_ERROR:
+        return tok.message
     raise ValueError('Unknown token {}'.format(repr(tok)))


@@ -1705,7 +1701,7 @@ def token_repr(tok: Token) -> str:
         print('Usage: parse.py FILE')
         sys.exit(2)
     fnam = sys.argv[1]
-    s = open(fnam).read()
+    s = open(fnam, 'rb').read()
     errors = Errors()
     try:
         tree = parse(s, fnam)
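
Editorial note on the tightened octal check: the old pattern '0[0-9]' also flagged the legal literal '00', while '0+[1-9]' only matches leading zeros followed by a nonzero digit, i.e. a pre-Python-3 octal literal such as 0377. A quick standalone check of the regex from the diff:

    import re

    octal_int = re.compile('0+[1-9]')
    for lit in ['0', '00', '0377', '10']:
        print(lit, bool(octal_int.match(lit)))
    # 0 False / 00 False / 0377 True / 10 False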

mypy/test/data/parse-errors.test

Lines changed: 17 additions & 0 deletions
@@ -427,3 +427,20 @@ file, line 1: Parse error before end of line
 0377
 [out]
 file, line 1: Invalid numeric literal
+
+[case testInvalidEncoding]
+# foo
+# coding: uft-8
+[out]
+file, line 2: Unknown encoding 'uft-8'
+
+[case testInvalidEncoding2]
+# coding=Uft.8
+[out]
+file, line 1: Unknown encoding 'Uft.8'
+
+[case testInvalidEncoding2]
+#!/usr/bin python
+# vim: set fileencoding=uft8 :
+[out]
+file, line 2: Unknown encoding 'uft8'
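
Editorial note: these cases exercise the LookupError branch in Lexer.lex(); a misspelled codec name fails at codec lookup, not during decoding. A one-liner shows the underlying Python behavior:

    # bytes.decode raises LookupError (not UnicodeDecodeError) for a codec
    # name Python doesn't know, which the lexer reports as "Unknown encoding".
    try:
        b'x = 1\n'.decode('uft-8')   # typo for 'utf-8', as in the test above
    except LookupError as err:
        print(err)   # unknown encoding: uft-8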

mypy/test/testlex.py

Lines changed: 23 additions & 4 deletions
@@ -33,8 +33,8 @@ def test_identifiers(self):

     def test_int_literals(self):
         self.assert_lex(
-            '0 1 0987654321 10002000300040005000600070008000',
-            'IntLit(0) IntLit( 1) LexError( 0987654321) '
+            '0 00 1 0987654321 10002000300040005000600070008000',
+            'IntLit(0) IntLit( 00) IntLit( 1) LexError( 0987654321) '
             'IntLit( 10002000300040005000600070008000) Break() Eof()')

     def test_hex_int_literals(self):
@@ -396,12 +396,31 @@ def test_invalid_hex_int_literals(self):
         self.assert_lex('0x', 'LexError( ) ...')
         self.assert_lex('0xax', 'LexError( ) ...')

+    def test_latin1_encoding(self):
+        self.assert_lex(b'# coding: latin1\n"\xbb"',
+                        'StrLit(# coding: latin1\\n"\xbb") Break() Eof()')
+
+    def test_utf8_encoding(self):
+        self.assert_lex('"\xbb"'.encode('utf8'),
+                        'StrLit("\xbb") Break() Eof()')
+        self.assert_lex(b'"\xbb"',
+                        "LexError('utf8' codec can't decode byte 187 in column 2) "
+                        'Break() Eof()')
+        self.assert_lex(b'\n"abcde\xbc"',
+                        "LexError('utf8' codec can't decode byte 188 in column 7) "
+                        'Break() Eof()')
+
+    def test_byte_order_mark(self):
+        self.assert_lex('\ufeff"\xbb"'.encode('utf8'),
+                        'Bom(\ufeff) StrLit("\xbb") Break() Eof()')
+
     # TODO
     #   invalid escape sequences in string literals etc.

     def assert_lex(self, src, lexed):
-        src = src.replace('\\n', '\n')
-        src = src.replace('\\r', '\r')
+        if isinstance(src, str):
+            src = src.replace('\\n', '\n')
+            src = src.replace('\\r', '\r')

         if lexed.endswith(' ...'):
             lexed = lexed[:-3] + 'Break() Eof()'
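
Editorial note: test_byte_order_mark relies on the fact that the 3-byte UTF-8 BOM decodes to the single character U+FEFF, which is why Lexer.lex() can emit Bom(text[0]) after decoding. A standalone check:

    # The UTF-8 BOM is 3 bytes on disk but one character once decoded.
    raw = '\ufeff"\xbb"'.encode('utf8')
    print(raw[:3])                   # b'\xef\xbb\xbf'
    decoded = raw.decode('utf8')
    print(decoded[0] == '\ufeff')    # True: Bom(text[0]) covers the whole BOM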
