Commit a4ac87b

Merge branch 'source-encodings'

Closes #522.

2 parents de732b9 + 39a2944

7 files changed: +136 −73 lines changed


mypy/build.py

Lines changed: 6 additions & 7 deletions
@@ -83,7 +83,7 @@ def __init__(self, files: Dict[str, MypyFile],
 def build(program_path: str,
           target: int,
           module: str = None,
-          program_text: str = None,
+          program_text: Union[str, bytes] = None,
           alt_lib_path: str = None,
           bin_dir: str = None,
           output_dir: str = None,
@@ -236,11 +236,10 @@ def lookup_program(module: str, lib_path: List[str]) -> str:
             "mypy: can't find module '{}'".format(module)])


-def read_program(path: str) -> str:
+def read_program(path: str) -> bytes:
     try:
-        f = open(path)
-        text = f.read()
-        f.close()
+        with open(path, 'rb') as file:
+            text = file.read()
     except IOError as ioerr:
         raise CompileError([
             "mypy: can't read file '{}': {}".format(path, ioerr.strerror)])
@@ -642,7 +641,7 @@ def fail(self, path: str, line: int, msg: str, blocker: bool = True) -> None:


 class UnprocessedFile(State):
-    def __init__(self, info: StateInfo, program_text: str) -> None:
+    def __init__(self, info: StateInfo, program_text: Union[str, bytes]) -> None:
         super().__init__(info)
         self.program_text = program_text
         trace('waiting {}'.format(info.path))
@@ -728,7 +727,7 @@ def import_module(self, id: str) -> bool:
         else:
             return False

-    def parse(self, source_text: str, fnam: str) -> MypyFile:
+    def parse(self, source_text: Union[str, bytes], fnam: str) -> MypyFile:
        """Parse the source of a file with the given name.

        Raise CompileError if there is a parse error.
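
Editorial note, with a minimal runnable sketch (the temp file and names below are illustrative, not part of the commit): reading the source in binary mode defers decoding to the lexer, which can then honor the encoding declared inside the file (PEP 263) instead of the platform default.

    # Illustrative sketch only, not part of the diff.
    # Text mode would decode with the platform default encoding, which can
    # disagree with the '# coding:' declaration inside the file.
    import os
    import tempfile

    source = '# coding: latin1\nname = "\xbb"\n'.encode('latin1')
    with tempfile.NamedTemporaryFile(suffix='.py', delete=False) as f:
        f.write(source)
        path = f.name

    with open(path, 'rb') as f:    # binary mode, as in the new read_program()
        raw = f.read()
    assert isinstance(raw, bytes)  # decoding now happens in mypy's lexer
    os.unlink(path)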

mypy/lex.py

Lines changed: 72 additions & 48 deletions
@@ -9,7 +9,7 @@
 import re

 from mypy.util import short_type
-from typing import List, Undefined, Callable, Dict, Any, Match, Pattern, Set
+from typing import List, Undefined, Callable, Dict, Any, Match, Pattern, Set, Union, Tuple


 class Token:
@@ -132,31 +132,32 @@ class Bom(Token):
 class LexError(Token):
     """Lexer error token"""

-    def __init__(self, string: str, type: int) -> None:
+    def __init__(self, string: str, type: int, message: str = None) -> None:
         """Initialize token.

         The type argument is one of the error types below.
         """
         super().__init__(string)
         self.type = type
+        self.message = message
+
+    def __str__(self):
+        if self.message:
+            return 'LexError(%s)' % self.message
+        else:
+            return super().__str__()


 # Lexer error types
 NUMERIC_LITERAL_ERROR = 0
 UNTERMINATED_STRING_LITERAL = 1
 INVALID_CHARACTER = 2
-NON_ASCII_CHARACTER_IN_COMMENT = 3
-NON_ASCII_CHARACTER_IN_STRING = 4
-INVALID_UTF8_SEQUENCE = 5
-INVALID_BACKSLASH = 6
-INVALID_DEDENT = 7
-
-# Encoding contexts
-STR_CONTEXT = 1
-COMMENT_CONTEXT = 2
+DECODE_ERROR = 3
+INVALID_BACKSLASH = 4
+INVALID_DEDENT = 5


-def lex(string: str, first_line: int = 1, pyversion: int = 3) -> List[Token]:
+def lex(string: Union[str, bytes], first_line: int = 1, pyversion: int = 3) -> List[Token]:
     """Analyze string and return an array of token objects.

     The last token is always Eof.
@@ -198,13 +199,6 @@ def lex(string: str, first_line: int = 1, pyversion: int = 3) -> List[Token]:
              re.compile('([-+*/%&|^]|\\*\\*|//|<<|>>)=')]


-# Source file encodings
-DEFAULT_ENCODING = 0
-ASCII_ENCODING = 1
-LATIN1_ENCODING = 2
-UTF8_ENCODING = 3
-
-
 # Map single-character string escape sequences to corresponding characters.
 escape_map = {'a': '\x07',
               'b': '\x08',
@@ -279,7 +273,7 @@ class Lexer:
     s = ''               # The string being analyzed
     line = 0             # Current line number
     pre_whitespace = ''  # Whitespace and comments before the next token
-    enc = DEFAULT_ENCODING  # Encoding TODO implement properly
+    enc = ''             # Encoding

     # Generated tokens
     tok = Undefined(List[Token])
@@ -326,14 +320,30 @@ def __init__(self, pyversion: int = 3) -> None:
         if pyversion == 3:
             self.keywords = keywords_common | keywords3

-    def lex(self, s: str, first_line: int) -> None:
+    def lex(self, text: Union[str, bytes], first_line: int) -> None:
         """Lexically analyze a string, storing the tokens at the tok list."""
-        self.s = s
         self.i = 0
         self.line = first_line

-        if s.startswith('\xef\xbb\xbf'):
-            self.add_token(Bom(s[0:3]))
+        if isinstance(text, bytes):
+            if text.startswith(b'\xef\xbb\xbf'):
+                self.enc = 'utf8'
+                bom = True
+            else:
+                self.enc, enc_line = self.find_encoding(text)
+                bom = False
+            try:
+                decoded_text = text.decode(self.enc)
+            except UnicodeDecodeError as err:
+                self.report_unicode_decode_error(err, text)
+                return
+            except LookupError:
+                self.report_unknown_encoding(enc_line)
+                return
+            text = decoded_text
+            if bom:
+                self.add_token(Bom(text[0]))
+        self.s = text

         # Parse initial indent; otherwise first-line indent would not generate
         # an error.
@@ -343,9 +353,9 @@ def lex(self, s: str, first_line: int) -> None:
         map = self.map

         # Lex the file. Repeatedly call the lexer method for the current char.
-        while self.i < len(s):
+        while self.i < len(text):
             # Get the character code of the next character to lex.
-            c = ord(s[self.i])
+            c = ord(text[self.i])
             # Dispatch to the relevant lexer method. This will consume some
             # characters in the text, add a token to self.tok and increment
             # self.i.
@@ -367,6 +377,41 @@ def lex(self, s: str, first_line: int) -> None:

         self.add_token(Eof(''))

+    def find_encoding(self, text: bytes) -> Tuple[str, int]:
+        result = re.match(br'(\s*#.*(\r\n?|\n))?\s*#.*coding[:=]\s*([-\w.]+)', text)
+        if result:
+            line = 2 if result.group(1) else 1
+            return result.group(3).decode('ascii'), line
+        else:
+            default_encoding = 'utf8' if self.pyversion >= 3 else 'ascii'
+            return default_encoding, -1
+
+    def report_unicode_decode_error(self, exc: UnicodeDecodeError, text: bytes) -> None:
+        lines = text.splitlines()
+        for line in lines:
+            try:
+                line.decode(self.enc)
+            except UnicodeDecodeError as new_exc:
+                exc = new_exc
+                break
+            self.line += 1
+        else:
+            self.line = 1
+        self.add_token(
+            LexError('', DECODE_ERROR,
+                     "%r codec can't decode byte %d in column %d" % (
+                         self.enc, line[exc.start], exc.start + 1)))
+        self.add_token(Break(''))
+        self.add_token(Eof(''))
+
+    def report_unknown_encoding(self, encoding_line: int) -> None:
+        self.line = encoding_line
+        self.add_token(
+            LexError('', DECODE_ERROR,
+                     "Unknown encoding %r" % self.enc))
+        self.add_token(Break(''))
+        self.add_token(Eof(''))
+
     def lex_number_or_dot(self) -> None:
         """Analyse a token starting with a dot.

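
Editorial note: the regex added in find_encoding() above implements the PEP 263 rule that a coding declaration is only honored on line 1 or 2. A standalone rerun of that exact pattern (the sniff_encoding wrapper name is hypothetical):

    # Standalone sketch of find_encoding(): declaration on line 1 or 2 only;
    # otherwise the Python-version default applies.
    import re

    def sniff_encoding(text: bytes, pyversion: int = 3):
        m = re.match(br'(\s*#.*(\r\n?|\n))?\s*#.*coding[:=]\s*([-\w.]+)', text)
        if m:
            return m.group(3).decode('ascii'), 2 if m.group(1) else 1
        return ('utf8' if pyversion >= 3 else 'ascii'), -1

    print(sniff_encoding(b'#!/usr/bin/env python\n# coding: latin1\nx = 1\n'))
    # ('latin1', 2)
    print(sniff_encoding(b'x = 1\n'))  # ('utf8', -1): no declaration, Py3 default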
@@ -404,7 +449,7 @@ def is_at_ellipsis(self) -> bool:
         r'([0-9]*\.[0-9]*([eE][-+]?[0-9]+)?|[0-9]+([eE][-+]?[0-9]+)?)[jJ]')
     # These characters must not appear after a number literal.
     name_char_exp = re.compile('[a-zA-Z0-9_]')
-    octal_int = re.compile('0[0-9]')
+    octal_int = re.compile('0+[1-9]')

     def lex_number(self) -> None:
         """Analyse an int or float literal.
@@ -541,7 +586,6 @@ def lex_str(self, regex: Pattern[str], re2: Pattern[str],
         if s.endswith('\n') or s.endswith('\r'):
             self.lex_multiline_string_literal(re2, s)
         else:
-            self.verify_encoding(s, STR_CONTEXT)
             if 'b' in prefix:
                 self.add_token(BytesLit(s))
             elif 'u' in prefix:
@@ -605,7 +649,6 @@ def lex_multiline_string_literal(self, re_end: Pattern[str],
     def lex_comment(self) -> None:
         """Analyze a comment."""
         s = self.match(self.comment_exp)
-        self.verify_encoding(s, COMMENT_CONTEXT)
         self.add_pre_whitespace(s)

     backslash_exp = re.compile(r'\\(\n|\r\n?)')
@@ -808,25 +851,6 @@ def ignore_break(self) -> bool:
         t = self.tok[-1]
         return isinstance(t, Break) or isinstance(t, Dedent)

-    def verify_encoding(self, string: str, context: int) -> None:
-        """Verify that token is encoded correctly (using the file encoding)."""
-        codec = None  # type: str
-        if self.enc == ASCII_ENCODING:
-            codec = 'ascii'
-        elif self.enc in [UTF8_ENCODING, DEFAULT_ENCODING]:
-            codec = 'utf8'
-        if codec is not None:
-            try:
-                pass # FIX string.decode(codec)
-            except UnicodeDecodeError:
-                type = INVALID_UTF8_SEQUENCE
-                if self.enc == ASCII_ENCODING:
-                    if context == STR_CONTEXT:
-                        type = NON_ASCII_CHARACTER_IN_STRING
-                    else:
-                        type = NON_ASCII_CHARACTER_IN_COMMENT
-                self.add_token(LexError('', type))
-

 if __name__ == '__main__':
     # Lexically analyze a file and dump the tokens to stdout.
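
Editorial note: report_unicode_decode_error() rescans line by line to pin down the failing line, then reports a 1-based column derived from exc.start. A small standalone illustration, assuming UTF-8 and mirroring the testlex.py expectations further below:

    # How UnicodeDecodeError attributes become the "can't decode byte ...
    # in column ..." message. exc.start is the 0-based offset of the first
    # bad byte within the failing line.
    bad_line = b'"abcde\xbc"'   # second line of the input b'\n"abcde\xbc"'
    try:
        bad_line.decode('utf8')
    except UnicodeDecodeError as exc:
        print("%r codec can't decode byte %d in column %d"
              % ('utf8', bad_line[exc.start], exc.start + 1))
    # 'utf8' codec can't decode byte 188 in column 7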

mypy/parse.py

Lines changed: 7 additions & 11 deletions
@@ -66,7 +66,7 @@
 none = Token('')  # Empty token


-def parse(s: str, fnam: str = None, errors: Errors = None,
+def parse(source: Union[str, bytes], fnam: str = None, errors: Errors = None,
           pyversion: int = 3, custom_typing_module: str = None) -> MypyFile:
     """Parse a source file, without doing any semantic analysis.

@@ -77,7 +77,7 @@ def parse(s: str, fnam: str = None, errors: Errors = None,
       3 for 3.x).
     """
     parser = Parser(fnam, errors, pyversion, custom_typing_module)
-    tree = parser.parse(s)
+    tree = parser.parse(source)
     tree.path = fnam
     return tree

@@ -109,7 +109,7 @@ def __init__(self, fnam: str, errors: Errors, pyversion: int,
         else:
             self.errors.set_file('<input>')

-    def parse(self, s: str) -> MypyFile:
+    def parse(self, s: Union[str, bytes]) -> MypyFile:
         self.tok = lex.lex(s, pyversion=self.pyversion)
         self.ind = 0
         self.imports = []
@@ -1275,7 +1275,7 @@ def parse_name_expr(self) -> NameExpr:
         node.set_line(tok)
         return node

-    octal_int = re.compile('0[0-9]')
+    octal_int = re.compile('0+[1-9]')

     def parse_int_expr(self) -> IntExpr:
         tok = self.expect_type(IntLit)
@@ -1687,14 +1687,10 @@ def token_repr(tok: Token) -> str:
         if ord(tok.string) in range(33, 127):
             msg += ' ' + tok.string
         return msg
-    elif t == lex.INVALID_UTF8_SEQUENCE:
-        return 'invalid UTF-8 sequence'
-    elif t == lex.NON_ASCII_CHARACTER_IN_COMMENT:
-        return 'non-ASCII character in comment'
-    elif t == lex.NON_ASCII_CHARACTER_IN_STRING:
-        return 'non-ASCII character in string'
     elif t == lex.INVALID_DEDENT:
         return 'inconsistent indentation'
+    elif t == lex.DECODE_ERROR:
+        return tok.message
     raise ValueError('Unknown token {}'.format(repr(tok)))


@@ -1705,7 +1701,7 @@ def token_repr(tok: Token) -> str:
         print('Usage: parse.py FILE')
         sys.exit(2)
     fnam = sys.argv[1]
-    s = open(fnam).read()
+    s = open(fnam, 'rb').read()
     errors = Errors()
     try:
         tree = parse(s, fnam)
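
Editorial note on the tightened octal check: the old pattern '0[0-9]' also flagged the legal literal '00', while '0+[1-9]' only matches leading zeros followed by a nonzero digit, i.e. a pre-Python-3 octal literal such as 0377. A quick standalone check of the regex from the diff:

    import re

    octal_int = re.compile('0+[1-9]')
    for lit in ['0', '00', '0377', '10']:
        print(lit, bool(octal_int.match(lit)))
    # 0 False / 00 False / 0377 True / 10 False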

mypy/test/data/parse-errors.test

Lines changed: 17 additions & 0 deletions
@@ -427,3 +427,20 @@ file, line 1: Parse error before end of line
 0377
 [out]
 file, line 1: Invalid numeric literal
+
+[case testInvalidEncoding]
+# foo
+# coding: uft-8
+[out]
+file, line 2: Unknown encoding 'uft-8'
+
+[case testInvalidEncoding2]
+# coding=Uft.8
+[out]
+file, line 1: Unknown encoding 'Uft.8'
+
+[case testInvalidEncoding2]
+#!/usr/bin python
+# vim: set fileencoding=uft8 :
+[out]
+file, line 2: Unknown encoding 'uft8'
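
Editorial note: these cases exercise the LookupError branch in Lexer.lex(); a misspelled codec name fails at codec lookup, not during decoding. A one-liner shows the underlying Python behavior:

    # bytes.decode raises LookupError (not UnicodeDecodeError) for a codec
    # name Python doesn't know, which the lexer reports as "Unknown encoding".
    try:
        b'x = 1\n'.decode('uft-8')   # typo for 'utf-8', as in the test above
    except LookupError as err:
        print(err)   # unknown encoding: uft-8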

mypy/test/testlex.py

Lines changed: 23 additions & 4 deletions
@@ -33,8 +33,8 @@ def test_identifiers(self):

     def test_int_literals(self):
         self.assert_lex(
-            '0 1 0987654321 10002000300040005000600070008000',
-            'IntLit(0) IntLit( 1) LexError( 0987654321) '
+            '0 00 1 0987654321 10002000300040005000600070008000',
+            'IntLit(0) IntLit( 00) IntLit( 1) LexError( 0987654321) '
             'IntLit( 10002000300040005000600070008000) Break() Eof()')

     def test_hex_int_literals(self):
@@ -396,12 +396,31 @@ def test_invalid_hex_int_literals(self):
         self.assert_lex('0x', 'LexError( ) ...')
         self.assert_lex('0xax', 'LexError( ) ...')

+    def test_latin1_encoding(self):
+        self.assert_lex(b'# coding: latin1\n"\xbb"',
+                        'StrLit(# coding: latin1\\n"\xbb") Break() Eof()')
+
+    def test_utf8_encoding(self):
+        self.assert_lex('"\xbb"'.encode('utf8'),
+                        'StrLit("\xbb") Break() Eof()')
+        self.assert_lex(b'"\xbb"',
+                        "LexError('utf8' codec can't decode byte 187 in column 2) "
+                        'Break() Eof()')
+        self.assert_lex(b'\n"abcde\xbc"',
+                        "LexError('utf8' codec can't decode byte 188 in column 7) "
+                        'Break() Eof()')
+
+    def test_byte_order_mark(self):
+        self.assert_lex('\ufeff"\xbb"'.encode('utf8'),
+                        'Bom(\ufeff) StrLit("\xbb") Break() Eof()')
+
     # TODO
     #   invalid escape sequences in string literals etc.

     def assert_lex(self, src, lexed):
-        src = src.replace('\\n', '\n')
-        src = src.replace('\\r', '\r')
+        if isinstance(src, str):
+            src = src.replace('\\n', '\n')
+            src = src.replace('\\r', '\r')

         if lexed.endswith(' ...'):
             lexed = lexed[:-3] + 'Break() Eof()'
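
Editorial note: test_byte_order_mark relies on the fact that the 3-byte UTF-8 BOM decodes to the single character U+FEFF, which is why Lexer.lex() can emit Bom(text[0]) after decoding. A standalone check:

    # The UTF-8 BOM is 3 bytes on disk but one character once decoded.
    raw = '\ufeff"\xbb"'.encode('utf8')
    print(raw[:3])                   # b'\xef\xbb\xbf'
    decoded = raw.decode('utf8')
    print(decoded[0] == '\ufeff')    # True: Bom(text[0]) covers the whole BOM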
