@@ -9,7 +9,7 @@
 import re
 
 from mypy.util import short_type
-from typing import List, Undefined, Callable, Dict, Any, Match, Pattern, Set
+from typing import List, Undefined, Callable, Dict, Any, Match, Pattern, Set, Union, Tuple
 
 
 class Token:
@@ -132,31 +132,32 @@ class Bom(Token):
 class LexError(Token):
     """Lexer error token"""
 
-    def __init__(self, string: str, type: int) -> None:
+    def __init__(self, string: str, type: int, message: str = None) -> None:
         """Initialize token.
 
         The type argument is one of the error types below.
         """
         super().__init__(string)
         self.type = type
+        self.message = message
+
+    def __str__(self):
+        if self.message:
+            return 'LexError(%s)' % self.message
+        else:
+            return super().__str__()
 
 
 # Lexer error types
 NUMERIC_LITERAL_ERROR = 0
 UNTERMINATED_STRING_LITERAL = 1
 INVALID_CHARACTER = 2
-NON_ASCII_CHARACTER_IN_COMMENT = 3
-NON_ASCII_CHARACTER_IN_STRING = 4
-INVALID_UTF8_SEQUENCE = 5
-INVALID_BACKSLASH = 6
-INVALID_DEDENT = 7
-
-# Encoding contexts
-STR_CONTEXT = 1
-COMMENT_CONTEXT = 2
+DECODE_ERROR = 3
+INVALID_BACKSLASH = 4
+INVALID_DEDENT = 5
 
 
-def lex(string: str, first_line: int = 1, pyversion: int = 3) -> List[Token]:
+def lex(string: Union[str, bytes], first_line: int = 1, pyversion: int = 3) -> List[Token]:
     """Analyze string and return an array of token objects.
 
     The last token is always Eof.
@@ -198,13 +199,6 @@ def lex(string: str, first_line: int = 1, pyversion: int = 3) -> List[Token]:
     re.compile('([-+*/%&|^]|\\*\\*|//|<<|>>)=')]
 
 
-# Source file encodings
-DEFAULT_ENCODING = 0
-ASCII_ENCODING = 1
-LATIN1_ENCODING = 2
-UTF8_ENCODING = 3
-
-
 # Map single-character string escape sequences to corresponding characters.
 escape_map = {'a': '\x07',
               'b': '\x08',
@@ -279,7 +273,7 @@ class Lexer:
     s = ''  # The string being analyzed
     line = 0  # Current line number
     pre_whitespace = ''  # Whitespace and comments before the next token
-    enc = DEFAULT_ENCODING  # Encoding TODO implement properly
+    enc = ''  # Encoding
 
     # Generated tokens
     tok = Undefined(List[Token])
@@ -326,14 +320,30 @@ def __init__(self, pyversion: int = 3) -> None:
         if pyversion == 3:
             self.keywords = keywords_common | keywords3
 
-    def lex(self, s: str, first_line: int) -> None:
+    def lex(self, text: Union[str, bytes], first_line: int) -> None:
         """Lexically analyze a string, storing the tokens at the tok list."""
-        self.s = s
         self.i = 0
         self.line = first_line
 
-        if s.startswith('\xef\xbb\xbf'):
-            self.add_token(Bom(s[0:3]))
+        if isinstance(text, bytes):
+            if text.startswith(b'\xef\xbb\xbf'):
+                self.enc = 'utf8'
+                bom = True
+            else:
+                self.enc, enc_line = self.find_encoding(text)
+                bom = False
+            try:
+                decoded_text = text.decode(self.enc)
+            except UnicodeDecodeError as err:
+                self.report_unicode_decode_error(err, text)
+                return
+            except LookupError:
+                self.report_unknown_encoding(enc_line)
+                return
+            text = decoded_text
+            if bom:
+                self.add_token(Bom(text[0]))
+        self.s = text
 
         # Parse initial indent; otherwise first-line indent would not generate
         # an error.
@@ -343,9 +353,9 @@ def lex(self, s: str, first_line: int) -> None:
         map = self.map
 
         # Lex the file. Repeatedly call the lexer method for the current char.
-        while self.i < len(s):
+        while self.i < len(text):
             # Get the character code of the next character to lex.
-            c = ord(s[self.i])
+            c = ord(text[self.i])
             # Dispatch to the relevant lexer method. This will consume some
             # characters in the text, add a token to self.tok and increment
             # self.i.
@@ -367,6 +377,41 @@ def lex(self, s: str, first_line: int) -> None:
 
         self.add_token(Eof(''))
 
+    def find_encoding(self, text: bytes) -> Tuple[str, int]:
+        result = re.match(br'(\s*#.*(\r\n?|\n))?\s*#.*coding[:=]\s*([-\w.]+)', text)
+        if result:
+            line = 2 if result.group(1) else 1
+            return result.group(3).decode('ascii'), line
+        else:
+            default_encoding = 'utf8' if self.pyversion >= 3 else 'ascii'
+            return default_encoding, -1
+
+    def report_unicode_decode_error(self, exc: UnicodeDecodeError, text: bytes) -> None:
+        lines = text.splitlines()
+        for line in lines:
+            try:
+                line.decode(self.enc)
+            except UnicodeDecodeError as new_exc:
+                exc = new_exc
+                break
+            self.line += 1
+        else:
+            self.line = 1
+        self.add_token(
+            LexError('', DECODE_ERROR,
+                     "%r codec can't decode byte %d in column %d" % (
+                         self.enc, line[exc.start], exc.start + 1)))
+        self.add_token(Break(''))
+        self.add_token(Eof(''))
+
+    def report_unknown_encoding(self, encoding_line: int) -> None:
+        self.line = encoding_line
+        self.add_token(
+            LexError('', DECODE_ERROR,
+                     "Unknown encoding %r" % self.enc))
+        self.add_token(Break(''))
+        self.add_token(Eof(''))
+
     def lex_number_or_dot(self) -> None:
         """Analyse a token starting with a dot.
 
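find_encoding() implements a subset of PEP 263: the coding declaration must sit in a comment on the first or second line, otherwise the default (utf8 for Python 3, ascii for Python 2) applies. A standalone illustration of the regex's behaviour (the pattern is copied from the hunk above; the detect() helper and sample inputs are mine):

    import re

    coding_exp = re.compile(br'(\s*#.*(\r\n?|\n))?\s*#.*coding[:=]\s*([-\w.]+)')

    def detect(source: bytes) -> str:
        m = coding_exp.match(source)
        # Group 1 matches an optional leading comment line, so a truthy
        # group 1 means the declaration sits on line 2.
        return m.group(3).decode('ascii') if m else 'utf8'

    print(detect(b'# -*- coding: latin-1 -*-\n'))                # latin-1
    print(detect(b'#!/usr/bin/env python\n# coding: cp1252\n'))  # cp1252
    print(detect(b'x = 1\n# coding: latin-1\n'))                 # utf8: too late, ignored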
@@ -404,7 +449,7 @@ def is_at_ellipsis(self) -> bool:
         r'([0-9]*\.[0-9]*([eE][-+]?[0-9]+)?|[0-9]+([eE][-+]?[0-9]+)?)[jJ]')
     # These characters must not appear after a number literal.
     name_char_exp = re.compile('[a-zA-Z0-9_]')
-    octal_int = re.compile('0[0-9]')
+    octal_int = re.compile('0+[1-9]')
 
     def lex_number(self) -> None:
         """Analyse an int or float literal.
@@ -541,7 +586,6 @@ def lex_str(self, regex: Pattern[str], re2: Pattern[str],
         if s.endswith('\n') or s.endswith('\r'):
             self.lex_multiline_string_literal(re2, s)
         else:
-            self.verify_encoding(s, STR_CONTEXT)
             if 'b' in prefix:
                 self.add_token(BytesLit(s))
             elif 'u' in prefix:
@@ -605,7 +649,6 @@ def lex_multiline_string_literal(self, re_end: Pattern[str],
     def lex_comment(self) -> None:
         """Analyze a comment."""
         s = self.match(self.comment_exp)
-        self.verify_encoding(s, COMMENT_CONTEXT)
         self.add_pre_whitespace(s)
 
     backslash_exp = re.compile(r'\\(\n|\r\n?)')
@@ -808,25 +851,6 @@ def ignore_break(self) -> bool:
         t = self.tok[-1]
         return isinstance(t, Break) or isinstance(t, Dedent)
 
-    def verify_encoding(self, string: str, context: int) -> None:
-        """Verify that token is encoded correctly (using the file encoding)."""
-        codec = None  # type: str
-        if self.enc == ASCII_ENCODING:
-            codec = 'ascii'
-        elif self.enc in [UTF8_ENCODING, DEFAULT_ENCODING]:
-            codec = 'utf8'
-        if codec is not None:
-            try:
-                pass # FIX string.decode(codec)
-            except UnicodeDecodeError:
-                type = INVALID_UTF8_SEQUENCE
-                if self.enc == ASCII_ENCODING:
-                    if context == STR_CONTEXT:
-                        type = NON_ASCII_CHARACTER_IN_STRING
-                    else:
-                        type = NON_ASCII_CHARACTER_IN_COMMENT
-            self.add_token(LexError('', type))
-
 
 if __name__ == '__main__':
     # Lexically analyze a file and dump the tokens to stdout.