Skip to content

Commit fcee66d

Browse files
authored
Add support for byte and unicode Literal strings (#6087)
This pull request adds support for byte and unicode Literal strings. I left in some comments explaining some nuances of the implementation; here are a few additional meta-notes: 1. I reworded several of the comments suggesting that the way we represent bytes as a string is a "hack" or that we should eventually switch to representing bytes as literally bytes. I started with that approach but ultimately rejected it: I ended up having to constantly serialize/deserialize between bytes and strings, which I felt complicated the code. As a result, I decided that the solution we had previously is in fact, from a high-level perspective, the best possible approach. (The actual code for translating the output of `typed_ast` into a human-readable string *is* admittedly a bit hacky though.) In any case, the phrase "how mypy currently parses the contents of bytes literals" is severely out-of-date anyways. That comment was added about 3 years ago, when we were adding the fast parser for the first time and running it concurrently with the actual parser. 2. I removed the `is_stub` field from `fastparse2.ASTConverter`: it turned out we were just never using that field. 3. One complication I ran into was figuring out how to handle forward references to literal strings. For example, suppose we have the type `List["Literal['foo']"]`. Do we treat this as being equivalent to `List[Literal[u'foo']]` or `List[Literal[b'foo']]`? If this is a Python 3 file or a Python 2 file with `unicode_literals`, we'd want to pick the former. If this is a standard Python 2 file, we'd want to pick the latter. In order to make this happen, I decided to use a heuristic where the type of the "outer" string decides the type of the "inner" string. For example: - In Python 3, `"Literal['foo']"` is a unicode string. So, the inner `Literal['foo']` will be treated as the same as `Literal[u'foo']`. - The same thing happens when using Python 2 with `unicode_literals`. 
- In Python 3, it is illegal to use a byte string as a forward reference. So, types like `List[b"Literal['foo']"]` are already illegal. - In standard Python 2, `"Literal['foo']"` is a byte string. So the inner `Literal['foo']` will be treated as the same as `Literal[b'foo']`. 4. I will add tests validating that all of this stuff works as expected with incremental and fine-grained mode in a separate diff -- probably after fixing and landing #6075, which I intend to use as a baseline foundation.
1 parent 9a3fa64 commit fcee66d

10 files changed

+668
-68
lines changed

mypy/checkexpr.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1784,11 +1784,17 @@ def visit_str_expr(self, e: StrExpr) -> Type:
17841784

17851785
def visit_bytes_expr(self, e: BytesExpr) -> Type:
17861786
"""Type check a bytes literal (trivial)."""
1787-
return self.named_type('builtins.bytes')
1787+
typ = self.named_type('builtins.bytes')
1788+
if is_literal_type_like(self.type_context[-1]):
1789+
return LiteralType(value=e.value, fallback=typ)
1790+
return typ
17881791

17891792
def visit_unicode_expr(self, e: UnicodeExpr) -> Type:
17901793
"""Type check a unicode literal (trivial)."""
1791-
return self.named_type('builtins.unicode')
1794+
typ = self.named_type('builtins.unicode')
1795+
if is_literal_type_like(self.type_context[-1]):
1796+
return LiteralType(value=e.value, fallback=typ)
1797+
return typ
17921798

17931799
def visit_float_expr(self, e: FloatExpr) -> Type:
17941800
"""Type check a float literal (trivial)."""

mypy/exprtotype.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
ListExpr, StrExpr, BytesExpr, UnicodeExpr, EllipsisExpr, CallExpr,
66
get_member_expr_fullname
77
)
8-
from mypy.fastparse import parse_type_comment, parse_type_string
8+
from mypy.fastparse import parse_type_string
99
from mypy.types import (
1010
Type, UnboundType, TypeList, EllipsisType, AnyType, Optional, CallableArgument, TypeOfAny,
1111
RawLiteralType,
@@ -111,8 +111,15 @@ def expr_to_unanalyzed_type(expr: Expression, _parent: Optional[Expression] = No
111111
elif isinstance(expr, ListExpr):
112112
return TypeList([expr_to_unanalyzed_type(t, expr) for t in expr.items],
113113
line=expr.line, column=expr.column)
114-
elif isinstance(expr, (StrExpr, BytesExpr, UnicodeExpr)):
115-
return parse_type_string(expr.value, expr.line, expr.column)
114+
elif isinstance(expr, StrExpr):
115+
return parse_type_string(expr.value, 'builtins.str', expr.line, expr.column,
116+
assume_str_is_unicode=expr.from_python_3)
117+
elif isinstance(expr, BytesExpr):
118+
return parse_type_string(expr.value, 'builtins.bytes', expr.line, expr.column,
119+
assume_str_is_unicode=False)
120+
elif isinstance(expr, UnicodeExpr):
121+
return parse_type_string(expr.value, 'builtins.unicode', expr.line, expr.column,
122+
assume_str_is_unicode=True)
116123
elif isinstance(expr, UnaryExpr):
117124
typ = expr_to_unanalyzed_type(expr.expr)
118125
if isinstance(typ, RawLiteralType) and isinstance(typ.value, int) and expr.op == '-':

mypy/fastparse.py

+70-16
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
NameConstant,
5252
Expression as ast3_Expression,
5353
Str,
54+
Bytes,
5455
Index,
5556
Num,
5657
UnaryOp,
@@ -140,7 +141,11 @@ def parse(source: Union[str, bytes],
140141
return tree
141142

142143

143-
def parse_type_comment(type_comment: str, line: int, errors: Optional[Errors]) -> Optional[Type]:
144+
def parse_type_comment(type_comment: str,
145+
line: int,
146+
errors: Optional[Errors],
147+
assume_str_is_unicode: bool = True,
148+
) -> Optional[Type]:
144149
try:
145150
typ = ast3.parse(type_comment, '<type_comment>', 'eval')
146151
except SyntaxError as e:
@@ -151,24 +156,39 @@ def parse_type_comment(type_comment: str, line: int, errors: Optional[Errors]) -
151156
raise
152157
else:
153158
assert isinstance(typ, ast3_Expression)
154-
return TypeConverter(errors, line=line).visit(typ.body)
159+
return TypeConverter(errors, line=line,
160+
assume_str_is_unicode=assume_str_is_unicode).visit(typ.body)
155161

156162

157-
def parse_type_string(expr_string: str, line: int, column: int) -> Type:
158-
"""Parses a type that was originally present inside of an explicit string.
163+
def parse_type_string(expr_string: str, expr_fallback_name: str,
164+
line: int, column: int, assume_str_is_unicode: bool = True) -> Type:
165+
"""Parses a type that was originally present inside of an explicit string,
166+
byte string, or unicode string.
159167
160168
For example, suppose we have the type `Foo["blah"]`. We should parse the
161169
string expression "blah" using this function.
170+
171+
If `assume_str_is_unicode` is set to true, this function will assume that
172+
`Foo["blah"]` is equivalent to `Foo[u"blah"]`. Otherwise, it assumes it's
173+
equivalent to `Foo[b"blah"]`.
174+
175+
The caller is responsible for keeping track of the context in which the
176+
type string was encountered (e.g. in Python 3 code, Python 2 code, Python 2
177+
code with unicode_literals...) and setting `assume_str_is_unicode` accordingly.
162178
"""
163179
try:
164-
node = parse_type_comment(expr_string.strip(), line=line, errors=None)
180+
node = parse_type_comment(expr_string.strip(), line=line, errors=None,
181+
assume_str_is_unicode=assume_str_is_unicode)
165182
if isinstance(node, UnboundType) and node.original_str_expr is None:
166183
node.original_str_expr = expr_string
184+
node.original_str_fallback = expr_fallback_name
167185
return node
168186
else:
169-
return RawLiteralType(expr_string, 'builtins.str', line, column)
170-
except SyntaxError:
171-
return RawLiteralType(expr_string, 'builtins.str', line, column)
187+
return RawLiteralType(expr_string, expr_fallback_name, line, column)
188+
except (SyntaxError, ValueError):
189+
# Note: the parser will raise a `ValueError` instead of a SyntaxError if
190+
# the string happens to contain things like \x00.
191+
return RawLiteralType(expr_string, expr_fallback_name, line, column)
172192

173193

174194
def is_no_type_check_decorator(expr: ast3.expr) -> bool:
@@ -966,10 +986,7 @@ def visit_FormattedValue(self, n: ast3.FormattedValue) -> Expression:
966986

967987
# Bytes(bytes s)
968988
def visit_Bytes(self, n: ast3.Bytes) -> Union[BytesExpr, StrExpr]:
969-
# The following line is a bit hacky, but is the best way to maintain
970-
# compatibility with how mypy currently parses the contents of bytes literals.
971-
contents = str(n.s)[2:-1]
972-
e = BytesExpr(contents)
989+
e = BytesExpr(bytes_to_human_readable_repr(n.s))
973990
return self.set_line(e, n)
974991

975992
# NameConstant(singleton value)
@@ -1042,10 +1059,15 @@ def visit_Index(self, n: Index) -> Node:
10421059

10431060

10441061
class TypeConverter:
1045-
def __init__(self, errors: Optional[Errors], line: int = -1) -> None:
1062+
def __init__(self,
1063+
errors: Optional[Errors],
1064+
line: int = -1,
1065+
assume_str_is_unicode: bool = True,
1066+
) -> None:
10461067
self.errors = errors
10471068
self.line = line
10481069
self.node_stack = [] # type: List[AST]
1070+
self.assume_str_is_unicode = assume_str_is_unicode
10491071

10501072
@overload
10511073
def visit(self, node: ast3.expr) -> Type: ...
@@ -1090,8 +1112,11 @@ def visit_raw_str(self, s: str) -> Type:
10901112
# An escape hatch that allows the AST walker in fastparse2 to
10911113
# directly hook into the Python 3.5 type converter in some cases
10921114
# without needing to create an intermediary `Str` object.
1093-
return (parse_type_comment(s.strip(), self.line, self.errors) or
1094-
AnyType(TypeOfAny.from_error))
1115+
return (parse_type_comment(s.strip(),
1116+
self.line,
1117+
self.errors,
1118+
self.assume_str_is_unicode)
1119+
or AnyType(TypeOfAny.from_error))
10951120

10961121
def visit_Call(self, e: Call) -> Type:
10971122
# Parse the arg constructor
@@ -1190,7 +1215,22 @@ def visit_Num(self, n: Num) -> Type:
11901215

11911216
# Str(string s)
11921217
def visit_Str(self, n: Str) -> Type:
1193-
return parse_type_string(n.s, line=self.line, column=-1)
1218+
# Note: we transform these fallback types into the correct types in
1219+
# 'typeanal.py' -- specifically in the named_type_with_normalized_str method.
1220+
# If we're analyzing Python 3, that function will translate 'builtins.unicode'
1221+
# into 'builtins.str'. In contrast, if we're analyzing Python 2 code, we'll
1222+
# translate 'builtins.bytes' in the method below into 'builtins.str'.
1223+
if 'u' in n.kind or self.assume_str_is_unicode:
1224+
return parse_type_string(n.s, 'builtins.unicode', self.line, n.col_offset,
1225+
assume_str_is_unicode=self.assume_str_is_unicode)
1226+
else:
1227+
return parse_type_string(n.s, 'builtins.str', self.line, n.col_offset,
1228+
assume_str_is_unicode=self.assume_str_is_unicode)
1229+
1230+
# Bytes(bytes s)
1231+
def visit_Bytes(self, n: Bytes) -> Type:
1232+
contents = bytes_to_human_readable_repr(n.s)
1233+
return RawLiteralType(contents, 'builtins.bytes', self.line, column=n.col_offset)
11941234

11951235
# Subscript(expr value, slice slice, expr_context ctx)
11961236
def visit_Subscript(self, n: ast3.Subscript) -> Type:
@@ -1246,3 +1286,17 @@ def stringify_name(n: AST) -> Optional[str]:
12461286
if sv is not None:
12471287
return "{}.{}".format(sv, n.attr)
12481288
return None # Can't do it.
1289+
1290+
1291+
def bytes_to_human_readable_repr(b: bytes) -> str:
1292+
"""Converts bytes into some human-readable representation. Unprintable
1293+
bytes such as the nul byte are escaped. For example:
1294+
1295+
>>> b = bytes([102, 111, 111, 10, 0])
1296+
>>> s = bytes_to_human_readable_repr(b)
1297+
>>> print(s)
1298+
foo\n\x00
1299+
>>> print(repr(s))
1300+
'foo\\n\\x00'
1301+
"""
1302+
return str(b)[2:-1]

mypy/fastparse2.py

+46-23
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
)
4646
from mypy import messages
4747
from mypy.errors import Errors
48-
from mypy.fastparse import TypeConverter, parse_type_comment
48+
from mypy.fastparse import TypeConverter, parse_type_comment, bytes_to_human_readable_repr
4949
from mypy.options import Options
5050

5151
try:
@@ -113,7 +113,6 @@ def parse(source: Union[str, bytes],
113113
assert options.python_version[0] < 3 and not is_stub_file
114114
ast = ast27.parse(source, fnam, 'exec')
115115
tree = ASTConverter(options=options,
116-
is_stub=is_stub_file,
117116
errors=errors,
118117
).visit(ast)
119118
assert isinstance(tree, MypyFile)
@@ -141,15 +140,32 @@ def is_no_type_check_decorator(expr: ast27.expr) -> bool:
141140
class ASTConverter:
142141
def __init__(self,
143142
options: Options,
144-
is_stub: bool,
145143
errors: Errors) -> None:
146144
self.class_nesting = 0
147145
self.imports = [] # type: List[ImportBase]
148146

149147
self.options = options
150-
self.is_stub = is_stub
151148
self.errors = errors
152149

150+
# Indicates whether this file is being parsed with unicode_literals enabled.
151+
# Note: typed_ast already naturally takes unicode_literals into account when
152+
# parsing so we don't have to worry when analyzing strings within this class.
153+
#
154+
# The only place where we use this field is when we call fastparse's TypeConverter
155+
# and any related methods. That class accepts a Python 3 AST instead of a Python 2
156+
# AST: as a result, it doesn't special-case the `unicode_literals` import and won't know
157+
# exactly whether to parse some string as bytes or unicode.
158+
#
159+
# This distinction is relevant mostly when handling Literal types -- Literal[u"foo"]
160+
# is not the same type as Literal[b"foo"], and Literal["foo"] could mean either the
161+
# former or the latter based on context.
162+
#
163+
# This field is set in the 'visit_ImportFrom' method: it's ok to delay computing it
164+
# because any `from __future__ import blah` import must be located at the top of the
165+
# file, with the exception of the docstring. This means we're guaranteed to correctly
166+
# set this field before we encounter any type hints.
167+
self.unicode_literals = False
168+
153169
# Cache of visit_X methods keyed by type of visited object
154170
self.visitor_cache = {} # type: Dict[type, Callable[[Optional[AST]], Any]]
155171

@@ -306,7 +322,8 @@ def visit_Module(self, mod: ast27.Module) -> MypyFile:
306322
# arg? kwarg, expr* defaults)
307323
def visit_FunctionDef(self, n: ast27.FunctionDef) -> Statement:
308324
lineno = n.lineno
309-
converter = TypeConverter(self.errors, line=lineno)
325+
converter = TypeConverter(self.errors, line=lineno,
326+
assume_str_is_unicode=self.unicode_literals)
310327
args, decompose_stmts = self.transform_args(n.args, lineno)
311328

312329
arg_kinds = [arg.kind for arg in args]
@@ -413,7 +430,8 @@ def transform_args(self,
413430
line: int,
414431
) -> Tuple[List[Argument], List[Statement]]:
415432
type_comments = n.type_comments # type: Sequence[Optional[str]]
416-
converter = TypeConverter(self.errors, line=line)
433+
converter = TypeConverter(self.errors, line=line,
434+
assume_str_is_unicode=self.unicode_literals)
417435
decompose_stmts = [] # type: List[Statement]
418436

419437
n_args = n.args
@@ -532,7 +550,8 @@ def visit_Delete(self, n: ast27.Delete) -> DelStmt:
532550
def visit_Assign(self, n: ast27.Assign) -> AssignmentStmt:
533551
typ = None
534552
if n.type_comment:
535-
typ = parse_type_comment(n.type_comment, n.lineno, self.errors)
553+
typ = parse_type_comment(n.type_comment, n.lineno, self.errors,
554+
assume_str_is_unicode=self.unicode_literals)
536555

537556
stmt = AssignmentStmt(self.translate_expr_list(n.targets),
538557
self.visit(n.value),
@@ -549,7 +568,8 @@ def visit_AugAssign(self, n: ast27.AugAssign) -> OperatorAssignmentStmt:
549568
# For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
550569
def visit_For(self, n: ast27.For) -> ForStmt:
551570
if n.type_comment is not None:
552-
target_type = parse_type_comment(n.type_comment, n.lineno, self.errors)
571+
target_type = parse_type_comment(n.type_comment, n.lineno, self.errors,
572+
assume_str_is_unicode=self.unicode_literals)
553573
else:
554574
target_type = None
555575
stmt = ForStmt(self.visit(n.target),
@@ -576,7 +596,8 @@ def visit_If(self, n: ast27.If) -> IfStmt:
576596
# With(withitem* items, stmt* body, string? type_comment)
577597
def visit_With(self, n: ast27.With) -> WithStmt:
578598
if n.type_comment is not None:
579-
target_type = parse_type_comment(n.type_comment, n.lineno, self.errors)
599+
target_type = parse_type_comment(n.type_comment, n.lineno, self.errors,
600+
assume_str_is_unicode=self.unicode_literals)
580601
else:
581602
target_type = None
582603
stmt = WithStmt([self.visit(n.context_expr)],
@@ -680,9 +701,12 @@ def visit_ImportFrom(self, n: ast27.ImportFrom) -> ImportBase:
680701
mod = n.module if n.module is not None else ''
681702
i = ImportAll(mod, n.level) # type: ImportBase
682703
else:
683-
i = ImportFrom(self.translate_module_id(n.module) if n.module is not None else '',
684-
n.level,
685-
[(a.name, a.asname) for a in n.names])
704+
module_id = self.translate_module_id(n.module) if n.module is not None else ''
705+
i = ImportFrom(module_id, n.level, [(a.name, a.asname) for a in n.names])
706+
707+
# See comments in the constructor for more information about this field.
708+
if module_id == '__future__' and any(a.name == 'unicode_literals' for a in n.names):
709+
self.unicode_literals = True
686710
self.imports.append(i)
687711
return self.set_line(i, n)
688712

@@ -900,18 +924,17 @@ def visit_Num(self, n: ast27.Num) -> Expression:
900924

901925
# Str(string s)
902926
def visit_Str(self, n: ast27.Str) -> Expression:
903-
# Hack: assume all string literals in Python 2 stubs are normal
904-
# strs (i.e. not unicode). All stubs are parsed with the Python 3
905-
# parser, which causes unprefixed string literals to be interpreted
906-
# as unicode instead of bytes. This hack is generally okay,
907-
# because mypy considers str literals to be compatible with
908-
# unicode.
927+
# Note: typed_ast.ast27 will handle unicode_literals for us. If
928+
# n.s is of type 'bytes', we know unicode_literals was not enabled;
929+
# otherwise we know it was.
930+
#
931+
# Note that the following code is NOT run when parsing Python 2.7 stubs:
932+
# we always parse stub files (no matter what version) using the Python 3
933+
# parser. This is also why string literals in Python 2.7 stubs are assumed
934+
# to be unicode.
909935
if isinstance(n.s, bytes):
910-
value = n.s
911-
# The following line is a bit hacky, but is the best way to maintain
912-
# compatibility with how mypy currently parses the contents of bytes literals.
913-
contents = str(value)[2:-1]
914-
e = StrExpr(contents) # type: Union[StrExpr, UnicodeExpr]
936+
contents = bytes_to_human_readable_repr(n.s)
937+
e = StrExpr(contents, from_python_3=False) # type: Union[StrExpr, UnicodeExpr]
915938
return self.set_line(e, n)
916939
else:
917940
e = UnicodeExpr(n.s)

mypy/literals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def visit_int_expr(self, e: IntExpr) -> Key:
9898
return ('Literal', e.value)
9999

100100
def visit_str_expr(self, e: StrExpr) -> Key:
101-
return ('Literal', e.value)
101+
return ('Literal', e.value, e.from_python_3)
102102

103103
def visit_bytes_expr(self, e: BytesExpr) -> Key:
104104
return ('Literal', e.value)

0 commit comments

Comments
 (0)