Skip to content

[mypyc] Use mypy.FORMAT_RE and ConversionSpecifier for % interpolation #10877

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Jul 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 29 additions & 20 deletions mypy/checkstrformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import re

from typing import (
cast, List, Tuple, Dict, Callable, Union, Optional, Pattern, Match, Set, Any
cast, List, Tuple, Dict, Callable, Union, Optional, Pattern, Match, Set
)
from typing_extensions import Final, TYPE_CHECKING

Expand Down Expand Up @@ -50,14 +50,14 @@ def compile_format_re() -> Pattern[str]:
See https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting
The regexp is intentionally a bit wider to report better errors.
"""
key_re = r'(\(([^()]*)\))?' # (optional) parenthesised sequence of characters.
flags_re = r'([#0\-+ ]*)' # (optional) sequence of flags.
width_re = r'(\*|[1-9][0-9]*)?' # (optional) minimum field width (* or numbers).
precision_re = r'(?:\.(\*|[0-9]+)?)?' # (optional) . followed by * or numbers.
key_re = r'(\((?P<key>[^)]*)\))?' # (optional) parenthesised sequence of characters.
flags_re = r'(?P<flag>[#0\-+ ]*)' # (optional) sequence of flags.
width_re = r'(?P<width>[1-9][0-9]*|\*)?' # (optional) minimum field width (* or numbers).
precision_re = r'(?:\.(?P<precision>\*|[0-9]+)?)?' # (optional) . followed by * or numbers.
length_mod_re = r'[hlL]?' # (optional) length modifier (unused).
type_re = r'(.)?' # conversion type.
type_re = r'(?P<type>.)?' # conversion type.
format_re = '%' + key_re + flags_re + width_re + precision_re + length_mod_re + type_re
return re.compile(format_re)
return re.compile('({})'.format(format_re))


def compile_new_format_re(custom_spec: bool) -> Pattern[str]:
Expand Down Expand Up @@ -114,16 +114,20 @@ def compile_new_format_re(custom_spec: bool) -> Pattern[str]:


class ConversionSpecifier:
def __init__(self, key: Optional[str],
flags: str, width: str, precision: str, type: str,
def __init__(self, type: str,
key: Optional[str],
flags: Optional[str],
width: Optional[str],
precision: Optional[str],
format_spec: Optional[str] = None,
conversion: Optional[str] = None,
field: Optional[str] = None) -> None:
field: Optional[str] = None,
whole_seq: Optional[str] = None) -> None:
self.type = type
self.key = key
self.flags = flags
self.width = width
self.precision = precision
self.type = type
# Used only for str.format() calls (it may be custom for types with __format__()).
self.format_spec = format_spec
self.non_standard_format_spec = False
Expand All @@ -132,24 +136,27 @@ def __init__(self, key: Optional[str],
# Full formatted expression (i.e. key plus following attributes and/or indexes).
# Used only for str.format() calls.
self.field = field
self.whole_seq = whole_seq

@classmethod
def from_match(cls, match_obj: Match[str],
def from_match(cls, match: Match[str],
non_standard_spec: bool = False) -> 'ConversionSpecifier':
"""Construct specifier from match object resulting from parsing str.format() call."""
match = cast(Any, match_obj) # TODO: remove this once typeshed is fixed.
if non_standard_spec:
spec = cls(match.group('key'),
flags='', width='', precision='', type='',
spec = cls(type='',
key=match.group('key'),
flags='', width='', precision='',
format_spec=match.group('format_spec'),
conversion=match.group('conversion'),
field=match.group('field'))
spec.non_standard_format_spec = True
return spec
# Replace unmatched optional groups with empty matches (for convenience).
return cls(match.group('key'),
flags=match.group('flags') or '', width=match.group('width') or '',
precision=match.group('precision') or '', type=match.group('type') or '',
return cls(type=match.group('type') or '',
key=match.group('key'),
flags=match.group('flags') or '',
width=match.group('width') or '',
precision=match.group('precision') or '',
format_spec=match.group('format_spec'),
conversion=match.group('conversion'),
field=match.group('field'))
Expand Down Expand Up @@ -622,10 +629,12 @@ def check_str_interpolation(self,

def parse_conversion_specifiers(self, format: str) -> List[ConversionSpecifier]:
specifiers: List[ConversionSpecifier] = []
for parens_key, key, flags, width, precision, type in FORMAT_RE.findall(format):
for whole_seq, parens_key, key, flags, width, precision, type \
in FORMAT_RE.findall(format):
if parens_key == '':
key = None
specifiers.append(ConversionSpecifier(key, flags, width, precision, type))
specifiers.append(ConversionSpecifier(type, key, flags, width, precision,
whole_seq=whole_seq))
return specifiers

def analyze_conversion_specifiers(self, specifiers: List[ConversionSpecifier],
Expand Down
14 changes: 8 additions & 6 deletions mypyc/irbuild/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,8 @@ def transform_basic_comparison(builder: IRBuilder,
def translate_str_format_percent_sign(builder: IRBuilder,
format_expr: StrExpr,
rhs: Expression) -> Value:
literals, conversion_types = tokenizer_printf_style(format_expr.value)
literals, conversion_specifiers = tokenizer_printf_style(format_expr.value)

variables = []
if isinstance(rhs, TupleExpr):
raw_variables = rhs.items
Expand All @@ -578,15 +579,16 @@ def translate_str_format_percent_sign(builder: IRBuilder,
else:
raw_variables = []

is_conversion_matched = (len(conversion_types) == len(raw_variables))
is_conversion_matched = (len(conversion_specifiers) == len(raw_variables))

if is_conversion_matched:
for typ, var in zip(conversion_types, raw_variables):
for specifier, var in zip(conversion_specifiers, raw_variables):
node_type = builder.node_type(var)
if typ == '%d' and (is_int_rprimitive(node_type)
or is_short_int_rprimitive(node_type)):
format_type = specifier.whole_seq
if format_type == '%d' and (is_int_rprimitive(node_type)
or is_short_int_rprimitive(node_type)):
var_str = builder.call_c(int_to_str_op, [builder.accept(var)], format_expr.line)
elif typ == '%s':
elif format_type == '%s':
if is_str_rprimitive(node_type):
var_str = builder.accept(var)
else:
Expand Down
35 changes: 13 additions & 22 deletions mypyc/irbuild/format_str_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,37 @@
import re
from typing import List, Tuple

from mypy.checkstrformat import (
FORMAT_RE, ConversionSpecifier
)
from mypyc.ir.ops import Value, Integer
from mypyc.ir.rtypes import c_pyssize_t_rprimitive
from mypyc.irbuild.builder import IRBuilder
from mypyc.primitives.str_ops import str_build_op

# printf-style String Formatting:
# https://docs.python.org/3/library/stdtypes.html#old-string-formatting
printf_style_pattern = re.compile(r"""
(
% # Start sign
(?:\((?P<key>[^)]*)\))? # Optional: Mapping key
(?P<flag>[-+#0 ]+)? # Optional: Conversion flags
(?P<width>\d+|\*)? # Optional: Minimum field width
(?:\.(?P<precision>\d+|\*))? # Optional: Precision
[hlL]? # Optional: Length modifier, Ignored
(?P<type>[diouxXeEfFgGcrsa]) # Conversion type
| %%)
""", re.VERBOSE)


def tokenizer_printf_style(format_str: str) -> Tuple[List[str], List[str]]:
def tokenizer_printf_style(format_str: str) -> Tuple[List[str], List[ConversionSpecifier]]:
"""Tokenize a printf-style format string using regex.

Return:
A list of string literals and a list of conversion operations
"""
literals = []
format_ops = []
literals: List[str] = []
specifiers: List[ConversionSpecifier] = []
last_end = 0

for m in re.finditer(printf_style_pattern, format_str):
for m in re.finditer(FORMAT_RE, format_str):
whole_seq, parens_key, key, flags, width, precision, conversion_type = m.groups()
specifiers.append(ConversionSpecifier(conversion_type, key, flags, width, precision,
whole_seq=whole_seq))

cur_start = m.start(1)
format_tmp = m.group(1)
literals.append(format_str[last_end:cur_start])
format_ops.append(format_tmp)
last_end = cur_start + len(format_tmp)
last_end = cur_start + len(whole_seq)

literals.append(format_str[last_end:])

return literals, format_ops
return literals, specifiers


def join_formatted_strings(builder: IRBuilder, literals: List[str],
Expand Down
33 changes: 24 additions & 9 deletions mypyc/test/test_stringformatting.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
import unittest
from typing import List

from mypyc.irbuild.format_str_tokenizer import tokenizer_printf_style


class TestStringFormatting(unittest.TestCase):

def test_tokenizer_printf_style(self) -> None:
assert tokenizer_printf_style("I'm %s, id years old") == \
(["I'm ", ', id years old'], ['%s'])
assert tokenizer_printf_style("Test: %i%%, Test: %02d, Test: %.2f") == \
(['Test: ', '', ', Test: ', ', Test: ', ''], ['%i', '%%', '%02d', '%.2f'])
assert tokenizer_printf_style("ioasdfyuia%i%%%20s%d%sdafafadfa%s%d%x%E%.2f") == \
(['ioasdfyuia', '', '', '', '', 'dafafadfa', '', '', '', '', ''],
['%i', '%%', '%20s', '%d', '%s', '%s', '%d', '%x', '%E', '%.2f'])
assert tokenizer_printf_style("Special: %#20.2f%d, test: ") == \
(['Special: ', '', ', test: '], ['%#20.2f', '%d'])

def tokenizer_printf_style_helper(format_str: str,
literals: List[str], conversion: List[str]) -> bool:
l, specs = tokenizer_printf_style(format_str)
return literals == l and conversion == [x.whole_seq for x in specs]

assert tokenizer_printf_style_helper(
"I'm %s, id years old",
["I'm ", ', id years old'],
['%s'])
assert tokenizer_printf_style_helper(
"Test: %i%%, Test: %02d, Test: %.2f",
['Test: ', '', ', Test: ', ', Test: ', ''],
['%i', '%%', '%02d', '%.2f'])
assert tokenizer_printf_style_helper(
"ioasdfyuia%i%%%20s%d%sdafafadfa%s%d%x%E%.2f",
['ioasdfyuia', '', '', '', '', 'dafafadfa', '', '', '', '', ''],
['%i', '%%', '%20s', '%d', '%s', '%s', '%d', '%x', '%E', '%.2f'])
assert tokenizer_printf_style_helper(
"Special: %#20.2f%d, test: ",
['Special: ', '', ', test: '],
['%#20.2f', '%d'])