From dd80a181018b5b07e26e2274f20470a832c97c0b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 30 Nov 2021 00:55:47 +0100 Subject: [PATCH 1/2] Type read_fwf --- pandas/io/parsers/python_parser.py | 31 ++++++++++++++++++++++-------- pandas/io/parsers/readers.py | 13 ++++++++----- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 27d0944572024..5d3cc41f4a291 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -9,6 +9,7 @@ import re import sys from typing import ( + IO, DefaultDict, Hashable, Iterator, @@ -1135,9 +1136,17 @@ class FixedWidthReader(abc.Iterator): A reader of fixed-width lines. """ - def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): + def __init__( + self, + f: IO[str], + colspecs: list[tuple[int, int]] | str, + delimiter: str | None, + comment: str | None, + skiprows: set[int] | None = None, + infer_nrows: int = 100, + ) -> None: self.f = f - self.buffer = None + self.buffer: Iterator | None = None self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " self.comment = comment if colspecs == "infer": @@ -1145,6 +1154,8 @@ def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=1 infer_nrows=infer_nrows, skiprows=skiprows ) else: + # for mypy + assert not isinstance(colspecs, str) self.colspecs = colspecs if not isinstance(self.colspecs, (tuple, list)): @@ -1165,7 +1176,7 @@ def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=1 "2 element tuple or list of integers" ) - def get_rows(self, infer_nrows, skiprows=None): + def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]: """ Read rows from self.f, skipping as specified. 
@@ -1203,7 +1214,9 @@ def get_rows(self, infer_nrows, skiprows=None): self.buffer = iter(buffer_rows) return detect_rows - def detect_colspecs(self, infer_nrows=100, skiprows=None): + def detect_colspecs( + self, infer_nrows: int = 100, skiprows: set[int] | None = None + ) -> list[tuple[int, int]]: # Regex escape the delimiters delimiters = "".join([fr"\{x}" for x in self.delimiter]) pattern = re.compile(f"([^{delimiters}]+)") @@ -1223,7 +1236,7 @@ def detect_colspecs(self, infer_nrows=100, skiprows=None): edge_pairs = list(zip(edges[::2], edges[1::2])) return edge_pairs - def __next__(self): + def __next__(self) -> list[Scalar]: if self.buffer is not None: try: line = next(self.buffer) @@ -1242,13 +1255,15 @@ class FixedWidthFieldParser(PythonParser): See PythonParser for details. """ - def __init__(self, f, **kwds): + def __init__( + self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds + ) -> None: # Support iterators, convert to a list. self.colspecs = kwds.pop("colspecs") self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) - def _make_reader(self, f): + def _make_reader(self, f: IO[str]) -> None: self.data = FixedWidthReader( f, self.colspecs, @@ -1258,7 +1273,7 @@ def _make_reader(self, f): self.infer_nrows, ) - def _remove_empty_lines(self, lines) -> list: + def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: """ Returns the list of lines without the empty ones. With fixed-width fields, empty lines become arrays of empty strings. 
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 63ab10e1e5362..51e9b39c099e8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -760,11 +760,11 @@ def read_table( def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], - colspecs="infer", - widths=None, - infer_nrows=100, + colspecs: list[tuple[int, int]] | str | None = "infer", + widths: list[int] | None = None, + infer_nrows: int = 100, **kwds, -): +) -> DataFrame: r""" Read a table of fixed-width formatted lines into DataFrame. @@ -799,7 +799,7 @@ def read_fwf( Returns ------- - DataFrame or TextParser + DataFrame A comma-separated values (csv) file is returned as two-dimensional data structure with labeled axes. @@ -825,6 +825,9 @@ def read_fwf( colspecs.append((col, col + w)) col += w + # for mypy + assert colspecs is not None + # GH#40830 # Ensure length of `colspecs` matches length of `names` names = kwds.get("names") From a4dede37f301778ecbeaff35624614aa4c0321c3 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 20:32:02 +0100 Subject: [PATCH 2/2] Address review --- pandas/io/parsers/python_parser.py | 7 +++---- pandas/io/parsers/readers.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 5d3cc41f4a291..b493c4f12fb31 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -13,6 +13,7 @@ DefaultDict, Hashable, Iterator, + Literal, Mapping, Sequence, cast, @@ -1139,7 +1140,7 @@ class FixedWidthReader(abc.Iterator): def __init__( self, f: IO[str], - colspecs: list[tuple[int, int]] | str, + colspecs: list[tuple[int, int]] | Literal["infer"], delimiter: str | None, comment: str | None, skiprows: set[int] | None = None, @@ -1154,8 +1155,6 @@ def __init__( infer_nrows=infer_nrows, skiprows=skiprows ) else: - # for mypy - assert not isinstance(colspecs, str) self.colspecs = colspecs 
if not isinstance(self.colspecs, (tuple, list)): @@ -1236,7 +1235,7 @@ def detect_colspecs( edge_pairs = list(zip(edges[::2], edges[1::2])) return edge_pairs - def __next__(self) -> list[Scalar]: + def __next__(self) -> list[str]: if self.buffer is not None: try: line = next(self.buffer) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 51e9b39c099e8..6a2a7b791c048 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -764,7 +764,7 @@ def read_fwf( widths: list[int] | None = None, infer_nrows: int = 100, **kwds, -) -> DataFrame: +) -> DataFrame | TextFileReader: r""" Read a table of fixed-width formatted lines into DataFrame. @@ -799,7 +799,7 @@ def read_fwf( Returns ------- - DataFrame + DataFrame or TextFileReader A comma-separated values (csv) file is returned as two-dimensional data structure with labeled axes.