From 92ed9b7e287b78bed80805273878b9aa5824b853 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 16:51:40 +0100 Subject: [PATCH 1/9] Start typing --- pandas/io/parsers/base_parser.py | 27 ++++++++++++++++++++++++--- pandas/io/parsers/c_parser_wrapper.py | 26 +++++++++++++++++++++----- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 339585810bec1..729928bf9e5c7 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -13,6 +13,7 @@ Sequence, cast, final, + overload, ) import warnings @@ -27,6 +28,7 @@ ArrayLike, DtypeArg, FilePathOrBuffer, + Scalar, ) from pandas.errors import ( ParserError, @@ -406,7 +408,9 @@ def _maybe_dedup_names(self, names): return names @final - def _maybe_make_multi_index_columns(self, columns, col_names=None): + def _maybe_make_multi_index_columns( + self, columns: list[Scalar | tuple], col_names=None + ): # possibly create a column mi here if _is_potential_multi_index(columns): columns = MultiIndex.from_tuples(columns, names=col_names) @@ -833,7 +837,23 @@ def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: stacklevel=6, ) - def _evaluate_usecols(self, usecols, names): + @overload + def _evaluate_usecols(self, usecols: Callable, names: list[Scalar]) -> set[int]: + ... + + @overload + def _evaluate_usecols( + self, + usecols: set[str | int], + names: list[Scalar], + ) -> set[str | int]: + ... + + def _evaluate_usecols( + self, + usecols: Callable | set[str | int], + names: list[Scalar], + ) -> set[str | int]: """ Check whether or not the 'usecols' parameter is a callable. If so, enumerates the 'names' @@ -1189,7 +1209,8 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na): def _is_potential_multi_index( - columns, index_col: bool | Sequence[int] | None = None + columns: list[Scalar] | list[tuple] | MultiIndex, + index_col: bool | Sequence[int] | None = None, ) -> bool: """ Check whether or not the `columns` parameter diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 352dd998dda0f..a3f0a12454b10 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -7,7 +7,10 @@ import pandas._libs.parsers as parsers from pandas._typing import ( ArrayLike, + DtypeArg, + DtypeObj, FilePathOrBuffer, + Scalar, ) from pandas.errors import DtypeWarning @@ -18,6 +21,10 @@ from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas import ( + Index, + MultiIndex, +) from pandas.core.indexes.api import ensure_index_from_sequences from pandas.io.parsers.base_parser import ( @@ -30,7 +37,7 @@ class CParserWrapper(ParserBase): low_memory: bool _reader: parsers.TextReader - def __init__(self, src: FilePathOrBuffer, **kwds): + def __init__(self, src: FilePathOrBuffer, **kwds) -> None: self.kwds = kwds kwds = kwds.copy() ParserBase.__init__(self, kwds) @@ -189,7 +196,7 @@ def close(self) -> None: except ValueError: pass - def _set_noconvert_columns(self): + def _set_noconvert_columns(self) -> None: """ Set the columns that should not undergo dtype conversions. 
@@ -210,7 +217,14 @@ def _set_noconvert_columns(self): for col in noconvert_columns: self._reader.set_noconvert(col) - def read(self, nrows=None): + def read( + self, + nrows: int | None = None, + ) -> tuple[ + Index | MultiIndex | None, + list[Scalar] | MultiIndex, + dict[Scalar | tuple, ArrayLike], + ]: try: if self.low_memory: chunks = self._reader.read_low_memory(nrows) @@ -306,7 +320,7 @@ def read(self, nrows=None): return index, names, data - def _filter_usecols(self, names): + def _filter_usecols(self, names: list[Scalar]) -> list[Scalar]: # hackish usecols = self._evaluate_usecols(self.usecols, names) if usecols is not None and len(names) != len(usecols): @@ -391,7 +405,9 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: return result -def ensure_dtype_objs(dtype): +def ensure_dtype_objs( + dtype: DtypeArg | dict[Scalar, DtypeArg] | None +) -> DtypeObj | dict[Scalar, DtypeObj] | None: """ Ensure we have either None, a dtype object, or a dictionary mapping to dtype objects. From 182217a35c2d75fb9e40f05bdb9f28566384ce7c Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 18:15:47 +0100 Subject: [PATCH 2/9] Continue typing --- pandas/io/parsers/base_parser.py | 28 ++++++++++++++++----------- pandas/io/parsers/c_parser_wrapper.py | 4 ++-- pandas/io/parsers/python_parser.py | 1 + 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 729928bf9e5c7..941c55526cdd8 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -10,7 +10,9 @@ Callable, DefaultDict, Iterable, + List, Sequence, + Tuple, cast, final, overload, @@ -409,11 +411,15 @@ def _maybe_dedup_names(self, names): @final def _maybe_make_multi_index_columns( - self, columns: list[Scalar | tuple], col_names=None - ): + self, + columns: list[Scalar] | list[tuple], + col_names: list[Scalar | None] | list[tuple] | None = None, + ) -> list[Scalar] | MultiIndex: # possibly create a column mi here if _is_potential_multi_index(columns): - columns = MultiIndex.from_tuples(columns, names=col_names) + columns = cast(List[Tuple], columns) + return MultiIndex.from_tuples(columns, names=col_names) + columns = cast(List[Scalar], columns) return columns @final @@ -440,7 +446,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): index = index.set_names(indexnamerow[:coffset]) # maybe create a mi on the columns - columns = self._maybe_make_multi_index_columns(columns, self.col_names) + columns = self._maybe_make_multi_index_columns(columns) return index, columns @@ -842,18 +848,18 @@ def _evaluate_usecols(self, usecols: Callable, names: list[Scalar]) -> set[int]: ... @overload - def _evaluate_usecols( - self, - usecols: set[str | int], - names: list[Scalar], - ) -> set[str | int]: + def _evaluate_usecols(self, usecols: set[int], names: list[Scalar]) -> set[int]: + ... + + @overload + def _evaluate_usecols(self, usecols: set[str], names: list[Scalar]) -> set[str]: ... def _evaluate_usecols( self, - usecols: Callable | set[str | int], + usecols: Callable | set[str] | set[int], names: list[Scalar], - ) -> set[str | int]: + ) -> set[str] | set[int]: """ Check whether or not the 'usecols' parameter is a callable. 
If so, enumerates the 'names' diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index a3f0a12454b10..3baa5340d74d8 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -413,7 +413,7 @@ def ensure_dtype_objs( dtype objects. """ if isinstance(dtype, dict): - dtype = {k: pandas_dtype(dtype[k]) for k in dtype} + return {k: pandas_dtype(dtype[k]) for k in dtype} elif dtype is not None: - dtype = pandas_dtype(dtype) + return pandas_dtype(dtype) return dtype diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index b0e868b260369..a1145f070e0cf 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -528,6 +528,7 @@ def _handle_usecols( usecols_key is used if there are string usecols. """ + col_indices: set[int] | list[int] if self.usecols is not None: if callable(self.usecols): col_indices = self._evaluate_usecols(self.usecols, usecols_key) From 091b052fd7ad7f36cf0039a82e4fc91fca38de60 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 29 Nov 2021 23:11:52 +0100 Subject: [PATCH 3/9] Resolve conflicts --- pandas/io/parsers/base_parser.py | 22 ++++++++++++++-------- pandas/io/parsers/c_parser_wrapper.py | 20 ++++++++++++-------- pandas/io/parsers/python_parser.py | 4 ++-- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 13c9a76e61f89..b3f3f7e3187c0 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -453,9 +453,9 @@ def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: @final def _maybe_make_multi_index_columns( self, - columns: list[Scalar] | list[tuple], - col_names: list[Scalar | None] | list[tuple] | None = None, - ) -> list[Scalar] | MultiIndex: + columns: Sequence[Hashable], + col_names: Sequence[Hashable] | None = None, + ) -> Sequence[Hashable] | MultiIndex: # possibly create a column mi here if _is_potential_multi_index(columns): columns = cast(List[Tuple], columns) @@ -940,21 +940,27 @@ def _check_data_length( ) @overload - def _evaluate_usecols(self, usecols: Callable, names: list[Scalar]) -> set[int]: + def _evaluate_usecols( + self, usecols: Callable, names: Sequence[Hashable] + ) -> set[int]: ... @overload - def _evaluate_usecols(self, usecols: set[int], names: list[Scalar]) -> set[int]: + def _evaluate_usecols( + self, usecols: set[int], names: Sequence[Hashable] + ) -> set[int]: ... @overload - def _evaluate_usecols(self, usecols: set[str], names: list[Scalar]) -> set[str]: + def _evaluate_usecols( + self, usecols: set[str], names: Sequence[Hashable] + ) -> set[str]: ... 
def _evaluate_usecols( self, usecols: Callable | set[str] | set[int], - names: list[Scalar], + names: Sequence[Hashable], ) -> set[str] | set[int]: """ Check whether or not the 'usecols' parameter @@ -1321,7 +1327,7 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na): def _is_potential_multi_index( - columns: list[Scalar] | list[tuple] | MultiIndex, + columns: Sequence[Hashable] | MultiIndex, index_col: bool | Sequence[int] | None = None, ) -> bool: """ diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 2d7181b9de18f..ddf19e841dc69 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,10 @@ from __future__ import annotations +from typing import ( + Hashable, + Mapping, + Sequence, +) import warnings import numpy as np @@ -11,7 +16,6 @@ DtypeObj, FilePath, ReadCsvBuffer, - Scalar, ) from pandas.errors import DtypeWarning from pandas.util._exceptions import find_stack_level @@ -226,8 +230,8 @@ def read( nrows: int | None = None, ) -> tuple[ Index | MultiIndex | None, - list[Scalar] | MultiIndex, - dict[Scalar | tuple, ArrayLike], + Sequence[Hashable] | MultiIndex, + Mapping[Hashable, ArrayLike], ]: try: if self.low_memory: @@ -320,11 +324,11 @@ def read( index, names = self._make_index(date_data, alldata, names) # maybe create a mi on the columns - names = self._maybe_make_multi_index_columns(names, self.col_names) + conv_names = self._maybe_make_multi_index_columns(names, self.col_names) - return index, names, date_data + return index, conv_names, date_data - def _filter_usecols(self, names: list[Scalar]) -> list[Scalar]: + def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: # hackish usecols = self._evaluate_usecols(self.usecols, names) if usecols is not None and len(names) != len(usecols): @@ -410,8 +414,8 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: def ensure_dtype_objs( - dtype: DtypeArg | dict[Scalar, DtypeArg] | None -) -> DtypeObj | dict[Scalar, DtypeObj] | None: + dtype: DtypeArg | Mapping[Hashable, DtypeArg] | None +) -> DtypeObj | Mapping[Hashable, DtypeObj] | None: """ Ensure we have either None, a dtype object, or a dictionary mapping to dtype objects. 
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 506ffd24a92cc..ba2805aded121 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -268,8 +268,8 @@ def read(self, rows: int | None = None): self.index_names, self.dtype, ) - columns = self._maybe_make_multi_index_columns(columns, self.col_names) - return index, columns, col_dict + conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names) + return index, conv_columns, col_dict # handle new style for names in index count_empty_content_vals = count_empty_vals(content[0]) From a9b4e0747ccee303809438d65ab6291db071a363 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 29 Nov 2021 23:19:45 +0100 Subject: [PATCH 4/9] Add argument back in --- pandas/io/parsers/base_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index b3f3f7e3187c0..f7b5d13cad1c4 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -487,7 +487,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): index = index.set_names(indexnamerow[:coffset]) # maybe create a mi on the columns - columns = self._maybe_make_multi_index_columns(columns) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns From 2596f199ea5377ca977e859f98d91527a14b874f Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 20:41:48 +0100 Subject: [PATCH 5/9] Adress review --- pandas/io/parsers/base_parser.py | 14 +++----------- pandas/io/parsers/c_parser_wrapper.py | 4 ++-- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index f7b5d13cad1c4..ed3cc71018df6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -34,7 +34,6 @@ DtypeArg, FilePath, ReadCsvBuffer, - Scalar, ) from pandas.errors import ( ParserError, @@ -458,9 +457,8 @@ def _maybe_make_multi_index_columns( ) -> Sequence[Hashable] | MultiIndex: # possibly create a column mi here if _is_potential_multi_index(columns): - columns = cast(List[Tuple], columns) - return MultiIndex.from_tuples(columns, names=col_names) - columns = cast(List[Scalar], columns) + list_columns = cast(List[Tuple], columns) + return MultiIndex.from_tuples(list_columns, names=col_names) return columns @final @@ -941,13 +939,7 @@ def _check_data_length( @overload def _evaluate_usecols( - self, usecols: Callable, names: Sequence[Hashable] - ) -> set[int]: - ... - - @overload - def _evaluate_usecols( - self, usecols: set[int], names: Sequence[Hashable] + self, usecols: set[int] | Callable, names: Sequence[Hashable] ) -> set[int]: ... diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index ddf19e841dc69..c58e166bb0002 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -414,8 +414,8 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: def ensure_dtype_objs( - dtype: DtypeArg | Mapping[Hashable, DtypeArg] | None -) -> DtypeObj | Mapping[Hashable, DtypeObj] | None: + dtype: DtypeArg | dict[Hashable, DtypeArg] | None +) -> DtypeObj | dict[Hashable, DtypeObj] | None: """ Ensure we have either None, a dtype object, or a dictionary mapping to dtype objects. 
From 95a0de0cc11d242417c7b10f17cb94aae2c7db9d Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 21:16:27 +0100 Subject: [PATCH 6/9] Improve callable --- pandas/io/parsers/base_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index ed3cc71018df6..d9ed6cbe11e55 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -939,7 +939,7 @@ def _check_data_length( @overload def _evaluate_usecols( - self, usecols: set[int] | Callable, names: Sequence[Hashable] + self, usecols: set[int] | Callable[[Hashable], int], names: Sequence[Hashable] ) -> set[int]: ... @@ -951,7 +951,7 @@ def _evaluate_usecols( def _evaluate_usecols( self, - usecols: Callable | set[str] | set[int], + usecols: Callable[[Hashable], int] | set[str] | set[int], names: Sequence[Hashable], ) -> set[str] | set[int]: """ From 34795f818c617fe99fe65f3aadf5faf9fb23d218 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 18:50:42 +0100 Subject: [PATCH 7/9] Remove return from init --- pandas/io/parsers/c_parser_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index c58e166bb0002..988dcd3d8a124 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -45,7 +45,7 @@ class CParserWrapper(ParserBase): def __init__( self, src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds - ) -> None: + ): self.kwds = kwds kwds = kwds.copy() ParserBase.__init__(self, kwds) From fa8fc9ba866f73844de26cfbf3ebc216462a885b Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 19:45:03 +0100 Subject: [PATCH 8/9] Change callable --- pandas/io/parsers/base_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index d9ed6cbe11e55..8f8c496af498a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -939,7 +939,7 @@ def _check_data_length( @overload def _evaluate_usecols( - self, usecols: set[int] | Callable[[Hashable], int], names: Sequence[Hashable] + self, usecols: set[int] | Callable[[Hashable], bool], names: Sequence[Hashable] ) -> set[int]: ... @@ -951,7 +951,7 @@ def _evaluate_usecols( def _evaluate_usecols( self, - usecols: Callable[[Hashable], int] | set[str] | set[int], + usecols: Callable[[Hashable], bool] | set[str] | set[int], names: Sequence[Hashable], ) -> set[str] | set[int]: """ From 6df4cdca200b71a14f59e578c8dce3bf1cb3fb5b Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 22:52:22 +0100 Subject: [PATCH 9/9] Change callable --- pandas/io/parsers/base_parser.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 8f8c496af498a..93c082c9bb758 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -939,7 +939,9 @@ def _check_data_length( @overload def _evaluate_usecols( - self, usecols: set[int] | Callable[[Hashable], bool], names: Sequence[Hashable] + self, + usecols: set[int] | Callable[[Hashable], object], + names: Sequence[Hashable], ) -> set[int]: ... 
@@ -951,7 +953,7 @@ def _evaluate_usecols( def _evaluate_usecols( self, - usecols: Callable[[Hashable], bool] | set[str] | set[int], + usecols: Callable[[Hashable], object] | set[str] | set[int], names: Sequence[Hashable], ) -> set[str] | set[int]: """
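
For reference (not part of the patch series): the final two commits retype the usecols callable from Callable[[Hashable], bool] to Callable[[Hashable], object], which lets callables annotated with any return type pass type checking, presumably because the result is only truth-tested. The following is a minimal, hypothetical sketch of the overload pattern the series converges on for _evaluate_usecols; it uses an illustrative standalone function name (evaluate_usecols) rather than the actual ParserBase method, and is only meant to show how the overloads resolve for the three accepted argument kinds.

from __future__ import annotations

from typing import Callable, Hashable, Sequence, overload


@overload
def evaluate_usecols(
    usecols: set[int] | Callable[[Hashable], object],
    names: Sequence[Hashable],
) -> set[int]:
    ...


@overload
def evaluate_usecols(usecols: set[str], names: Sequence[Hashable]) -> set[str]:
    ...


def evaluate_usecols(
    usecols: Callable[[Hashable], object] | set[str] | set[int],
    names: Sequence[Hashable],
) -> set[str] | set[int]:
    # A callable is enumerated against the column names and yields positional
    # indices; a ready-made set of labels or indices is passed through as-is.
    if callable(usecols):
        return {i for i, name in enumerate(names) if usecols(name)}
    return usecols


if __name__ == "__main__":
    # Callable form -> set[int]; set-of-names form -> set[str].
    print(evaluate_usecols(lambda name: str(name).startswith("a"), ["a", "b", "ab"]))
    print(evaluate_usecols({"a", "b"}, ["a", "b", "c"]))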