From 354153797aee48dfa5c45cc8a2815793b239e8b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 11:35:11 -0700 Subject: [PATCH 1/6] Move parse_date_cols set construction to the python parser --- pandas/io/parsers/base_parser.py | 184 +++++++------------------- pandas/io/parsers/c_parser_wrapper.py | 3 +- pandas/io/parsers/python_parser.py | 95 ++++++++++++- 3 files changed, 145 insertions(+), 137 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e7473aabdff87..50096181b32f0 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -32,14 +32,12 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - pandas_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -127,7 +125,6 @@ def __init__(self, kwds) -> None: "for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates - self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) @@ -187,52 +184,6 @@ def __init__(self, kwds) -> None: # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe.
- - """ - if not isinstance(self.parse_dates, list): - return set() - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in self.parse_dates - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return { - col if (isinstance(col, str) or col in columns) else columns[col] - for col in self.parse_dates - } - def close(self) -> None: pass @@ -420,7 +371,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) else: @@ -451,90 +402,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: return index - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - converters=None, - dtypes=None, - ) -> dict[Any, np.ndarray]: - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." 
- ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - return result - @final def _set_noconvert_dtype_columns( self, col_indices: list[int], names: Sequence[Hashable] @@ -580,6 +447,7 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: noconvert_columns.add(_set(val)) @@ -1154,7 +1022,7 @@ def _process_date_conversion( return data_dict -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. @@ -1191,3 +1059,49 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): def is_index_col(col) -> bool: return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. 
+ + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..6444718006e93 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -31,6 +31,7 @@ ParserBase, ParserError, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -160,7 +161,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f7d2aa2419429..9d3345651bf91 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -10,9 +10,11 @@ from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -29,18 +31,25 @@ from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -157,7 +166,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -370,6 +378,91 @@ def _convert_data( clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." 
+ ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From 8dbdf348f821e6c6154182384cbac7a070a93413 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:01:00 -0700 Subject: [PATCH 2/6] Clean up do_date_conversions --- pandas/io/parsers/base_parser.py | 128 ++++++++++---------------- pandas/io/parsers/c_parser_wrapper.py | 6 +- 2 files changed, 52 insertions(+), 82 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 50096181b32f0..eb05e7b096c8f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -46,7 +46,6 @@ from pandas.core.dtypes.missing import isna from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, StringDtype, @@ -142,12 +141,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -355,9 +348,12 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: for i, arr in enumerate(index): if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -667,16 +663,25 @@ def _do_date_conversions( names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, ) -> Mapping[Hashable, ArrayLike] | DataFrame: - if isinstance(self.parse_dates, list): - return _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - dtype_backend=self.dtype_backend, + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + 
data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] return data @@ -910,40 +915,37 @@ def _get_empty_meta( return index, columns, col_dict -def _make_date_converter( +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(date_col, col: Hashable): - if date_col.dtype.kind in "Mm": - return date_col - - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = lib.ensure_string_array(date_col) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - # test_multi_index_parse_dates - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -986,42 +988,6 @@ def converter(date_col, col: Hashable): } -def _process_date_conversion( - data_dict: Mapping[Hashable, ArrayLike] | DataFrame, - converter: Callable, - parse_spec: list, - index_col, - index_names, - columns: Sequence[Hashable] | Index, - dtype_backend=lib.no_default, -) -> Mapping[Hashable, ArrayLike] | DataFrame: - for colspec in parse_spec: - if isinstance(colspec, int) and colspec not in data_dict: - colspec = columns[colspec] - if (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - result = converter(np.asarray(data_dict[colspec]), col=colspec) - # error: Unsupported target for indexed assignment - # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") - data_dict[colspec] = result # type: ignore[index] - - return data_dict - - def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. 
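For reference, the hunk above collapses the old _make_date_converter closure into the module-level date_converter. Stripped of the parser plumbing, the conversion it performs is roughly the following sketch; the sample values and format are hypothetical, and the public pd.to_datetime stands in for the internal tools.to_datetime call:

    import numpy as np
    import pandas as pd

    # Raw column values as a parser would hand them to date_converter.
    raw = np.array(["05/07/2024", "06/07/2024"], dtype=object)

    # date_converter stringifies the values, then delegates to to_datetime
    # with the per-column format and the cache_dates option; on ValueError
    # or TypeError it falls back to returning the strings unchanged.
    try:
        parsed = pd.to_datetime(raw, format="%d/%m/%Y", cache=True)
    except (ValueError, TypeError):
        parsed = raw
    print(parsed)

Passing the options explicitly, instead of baking them into a closure, is what lets c_parser_wrapper.py call the same function below.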
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 6444718006e93..b59a778624c49 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,6 +30,7 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, is_index_col, validate_parse_dates_presence, ) @@ -345,9 +346,12 @@ def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( + values = date_converter( values, col=self.index_names[index] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) return values From 33a11feb637023ac4c6fdeccbb9c036ea0724d3b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:11:53 -0700 Subject: [PATCH 3/6] Move can cast to python parser --- pandas/io/parsers/base_parser.py | 185 ++++++++--------------------- pandas/io/parsers/python_parser.py | 88 +++++++++++++- 2 files changed, 138 insertions(+), 135 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index eb05e7b096c8f..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -28,7 +28,6 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, @@ -39,10 +38,6 @@ is_object_dtype, is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) from pandas.core.dtypes.missing import isna from pandas import ( @@ -55,12 +50,9 @@ ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -83,7 +75,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, Hashable, HashableT, Scalar, @@ -171,7 +162,7 @@ def __init__(self, kwds) -> None: self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on @@ -569,80 +560,6 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # 
use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - values_str = [str(val) for val in values] - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values_str, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - none_values=self.na_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, @@ -776,56 +693,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." - ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. - raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1071,3 +938,53 @@ def validate_parse_dates_presence( missing_cols = ", ".join(sorted(missing)) raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. 
+ + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. + raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 9d3345651bf91..425319e0eeff9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -29,17 +29,29 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, is_string_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, +) from pandas.core.dtypes.inference import is_dict_like from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index from pandas.io.common import ( dedup_names, @@ -62,13 +74,13 @@ from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, T, ) from pandas import ( - Index, MultiIndex, Series, ) @@ -463,6 +475,80 @@ def _convert_to_ndarrays( result[c] = cvals return result + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + 
values_str, + dtype=cast_type, + true_values=self.true_values, + false_values=self.false_values, + none_values=self.na_values, + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From 99ca747bbc4c82934245384049a209459432221b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:16:43 -0700 Subject: [PATCH 4/6] Move can cast to python parser --- pandas/io/parsers/base_parser.py | 132 +---------------------------- pandas/io/parsers/python_parser.py | 129 +++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 132 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e8faea76897c6..1bbb0abb0975b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -15,13 +15,8 @@ import numpy as np -from pandas._libs import ( - lib, - parsers, -) -import pandas._libs.ops as libops +from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES -from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, ParserWarning, @@ -29,11 +24,8 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_bool_dtype, is_dict_like, - is_float_dtype, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -43,15 +35,6 @@ from pandas import ( DataFrame, DatetimeIndex, - StringDtype, -) -from pandas.core import algorithms -from pandas.core.arrays import ( - ArrowExtensionArray, - BaseMaskedArray, - BooleanArray, - FloatingArray, - IntegerArray, ) from pandas.core.indexes.api import ( Index, @@ -447,119 +430,6 @@ def _set(x) -> int: return noconvert_columns - @final - def _infer_types( - self, values, na_values, no_dtype_specified, try_num_bool: bool = True - ) -> tuple[ArrayLike, int]: - """ - Infer types of values, possibly casting - - Parameters - ---------- - values : ndarray - na_values : set - no_dtype_specified: Specifies if we want to cast explicitly - try_num_bool : bool, default try - try to cast values to numeric (first preference) or boolean - - Returns - ------- - converted : ndarray or ExtensionArray - na_count : int - """ - na_count = 0 - if issubclass(values.dtype.type, (np.number, np.bool_)): - # If our array has numeric dtype, we don't have to check for strings in isin - na_values = np.array([val for val in na_values if not isinstance(val, str)]) - mask = algorithms.isin(values, na_values) - na_count = mask.astype("uint8", copy=False).sum() - if na_count > 0: - if is_integer_dtype(values): - values = values.astype(np.float64) - np.putmask(values, mask, 
np.nan) - return values, na_count - - dtype_backend = self.dtype_backend - non_default_dtype_backend = ( - no_dtype_specified and dtype_backend is not lib.no_default - ) - result: ArrayLike - - if try_num_bool and is_object_dtype(values.dtype): - # exclude e.g DatetimeIndex here - try: - result, result_mask = lib.maybe_convert_numeric( - values, - na_values, - False, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - except (ValueError, TypeError): - # e.g. encountering datetime string gets ValueError - # TypeError can be raised in floatify - na_count = parsers.sanitize_objects(values, na_values) - result = values - else: - if non_default_dtype_backend: - if result_mask is None: - result_mask = np.zeros(result.shape, dtype=np.bool_) - - if result_mask.all(): - result = IntegerArray( - np.ones(result_mask.shape, dtype=np.int64), result_mask - ) - elif is_integer_dtype(result): - result = IntegerArray(result, result_mask) - elif is_bool_dtype(result): - result = BooleanArray(result, result_mask) - elif is_float_dtype(result): - result = FloatingArray(result, result_mask) - - na_count = result_mask.sum() - else: - na_count = isna(result).sum() - else: - result = values - if values.dtype == np.object_: - na_count = parsers.sanitize_objects(values, na_values) - - if result.dtype == np.object_ and try_num_bool: - result, bool_mask = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - if result.dtype == np.bool_ and non_default_dtype_backend: - if bool_mask is None: - bool_mask = np.zeros(result.shape, dtype=np.bool_) - result = BooleanArray(result, bool_mask) - elif result.dtype == np.object_ and non_default_dtype_backend: - # read_excel sends array of datetime objects - if not lib.is_datetime_array(result, skipna=True): - dtype = StringDtype() - cls = dtype.construct_array_type() - result = cls._from_sequence(values, dtype=dtype) - - if dtype_backend == "pyarrow": - pa = import_optional_dependency("pyarrow") - if isinstance(result, np.ndarray): - result = ArrowExtensionArray(pa.array(result, from_pandas=True)) - elif isinstance(result, BaseMaskedArray): - if result._mask.all(): - # We want an arrow null array here - result = ArrowExtensionArray(pa.array([None] * len(result))) - else: - result = ArrowExtensionArray( - pa.array(result._data, mask=result._mask) - ) - else: - result = ArrowExtensionArray( - pa.array(result.to_numpy(), from_pandas=True) - ) - - return result, na_count - @overload def _do_date_conversions( self, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 425319e0eeff9..bc04a83274a8f 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -20,7 +20,12 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + parsers, +) +import pandas._libs.ops as libops +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, ParserError, @@ -33,7 +38,9 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, + is_float_dtype, is_integer, + is_integer_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype, @@ -44,13 +51,20 @@ ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core.dtypes.missing import isna from pandas.core import algorithms from pandas.core.arrays import ( + 
ArrowExtensionArray, + BaseMaskedArray, + BooleanArray, Categorical, ExtensionArray, + FloatingArray, + IntegerArray, ) from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexes.api import Index from pandas.io.common import ( @@ -549,6 +563,119 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi ) from err return values + @final + def _infer_types( + self, values, na_values, no_dtype_specified, try_num_bool: bool = True + ) -> tuple[ArrayLike, int]: + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + no_dtype_specified: Specifies if we want to cast explicitly + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray or ExtensionArray + na_count : int + """ + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + # If our array has numeric dtype, we don't have to check for strings in isin + na_values = np.array([val for val in na_values if not isinstance(val, str)]) + mask = algorithms.isin(values, na_values) + na_count = mask.astype("uint8", copy=False).sum() + if na_count > 0: + if is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + dtype_backend = self.dtype_backend + non_default_dtype_backend = ( + no_dtype_specified and dtype_backend is not lib.no_default + ) + result: ArrayLike + + if try_num_bool and is_object_dtype(values.dtype): + # exclude e.g DatetimeIndex here + try: + result, result_mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + except (ValueError, TypeError): + # e.g. 
encountering datetime string gets ValueError + # TypeError can be raised in floatify + na_count = parsers.sanitize_objects(values, na_values) + result = values + else: + if non_default_dtype_backend: + if result_mask is None: + result_mask = np.zeros(result.shape, dtype=np.bool_) + + if result_mask.all(): + result = IntegerArray( + np.ones(result_mask.shape, dtype=np.int64), result_mask + ) + elif is_integer_dtype(result): + result = IntegerArray(result, result_mask) + elif is_bool_dtype(result): + result = BooleanArray(result, result_mask) + elif is_float_dtype(result): + result = FloatingArray(result, result_mask) + + na_count = result_mask.sum() + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values) + + if result.dtype == np.object_ and try_num_bool: + result, bool_mask = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + if result.dtype == np.bool_ and non_default_dtype_backend: + if bool_mask is None: + bool_mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, bool_mask) + elif result.dtype == np.object_ and non_default_dtype_backend: + # read_excel sends array of datetime objects + if not lib.is_datetime_array(result, skipna=True): + dtype = StringDtype() + cls = dtype.construct_array_type() + result = cls._from_sequence(values, dtype=dtype) + + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") + if isinstance(result, np.ndarray): + result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) + else: + result = ArrowExtensionArray( + pa.array(result.to_numpy(), from_pandas=True) + ) + + return result, na_count + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From cbf29f24d04afc12ee03826f7e3fed4c3caaeef0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:19:44 -0700 Subject: [PATCH 5/6] Revert "Move can cast to python parser" This reverts commit 99ca747bbc4c82934245384049a209459432221b. 
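The move is undone because _infer_types is reached from shared ParserBase code paths (for example the _agg_index index-conversion path), not only from the Python engine's _convert_to_ndarrays, so moving it onto PythonParser would leave the C engine without it. A toy sketch of that dependency, with hypothetical, heavily simplified signatures:

    # Both engines inherit the shared conversion helpers from ParserBase;
    # base-class code such as _agg_index calls self._infer_types, so the
    # method must stay visible to every subclass.
    class ParserBase:
        def _infer_types(self, values):
            return values  # the real method does numeric/bool/EA inference

        def _agg_index(self, arrays):
            return [self._infer_types(arr) for arr in arrays]

    class CParserWrapper(ParserBase): ...
    class PythonParser(ParserBase): ...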
--- pandas/io/parsers/base_parser.py | 132 ++++++++++++++++++++++++++++- pandas/io/parsers/python_parser.py | 129 +--------------------------- 2 files changed, 132 insertions(+), 129 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 1bbb0abb0975b..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -15,8 +15,13 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + parsers, +) +import pandas._libs.ops as libops from pandas._libs.parsers import STR_NA_VALUES +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, ParserWarning, @@ -24,8 +29,11 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( + is_bool_dtype, is_dict_like, + is_float_dtype, is_integer, + is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -35,6 +43,15 @@ from pandas import ( DataFrame, DatetimeIndex, + StringDtype, +) +from pandas.core import algorithms +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + BooleanArray, + FloatingArray, + IntegerArray, ) from pandas.core.indexes.api import ( Index, @@ -430,6 +447,119 @@ def _set(x) -> int: return noconvert_columns + @final + def _infer_types( + self, values, na_values, no_dtype_specified, try_num_bool: bool = True + ) -> tuple[ArrayLike, int]: + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + no_dtype_specified: Specifies if we want to cast explicitly + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray or ExtensionArray + na_count : int + """ + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + # If our array has numeric dtype, we don't have to check for strings in isin + na_values = np.array([val for val in na_values if not isinstance(val, str)]) + mask = algorithms.isin(values, na_values) + na_count = mask.astype("uint8", copy=False).sum() + if na_count > 0: + if is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + dtype_backend = self.dtype_backend + non_default_dtype_backend = ( + no_dtype_specified and dtype_backend is not lib.no_default + ) + result: ArrayLike + + if try_num_bool and is_object_dtype(values.dtype): + # exclude e.g DatetimeIndex here + try: + result, result_mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + except (ValueError, TypeError): + # e.g. 
encountering datetime string gets ValueError + # TypeError can be raised in floatify + na_count = parsers.sanitize_objects(values, na_values) + result = values + else: + if non_default_dtype_backend: + if result_mask is None: + result_mask = np.zeros(result.shape, dtype=np.bool_) + + if result_mask.all(): + result = IntegerArray( + np.ones(result_mask.shape, dtype=np.int64), result_mask + ) + elif is_integer_dtype(result): + result = IntegerArray(result, result_mask) + elif is_bool_dtype(result): + result = BooleanArray(result, result_mask) + elif is_float_dtype(result): + result = FloatingArray(result, result_mask) + + na_count = result_mask.sum() + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values) + + if result.dtype == np.object_ and try_num_bool: + result, bool_mask = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + if result.dtype == np.bool_ and non_default_dtype_backend: + if bool_mask is None: + bool_mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, bool_mask) + elif result.dtype == np.object_ and non_default_dtype_backend: + # read_excel sends array of datetime objects + if not lib.is_datetime_array(result, skipna=True): + dtype = StringDtype() + cls = dtype.construct_array_type() + result = cls._from_sequence(values, dtype=dtype) + + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") + if isinstance(result, np.ndarray): + result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) + else: + result = ArrowExtensionArray( + pa.array(result.to_numpy(), from_pandas=True) + ) + + return result, na_count + @overload def _do_date_conversions( self, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index bc04a83274a8f..425319e0eeff9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -20,12 +20,7 @@ import numpy as np -from pandas._libs import ( - lib, - parsers, -) -import pandas._libs.ops as libops -from pandas.compat._optional import import_optional_dependency +from pandas._libs import lib from pandas.errors import ( EmptyDataError, ParserError, @@ -38,9 +33,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, - is_float_dtype, is_integer, - is_integer_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype, @@ -51,20 +44,13 @@ ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like -from pandas.core.dtypes.missing import isna from pandas.core import algorithms from pandas.core.arrays import ( - ArrowExtensionArray, - BaseMaskedArray, - BooleanArray, Categorical, ExtensionArray, - FloatingArray, - IntegerArray, ) from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.string_ import StringDtype from pandas.core.indexes.api import Index from pandas.io.common import ( @@ -563,119 +549,6 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi ) from err return values - @final - def _infer_types( - self, values, na_values, no_dtype_specified, try_num_bool: 
bool = True - ) -> tuple[ArrayLike, int]: - """ - Infer types of values, possibly casting - - Parameters - ---------- - values : ndarray - na_values : set - no_dtype_specified: Specifies if we want to cast explicitly - try_num_bool : bool, default try - try to cast values to numeric (first preference) or boolean - - Returns - ------- - converted : ndarray or ExtensionArray - na_count : int - """ - na_count = 0 - if issubclass(values.dtype.type, (np.number, np.bool_)): - # If our array has numeric dtype, we don't have to check for strings in isin - na_values = np.array([val for val in na_values if not isinstance(val, str)]) - mask = algorithms.isin(values, na_values) - na_count = mask.astype("uint8", copy=False).sum() - if na_count > 0: - if is_integer_dtype(values): - values = values.astype(np.float64) - np.putmask(values, mask, np.nan) - return values, na_count - - dtype_backend = self.dtype_backend - non_default_dtype_backend = ( - no_dtype_specified and dtype_backend is not lib.no_default - ) - result: ArrayLike - - if try_num_bool and is_object_dtype(values.dtype): - # exclude e.g DatetimeIndex here - try: - result, result_mask = lib.maybe_convert_numeric( - values, - na_values, - False, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - except (ValueError, TypeError): - # e.g. encountering datetime string gets ValueError - # TypeError can be raised in floatify - na_count = parsers.sanitize_objects(values, na_values) - result = values - else: - if non_default_dtype_backend: - if result_mask is None: - result_mask = np.zeros(result.shape, dtype=np.bool_) - - if result_mask.all(): - result = IntegerArray( - np.ones(result_mask.shape, dtype=np.int64), result_mask - ) - elif is_integer_dtype(result): - result = IntegerArray(result, result_mask) - elif is_bool_dtype(result): - result = BooleanArray(result, result_mask) - elif is_float_dtype(result): - result = FloatingArray(result, result_mask) - - na_count = result_mask.sum() - else: - na_count = isna(result).sum() - else: - result = values - if values.dtype == np.object_: - na_count = parsers.sanitize_objects(values, na_values) - - if result.dtype == np.object_ and try_num_bool: - result, bool_mask = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - if result.dtype == np.bool_ and non_default_dtype_backend: - if bool_mask is None: - bool_mask = np.zeros(result.shape, dtype=np.bool_) - result = BooleanArray(result, bool_mask) - elif result.dtype == np.object_ and non_default_dtype_backend: - # read_excel sends array of datetime objects - if not lib.is_datetime_array(result, skipna=True): - dtype = StringDtype() - cls = dtype.construct_array_type() - result = cls._from_sequence(values, dtype=dtype) - - if dtype_backend == "pyarrow": - pa = import_optional_dependency("pyarrow") - if isinstance(result, np.ndarray): - result = ArrowExtensionArray(pa.array(result, from_pandas=True)) - elif isinstance(result, BaseMaskedArray): - if result._mask.all(): - # We want an arrow null array here - result = ArrowExtensionArray(pa.array([None] * len(result))) - else: - result = ArrowExtensionArray( - pa.array(result._data, mask=result._mask) - ) - else: - result = ArrowExtensionArray( - pa.array(result.to_numpy(), from_pandas=True) - ) - - return result, na_count - @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From 
87758bf717b023d6e60496fe1083689fbb10bdae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:31:46 -0700 Subject: [PATCH 6/6] Typing issues --- pandas/io/parsers/python_parser.py | 8 ++++---- pandas/tests/io/parser/test_network.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 425319e0eeff9..05fe963e9b2b7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -419,7 +419,7 @@ def _convert_to_ndarrays( if c in parse_date_cols: # GH#26203 Do not convert columns which get converted to dates # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] np.putmask(values, mask, np.nan) result[c] = values continue @@ -519,9 +519,9 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi return array_type._from_sequence_of_strings( # type: ignore[call-arg] values_str, dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - none_values=self.na_values, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] ) else: return array_type._from_sequence_of_strings(values, dtype=cast_type) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index f63cc3d56bf89..4ccfa8e81e883 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -75,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3:
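Taken together, the series leaves validate_parse_dates_presence as a plain module-level helper shared by both engines. A minimal usage sketch of its contract, assuming a pandas checkout that includes these patches (the column names are made up):

    from pandas.io.parsers.base_parser import validate_parse_dates_presence

    cols = ["a", "b", "c"]

    # String names are validated; integer positions are mapped to names.
    assert validate_parse_dates_presence(["a", 2], cols) == {"a", "c"}

    # Anything that is not a list (e.g. parse_dates=False) short-circuits.
    assert validate_parse_dates_presence(False, cols) == set()

    # Unknown names raise, matching the docstring above:
    # ValueError: Missing column provided to 'parse_dates': 'z'
    validate_parse_dates_presence(["z"], cols)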