From 354153797aee48dfa5c45cc8a2815793b239e8b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 11:35:11 -0700 Subject: [PATCH 1/6] Move parse_date_cols set construction to the python parser --- pandas/io/parsers/base_parser.py | 184 +++++++------------------- pandas/io/parsers/c_parser_wrapper.py | 3 +- pandas/io/parsers/python_parser.py | 95 ++++++++++++- 3 files changed, 145 insertions(+), 137 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e7473aabdff87..50096181b32f0 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -32,14 +32,12 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - pandas_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -127,7 +125,6 @@ def __init__(self, kwds) -> None: "for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates - self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) @@ -187,52 +184,6 @@ def __init__(self, kwds) -> None: # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe.
- - """ - if not isinstance(self.parse_dates, list): - return set() - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in self.parse_dates - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return { - col if (isinstance(col, str) or col in columns) else columns[col] - for col in self.parse_dates - } - def close(self) -> None: pass @@ -420,7 +371,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) else: @@ -451,90 +402,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: return index - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - converters=None, - dtypes=None, - ) -> dict[Any, np.ndarray]: - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." 
- ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - return result - @final def _set_noconvert_dtype_columns( self, col_indices: list[int], names: Sequence[Hashable] @@ -580,6 +447,7 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: noconvert_columns.add(_set(val)) @@ -1154,7 +1022,7 @@ def _process_date_conversion( return data_dict -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. @@ -1191,3 +1059,49 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): def is_index_col(col) -> bool: return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. 
+ + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..6444718006e93 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -31,6 +31,7 @@ ParserBase, ParserError, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -160,7 +161,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f7d2aa2419429..9d3345651bf91 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -10,9 +10,11 @@ from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -29,18 +31,25 @@ from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -157,7 +166,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -370,6 +378,91 @@ def _convert_data( clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." 
+ ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From 8dbdf348f821e6c6154182384cbac7a070a93413 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:01:00 -0700 Subject: [PATCH 2/6] Clean up do_date_conversions --- pandas/io/parsers/base_parser.py | 128 ++++++++++---------------- pandas/io/parsers/c_parser_wrapper.py | 6 +- 2 files changed, 52 insertions(+), 82 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 50096181b32f0..eb05e7b096c8f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -46,7 +46,6 @@ from pandas.core.dtypes.missing import isna from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, StringDtype, @@ -142,12 +141,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -355,9 +348,12 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: for i, arr in enumerate(index): if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -667,16 +663,25 @@ def _do_date_conversions( names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, ) -> Mapping[Hashable, ArrayLike] | DataFrame: - if isinstance(self.parse_dates, list): - return _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - dtype_backend=self.dtype_backend, + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + 
data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] return data @@ -910,40 +915,37 @@ def _get_empty_meta( return index, columns, col_dict -def _make_date_converter( +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(date_col, col: Hashable): - if date_col.dtype.kind in "Mm": - return date_col - - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = lib.ensure_string_array(date_col) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - # test_multi_index_parse_dates - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -986,42 +988,6 @@ def converter(date_col, col: Hashable): } -def _process_date_conversion( - data_dict: Mapping[Hashable, ArrayLike] | DataFrame, - converter: Callable, - parse_spec: list, - index_col, - index_names, - columns: Sequence[Hashable] | Index, - dtype_backend=lib.no_default, -) -> Mapping[Hashable, ArrayLike] | DataFrame: - for colspec in parse_spec: - if isinstance(colspec, int) and colspec not in data_dict: - colspec = columns[colspec] - if (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - result = converter(np.asarray(data_dict[colspec]), col=colspec) - # error: Unsupported target for indexed assignment - # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") - data_dict[colspec] = result # type: ignore[index] - - return data_dict - - def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. 
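For reference, the hunk above collapses the old _make_date_converter closure into the module-level date_converter. Stripped of the parser plumbing, the conversion it performs is roughly the following sketch; the sample values and format are hypothetical, and the public pd.to_datetime stands in for the internal tools.to_datetime call:

    import numpy as np
    import pandas as pd

    # Raw column values as a parser would hand them to date_converter.
    raw = np.array(["05/07/2024", "06/07/2024"], dtype=object)

    # date_converter stringifies the values, then delegates to to_datetime
    # with the per-column format and the cache_dates option; on ValueError
    # or TypeError it falls back to returning the strings unchanged.
    try:
        parsed = pd.to_datetime(raw, format="%d/%m/%Y", cache=True)
    except (ValueError, TypeError):
        parsed = raw
    print(parsed)

Passing the options explicitly, instead of baking them into a closure, is what lets c_parser_wrapper.py call the same function below.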
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 6444718006e93..b59a778624c49 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,6 +30,7 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, is_index_col, validate_parse_dates_presence, ) @@ -345,9 +346,12 @@ def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( + values = date_converter( values, col=self.index_names[index] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) return values From 33a11feb637023ac4c6fdeccbb9c036ea0724d3b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:11:53 -0700 Subject: [PATCH 3/6] Move can cast to python parser --- pandas/io/parsers/base_parser.py | 185 ++++++++--------------------- pandas/io/parsers/python_parser.py | 88 +++++++++++++- 2 files changed, 138 insertions(+), 135 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index eb05e7b096c8f..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -28,7 +28,6 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, @@ -39,10 +38,6 @@ is_object_dtype, is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) from pandas.core.dtypes.missing import isna from pandas import ( @@ -55,12 +50,9 @@ ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -83,7 +75,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, Hashable, HashableT, Scalar, @@ -171,7 +162,7 @@ def __init__(self, kwds) -> None: self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on @@ -569,80 +560,6 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # 
use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - values_str = [str(val) for val in values] - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values_str, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - none_values=self.na_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, @@ -776,56 +693,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." - ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. - raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1071,3 +938,53 @@ def validate_parse_dates_presence( missing_cols = ", ".join(sorted(missing)) raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. 
+ + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. + raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 9d3345651bf91..425319e0eeff9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -29,17 +29,29 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, is_string_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, +) from pandas.core.dtypes.inference import is_dict_like from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index from pandas.io.common import ( dedup_names, @@ -62,13 +74,13 @@ from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, T, ) from pandas import ( - Index, MultiIndex, Series, ) @@ -463,6 +475,80 @@ def _convert_to_ndarrays( result[c] = cvals return result + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + 
values_str, + dtype=cast_type, + true_values=self.true_values, + false_values=self.false_values, + none_values=self.na_values, + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From 99ca747bbc4c82934245384049a209459432221b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:16:43 -0700 Subject: [PATCH 4/6] Move can cast to python parser --- pandas/io/parsers/base_parser.py | 132 +---------------------------- pandas/io/parsers/python_parser.py | 129 +++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 132 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e8faea76897c6..1bbb0abb0975b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -15,13 +15,8 @@ import numpy as np -from pandas._libs import ( - lib, - parsers, -) -import pandas._libs.ops as libops +from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES -from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, ParserWarning, @@ -29,11 +24,8 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_bool_dtype, is_dict_like, - is_float_dtype, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -43,15 +35,6 @@ from pandas import ( DataFrame, DatetimeIndex, - StringDtype, -) -from pandas.core import algorithms -from pandas.core.arrays import ( - ArrowExtensionArray, - BaseMaskedArray, - BooleanArray, - FloatingArray, - IntegerArray, ) from pandas.core.indexes.api import ( Index, @@ -447,119 +430,6 @@ def _set(x) -> int: return noconvert_columns - @final - def _infer_types( - self, values, na_values, no_dtype_specified, try_num_bool: bool = True - ) -> tuple[ArrayLike, int]: - """ - Infer types of values, possibly casting - - Parameters - ---------- - values : ndarray - na_values : set - no_dtype_specified: Specifies if we want to cast explicitly - try_num_bool : bool, default try - try to cast values to numeric (first preference) or boolean - - Returns - ------- - converted : ndarray or ExtensionArray - na_count : int - """ - na_count = 0 - if issubclass(values.dtype.type, (np.number, np.bool_)): - # If our array has numeric dtype, we don't have to check for strings in isin - na_values = np.array([val for val in na_values if not isinstance(val, str)]) - mask = algorithms.isin(values, na_values) - na_count = mask.astype("uint8", copy=False).sum() - if na_count > 0: - if is_integer_dtype(values): - values = values.astype(np.float64) - np.putmask(values, mask, 
np.nan) - return values, na_count - - dtype_backend = self.dtype_backend - non_default_dtype_backend = ( - no_dtype_specified and dtype_backend is not lib.no_default - ) - result: ArrayLike - - if try_num_bool and is_object_dtype(values.dtype): - # exclude e.g DatetimeIndex here - try: - result, result_mask = lib.maybe_convert_numeric( - values, - na_values, - False, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - except (ValueError, TypeError): - # e.g. encountering datetime string gets ValueError - # TypeError can be raised in floatify - na_count = parsers.sanitize_objects(values, na_values) - result = values - else: - if non_default_dtype_backend: - if result_mask is None: - result_mask = np.zeros(result.shape, dtype=np.bool_) - - if result_mask.all(): - result = IntegerArray( - np.ones(result_mask.shape, dtype=np.int64), result_mask - ) - elif is_integer_dtype(result): - result = IntegerArray(result, result_mask) - elif is_bool_dtype(result): - result = BooleanArray(result, result_mask) - elif is_float_dtype(result): - result = FloatingArray(result, result_mask) - - na_count = result_mask.sum() - else: - na_count = isna(result).sum() - else: - result = values - if values.dtype == np.object_: - na_count = parsers.sanitize_objects(values, na_values) - - if result.dtype == np.object_ and try_num_bool: - result, bool_mask = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - if result.dtype == np.bool_ and non_default_dtype_backend: - if bool_mask is None: - bool_mask = np.zeros(result.shape, dtype=np.bool_) - result = BooleanArray(result, bool_mask) - elif result.dtype == np.object_ and non_default_dtype_backend: - # read_excel sends array of datetime objects - if not lib.is_datetime_array(result, skipna=True): - dtype = StringDtype() - cls = dtype.construct_array_type() - result = cls._from_sequence(values, dtype=dtype) - - if dtype_backend == "pyarrow": - pa = import_optional_dependency("pyarrow") - if isinstance(result, np.ndarray): - result = ArrowExtensionArray(pa.array(result, from_pandas=True)) - elif isinstance(result, BaseMaskedArray): - if result._mask.all(): - # We want an arrow null array here - result = ArrowExtensionArray(pa.array([None] * len(result))) - else: - result = ArrowExtensionArray( - pa.array(result._data, mask=result._mask) - ) - else: - result = ArrowExtensionArray( - pa.array(result.to_numpy(), from_pandas=True) - ) - - return result, na_count - @overload def _do_date_conversions( self, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 425319e0eeff9..bc04a83274a8f 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -20,7 +20,12 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + parsers, +) +import pandas._libs.ops as libops +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, ParserError, @@ -33,7 +38,9 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, + is_float_dtype, is_integer, + is_integer_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype, @@ -44,13 +51,20 @@ ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core.dtypes.missing import isna from pandas.core import algorithms from pandas.core.arrays import ( + 
ArrowExtensionArray, + BaseMaskedArray, + BooleanArray, Categorical, ExtensionArray, + FloatingArray, + IntegerArray, ) from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexes.api import Index from pandas.io.common import ( @@ -549,6 +563,119 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi ) from err return values + @final + def _infer_types( + self, values, na_values, no_dtype_specified, try_num_bool: bool = True + ) -> tuple[ArrayLike, int]: + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + no_dtype_specified: Specifies if we want to cast explicitly + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray or ExtensionArray + na_count : int + """ + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + # If our array has numeric dtype, we don't have to check for strings in isin + na_values = np.array([val for val in na_values if not isinstance(val, str)]) + mask = algorithms.isin(values, na_values) + na_count = mask.astype("uint8", copy=False).sum() + if na_count > 0: + if is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + dtype_backend = self.dtype_backend + non_default_dtype_backend = ( + no_dtype_specified and dtype_backend is not lib.no_default + ) + result: ArrayLike + + if try_num_bool and is_object_dtype(values.dtype): + # exclude e.g DatetimeIndex here + try: + result, result_mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + except (ValueError, TypeError): + # e.g. 
encountering datetime string gets ValueError + # TypeError can be raised in floatify + na_count = parsers.sanitize_objects(values, na_values) + result = values + else: + if non_default_dtype_backend: + if result_mask is None: + result_mask = np.zeros(result.shape, dtype=np.bool_) + + if result_mask.all(): + result = IntegerArray( + np.ones(result_mask.shape, dtype=np.int64), result_mask + ) + elif is_integer_dtype(result): + result = IntegerArray(result, result_mask) + elif is_bool_dtype(result): + result = BooleanArray(result, result_mask) + elif is_float_dtype(result): + result = FloatingArray(result, result_mask) + + na_count = result_mask.sum() + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values) + + if result.dtype == np.object_ and try_num_bool: + result, bool_mask = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + if result.dtype == np.bool_ and non_default_dtype_backend: + if bool_mask is None: + bool_mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, bool_mask) + elif result.dtype == np.object_ and non_default_dtype_backend: + # read_excel sends array of datetime objects + if not lib.is_datetime_array(result, skipna=True): + dtype = StringDtype() + cls = dtype.construct_array_type() + result = cls._from_sequence(values, dtype=dtype) + + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") + if isinstance(result, np.ndarray): + result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) + else: + result = ArrowExtensionArray( + pa.array(result.to_numpy(), from_pandas=True) + ) + + return result, na_count + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From cbf29f24d04afc12ee03826f7e3fed4c3caaeef0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:19:44 -0700 Subject: [PATCH 5/6] Revert "Move can cast to python parser" This reverts commit 99ca747bbc4c82934245384049a209459432221b. 
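The move is undone because _infer_types is reached from shared ParserBase code paths (for example the _agg_index index-conversion path), not only from the Python engine's _convert_to_ndarrays, so moving it onto PythonParser would leave the C engine without it. A toy sketch of that dependency, with hypothetical, heavily simplified signatures:

    # Both engines inherit the shared conversion helpers from ParserBase;
    # base-class code such as _agg_index calls self._infer_types, so the
    # method must stay visible to every subclass.
    class ParserBase:
        def _infer_types(self, values):
            return values  # the real method does numeric/bool/EA inference

        def _agg_index(self, arrays):
            return [self._infer_types(arr) for arr in arrays]

    class CParserWrapper(ParserBase): ...
    class PythonParser(ParserBase): ...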
--- pandas/io/parsers/base_parser.py | 132 ++++++++++++++++++++++++++++- pandas/io/parsers/python_parser.py | 129 +--------------------------- 2 files changed, 132 insertions(+), 129 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 1bbb0abb0975b..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -15,8 +15,13 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + parsers, +) +import pandas._libs.ops as libops from pandas._libs.parsers import STR_NA_VALUES +from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, ParserWarning, @@ -24,8 +29,11 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( + is_bool_dtype, is_dict_like, + is_float_dtype, is_integer, + is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -35,6 +43,15 @@ from pandas import ( DataFrame, DatetimeIndex, + StringDtype, +) +from pandas.core import algorithms +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + BooleanArray, + FloatingArray, + IntegerArray, ) from pandas.core.indexes.api import ( Index, @@ -430,6 +447,119 @@ def _set(x) -> int: return noconvert_columns + @final + def _infer_types( + self, values, na_values, no_dtype_specified, try_num_bool: bool = True + ) -> tuple[ArrayLike, int]: + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + no_dtype_specified: Specifies if we want to cast explicitly + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray or ExtensionArray + na_count : int + """ + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + # If our array has numeric dtype, we don't have to check for strings in isin + na_values = np.array([val for val in na_values if not isinstance(val, str)]) + mask = algorithms.isin(values, na_values) + na_count = mask.astype("uint8", copy=False).sum() + if na_count > 0: + if is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + dtype_backend = self.dtype_backend + non_default_dtype_backend = ( + no_dtype_specified and dtype_backend is not lib.no_default + ) + result: ArrayLike + + if try_num_bool and is_object_dtype(values.dtype): + # exclude e.g DatetimeIndex here + try: + result, result_mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + except (ValueError, TypeError): + # e.g. 
encountering datetime string gets ValueError + # TypeError can be raised in floatify + na_count = parsers.sanitize_objects(values, na_values) + result = values + else: + if non_default_dtype_backend: + if result_mask is None: + result_mask = np.zeros(result.shape, dtype=np.bool_) + + if result_mask.all(): + result = IntegerArray( + np.ones(result_mask.shape, dtype=np.int64), result_mask + ) + elif is_integer_dtype(result): + result = IntegerArray(result, result_mask) + elif is_bool_dtype(result): + result = BooleanArray(result, result_mask) + elif is_float_dtype(result): + result = FloatingArray(result, result_mask) + + na_count = result_mask.sum() + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values) + + if result.dtype == np.object_ and try_num_bool: + result, bool_mask = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + if result.dtype == np.bool_ and non_default_dtype_backend: + if bool_mask is None: + bool_mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, bool_mask) + elif result.dtype == np.object_ and non_default_dtype_backend: + # read_excel sends array of datetime objects + if not lib.is_datetime_array(result, skipna=True): + dtype = StringDtype() + cls = dtype.construct_array_type() + result = cls._from_sequence(values, dtype=dtype) + + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") + if isinstance(result, np.ndarray): + result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) + else: + result = ArrowExtensionArray( + pa.array(result.to_numpy(), from_pandas=True) + ) + + return result, na_count + @overload def _do_date_conversions( self, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index bc04a83274a8f..425319e0eeff9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -20,12 +20,7 @@ import numpy as np -from pandas._libs import ( - lib, - parsers, -) -import pandas._libs.ops as libops -from pandas.compat._optional import import_optional_dependency +from pandas._libs import lib from pandas.errors import ( EmptyDataError, ParserError, @@ -38,9 +33,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_extension_array_dtype, - is_float_dtype, is_integer, - is_integer_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype, @@ -51,20 +44,13 @@ ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like -from pandas.core.dtypes.missing import isna from pandas.core import algorithms from pandas.core.arrays import ( - ArrowExtensionArray, - BaseMaskedArray, - BooleanArray, Categorical, ExtensionArray, - FloatingArray, - IntegerArray, ) from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.string_ import StringDtype from pandas.core.indexes.api import Index from pandas.io.common import ( @@ -563,119 +549,6 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi ) from err return values - @final - def _infer_types( - self, values, na_values, no_dtype_specified, try_num_bool: 
bool = True - ) -> tuple[ArrayLike, int]: - """ - Infer types of values, possibly casting - - Parameters - ---------- - values : ndarray - na_values : set - no_dtype_specified: Specifies if we want to cast explicitly - try_num_bool : bool, default try - try to cast values to numeric (first preference) or boolean - - Returns - ------- - converted : ndarray or ExtensionArray - na_count : int - """ - na_count = 0 - if issubclass(values.dtype.type, (np.number, np.bool_)): - # If our array has numeric dtype, we don't have to check for strings in isin - na_values = np.array([val for val in na_values if not isinstance(val, str)]) - mask = algorithms.isin(values, na_values) - na_count = mask.astype("uint8", copy=False).sum() - if na_count > 0: - if is_integer_dtype(values): - values = values.astype(np.float64) - np.putmask(values, mask, np.nan) - return values, na_count - - dtype_backend = self.dtype_backend - non_default_dtype_backend = ( - no_dtype_specified and dtype_backend is not lib.no_default - ) - result: ArrayLike - - if try_num_bool and is_object_dtype(values.dtype): - # exclude e.g DatetimeIndex here - try: - result, result_mask = lib.maybe_convert_numeric( - values, - na_values, - False, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - except (ValueError, TypeError): - # e.g. encountering datetime string gets ValueError - # TypeError can be raised in floatify - na_count = parsers.sanitize_objects(values, na_values) - result = values - else: - if non_default_dtype_backend: - if result_mask is None: - result_mask = np.zeros(result.shape, dtype=np.bool_) - - if result_mask.all(): - result = IntegerArray( - np.ones(result_mask.shape, dtype=np.int64), result_mask - ) - elif is_integer_dtype(result): - result = IntegerArray(result, result_mask) - elif is_bool_dtype(result): - result = BooleanArray(result, result_mask) - elif is_float_dtype(result): - result = FloatingArray(result, result_mask) - - na_count = result_mask.sum() - else: - na_count = isna(result).sum() - else: - result = values - if values.dtype == np.object_: - na_count = parsers.sanitize_objects(values, na_values) - - if result.dtype == np.object_ and try_num_bool: - result, bool_mask = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] - ) - if result.dtype == np.bool_ and non_default_dtype_backend: - if bool_mask is None: - bool_mask = np.zeros(result.shape, dtype=np.bool_) - result = BooleanArray(result, bool_mask) - elif result.dtype == np.object_ and non_default_dtype_backend: - # read_excel sends array of datetime objects - if not lib.is_datetime_array(result, skipna=True): - dtype = StringDtype() - cls = dtype.construct_array_type() - result = cls._from_sequence(values, dtype=dtype) - - if dtype_backend == "pyarrow": - pa = import_optional_dependency("pyarrow") - if isinstance(result, np.ndarray): - result = ArrowExtensionArray(pa.array(result, from_pandas=True)) - elif isinstance(result, BaseMaskedArray): - if result._mask.all(): - # We want an arrow null array here - result = ArrowExtensionArray(pa.array([None] * len(result))) - else: - result = ArrowExtensionArray( - pa.array(result._data, mask=result._mask) - ) - else: - result = ArrowExtensionArray( - pa.array(result.to_numpy(), from_pandas=True) - ) - - return result, na_count - @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: From 
87758bf717b023d6e60496fe1083689fbb10bdae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:31:46 -0700 Subject: [PATCH 6/6] Typing issues --- pandas/io/parsers/python_parser.py | 8 ++++---- pandas/tests/io/parser/test_network.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 425319e0eeff9..05fe963e9b2b7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -419,7 +419,7 @@ def _convert_to_ndarrays( if c in parse_date_cols: # GH#26203 Do not convert columns which get converted to dates # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] np.putmask(values, mask, np.nan) result[c] = values continue @@ -519,9 +519,9 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi return array_type._from_sequence_of_strings( # type: ignore[call-arg] values_str, dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - none_values=self.na_values, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] ) else: return array_type._from_sequence_of_strings(values, dtype=cast_type) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index f63cc3d56bf89..4ccfa8e81e883 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -75,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3:
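Taken together, the series leaves validate_parse_dates_presence as a plain module-level helper shared by both engines. A minimal usage sketch of its contract, assuming a pandas checkout that includes these patches (the column names are made up):

    from pandas.io.parsers.base_parser import validate_parse_dates_presence

    cols = ["a", "b", "c"]

    # String names are validated; integer positions are mapped to names.
    assert validate_parse_dates_presence(["a", 2], cols) == {"a", "c"}

    # Anything that is not a list (e.g. parse_dates=False) short-circuits.
    assert validate_parse_dates_presence(False, cols) == set()

    # Unknown names raise, matching the docstring above:
    # ValueError: Missing column provided to 'parse_dates': 'z'
    validate_parse_dates_presence(["z"], cols)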