Move can cast to python parser

mroeschke · mroeschke · commit 99ca747bbc4c · 2024-07-05T12:16:43.000-07:00
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -15,25 +15,17 @@
 
 import numpy as np
 
-from pandas._libs import (
-    lib,
-    parsers,
-)
-import pandas._libs.ops as libops
+from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
-from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
     ParserError,
     ParserWarning,
 )
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
-    is_bool_dtype,
     is_dict_like,
-    is_float_dtype,
     is_integer,
-    is_integer_dtype,
     is_list_like,
     is_object_dtype,
     is_string_dtype,
@@ -43,15 +35,6 @@
 from pandas import (
     DataFrame,
     DatetimeIndex,
-    StringDtype,
-)
-from pandas.core import algorithms
-from pandas.core.arrays import (
-    ArrowExtensionArray,
-    BaseMaskedArray,
-    BooleanArray,
-    FloatingArray,
-    IntegerArray,
 )
 from pandas.core.indexes.api import (
     Index,
@@ -447,119 +430,6 @@ def _set(x) -> int:
 
         return noconvert_columns
 
-    @final
-    def _infer_types(
-        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
-    ) -> tuple[ArrayLike, int]:
-        """
-        Infer types of values, possibly casting
-
-        Parameters
-        ----------
-        values : ndarray
-        na_values : set
-        no_dtype_specified: Specifies if we want to cast explicitly
-        try_num_bool : bool, default try
-           try to cast values to numeric (first preference) or boolean
-
-        Returns
-        -------
-        converted : ndarray or ExtensionArray
-        na_count : int
-        """
-        na_count = 0
-        if issubclass(values.dtype.type, (np.number, np.bool_)):
-            # If our array has numeric dtype, we don't have to check for strings in isin
-            na_values = np.array([val for val in na_values if not isinstance(val, str)])
-            mask = algorithms.isin(values, na_values)
-            na_count = mask.astype("uint8", copy=False).sum()
-            if na_count > 0:
-                if is_integer_dtype(values):
-                    values = values.astype(np.float64)
-                np.putmask(values, mask, np.nan)
-            return values, na_count
-
-        dtype_backend = self.dtype_backend
-        non_default_dtype_backend = (
-            no_dtype_specified and dtype_backend is not lib.no_default
-        )
-        result: ArrayLike
-
-        if try_num_bool and is_object_dtype(values.dtype):
-            # exclude e.g DatetimeIndex here
-            try:
-                result, result_mask = lib.maybe_convert_numeric(
-                    values,
-                    na_values,
-                    False,
-                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
-                )
-            except (ValueError, TypeError):
-                # e.g. encountering datetime string gets ValueError
-                #  TypeError can be raised in floatify
-                na_count = parsers.sanitize_objects(values, na_values)
-                result = values
-            else:
-                if non_default_dtype_backend:
-                    if result_mask is None:
-                        result_mask = np.zeros(result.shape, dtype=np.bool_)
-
-                    if result_mask.all():
-                        result = IntegerArray(
-                            np.ones(result_mask.shape, dtype=np.int64), result_mask
-                        )
-                    elif is_integer_dtype(result):
-                        result = IntegerArray(result, result_mask)
-                    elif is_bool_dtype(result):
-                        result = BooleanArray(result, result_mask)
-                    elif is_float_dtype(result):
-                        result = FloatingArray(result, result_mask)
-
-                    na_count = result_mask.sum()
-                else:
-                    na_count = isna(result).sum()
-        else:
-            result = values
-            if values.dtype == np.object_:
-                na_count = parsers.sanitize_objects(values, na_values)
-
-        if result.dtype == np.object_ and try_num_bool:
-            result, bool_mask = libops.maybe_convert_bool(
-                np.asarray(values),
-                true_values=self.true_values,
-                false_values=self.false_values,
-                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
-            )
-            if result.dtype == np.bool_ and non_default_dtype_backend:
-                if bool_mask is None:
-                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
-                result = BooleanArray(result, bool_mask)
-            elif result.dtype == np.object_ and non_default_dtype_backend:
-                # read_excel sends array of datetime objects
-                if not lib.is_datetime_array(result, skipna=True):
-                    dtype = StringDtype()
-                    cls = dtype.construct_array_type()
-                    result = cls._from_sequence(values, dtype=dtype)
-
-        if dtype_backend == "pyarrow":
-            pa = import_optional_dependency("pyarrow")
-            if isinstance(result, np.ndarray):
-                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
-            elif isinstance(result, BaseMaskedArray):
-                if result._mask.all():
-                    # We want an arrow null array here
-                    result = ArrowExtensionArray(pa.array([None] * len(result)))
-                else:
-                    result = ArrowExtensionArray(
-                        pa.array(result._data, mask=result._mask)
-                    )
-            else:
-                result = ArrowExtensionArray(
-                    pa.array(result.to_numpy(), from_pandas=True)
-                )
-
-        return result, na_count
-
     @overload
     def _do_date_conversions(
         self,
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -20,7 +20,12 @@
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import (
+    lib,
+    parsers,
+)
+import pandas._libs.ops as libops
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -33,7 +38,9 @@
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_extension_array_dtype,
+    is_float_dtype,
     is_integer,
+    is_integer_dtype,
     is_numeric_dtype,
     is_object_dtype,
     is_string_dtype,
@@ -44,13 +51,20 @@
     ExtensionDtype,
 )
 from pandas.core.dtypes.inference import is_dict_like
+from pandas.core.dtypes.missing import isna
 
 from pandas.core import algorithms
 from pandas.core.arrays import (
+    ArrowExtensionArray,
+    BaseMaskedArray,
+    BooleanArray,
     Categorical,
     ExtensionArray,
+    FloatingArray,
+    IntegerArray,
 )
 from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexes.api import Index
 
 from pandas.io.common import (
@@ -549,6 +563,119 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
                 ) from err
         return values
 
+    @final
+    def _infer_types(
+        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
+    ) -> tuple[ArrayLike, int]:
+        """
+        Infer types of values, possibly casting.
+
+        Parameters
+        ----------
+        values : ndarray
+        na_values : set
+        no_dtype_specified : if falsy, no masked/string array conversion is done
+        try_num_bool : bool, default True
+            Try to cast values to numeric (first preference) or boolean.
+
+        Returns
+        -------
+        converted : ndarray or ExtensionArray
+        na_count : int
+        """
+        na_count = 0
+        if issubclass(values.dtype.type, (np.number, np.bool_)):
+            # If our array has numeric dtype, we don't have to check for strings in isin
+            na_values = np.array([val for val in na_values if not isinstance(val, str)])
+            mask = algorithms.isin(values, na_values)
+            na_count = mask.astype("uint8", copy=False).sum()
+            if na_count > 0:
+                if is_integer_dtype(values):
+                    values = values.astype(np.float64)  # ints cannot hold NaN
+                np.putmask(values, mask, np.nan)
+            return values, na_count
+
+        dtype_backend = self.dtype_backend
+        non_default_dtype_backend = (
+            no_dtype_specified and dtype_backend is not lib.no_default
+        )
+        result: ArrayLike
+
+        if try_num_bool and is_object_dtype(values.dtype):
+            # exclude e.g. DatetimeIndex here
+            try:
+                result, result_mask = lib.maybe_convert_numeric(
+                    values,
+                    na_values,
+                    False,
+                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
+                )
+            except (ValueError, TypeError):
+                # e.g. encountering datetime string gets ValueError
+                #  TypeError can be raised in floatify
+                na_count = parsers.sanitize_objects(values, na_values)
+                result = values
+            else:
+                if non_default_dtype_backend:
+                    if result_mask is None:
+                        result_mask = np.zeros(result.shape, dtype=np.bool_)
+
+                    if result_mask.all():  # every entry is masked
+                        result = IntegerArray(
+                            np.ones(result_mask.shape, dtype=np.int64), result_mask
+                        )
+                    elif is_integer_dtype(result):
+                        result = IntegerArray(result, result_mask)
+                    elif is_bool_dtype(result):
+                        result = BooleanArray(result, result_mask)
+                    elif is_float_dtype(result):
+                        result = FloatingArray(result, result_mask)
+
+                    na_count = result_mask.sum()
+                else:
+                    na_count = isna(result).sum()
+        else:
+            result = values
+            if values.dtype == np.object_:
+                na_count = parsers.sanitize_objects(values, na_values)
+
+        if result.dtype == np.object_ and try_num_bool:
+            result, bool_mask = libops.maybe_convert_bool(
+                np.asarray(values),
+                true_values=self.true_values,
+                false_values=self.false_values,
+                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]
+            )
+            if result.dtype == np.bool_ and non_default_dtype_backend:
+                if bool_mask is None:
+                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
+                result = BooleanArray(result, bool_mask)
+            elif result.dtype == np.object_ and non_default_dtype_backend:
+                # read_excel sends array of datetime objects
+                if not lib.is_datetime_array(result, skipna=True):
+                    dtype = StringDtype()
+                    cls = dtype.construct_array_type()
+                    result = cls._from_sequence(values, dtype=dtype)
+
+        if dtype_backend == "pyarrow":
+            pa = import_optional_dependency("pyarrow")
+            if isinstance(result, np.ndarray):
+                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
+            elif isinstance(result, BaseMaskedArray):
+                if result._mask.all():
+                    # We want an arrow null array here
+                    result = ArrowExtensionArray(pa.array([None] * len(result)))
+                else:
+                    result = ArrowExtensionArray(
+                        pa.array(result._data, mask=result._mask)
+                    )
+            else:
+                result = ArrowExtensionArray(
+                    pa.array(result.to_numpy(), from_pandas=True)
+                )
+
+        return result, na_count
+
     @cache_readonly
     def _have_mi_columns(self) -> bool:
         if self.header is None: