Skip to content

Commit 99ca747

Browse files
committed
Move can cast to python parser
1 parent 33a11fe commit 99ca747

File tree

2 files changed

+129
-132
lines changed

2 files changed

+129
-132
lines changed

pandas/io/parsers/base_parser.py

Lines changed: 1 addition & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,17 @@
1515

1616
import numpy as np
1717

18-
from pandas._libs import (
19-
lib,
20-
parsers,
21-
)
22-
import pandas._libs.ops as libops
18+
from pandas._libs import lib
2319
from pandas._libs.parsers import STR_NA_VALUES
24-
from pandas.compat._optional import import_optional_dependency
2520
from pandas.errors import (
2621
ParserError,
2722
ParserWarning,
2823
)
2924
from pandas.util._exceptions import find_stack_level
3025

3126
from pandas.core.dtypes.common import (
32-
is_bool_dtype,
3327
is_dict_like,
34-
is_float_dtype,
3528
is_integer,
36-
is_integer_dtype,
3729
is_list_like,
3830
is_object_dtype,
3931
is_string_dtype,
@@ -43,15 +35,6 @@
4335
from pandas import (
4436
DataFrame,
4537
DatetimeIndex,
46-
StringDtype,
47-
)
48-
from pandas.core import algorithms
49-
from pandas.core.arrays import (
50-
ArrowExtensionArray,
51-
BaseMaskedArray,
52-
BooleanArray,
53-
FloatingArray,
54-
IntegerArray,
5538
)
5639
from pandas.core.indexes.api import (
5740
Index,
@@ -447,119 +430,6 @@ def _set(x) -> int:
447430

448431
return noconvert_columns
449432

450-
@final
451-
def _infer_types(
452-
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
453-
) -> tuple[ArrayLike, int]:
454-
"""
455-
Infer types of values, possibly casting
456-
457-
Parameters
458-
----------
459-
values : ndarray
460-
na_values : set
461-
no_dtype_specified: Specifies if we want to cast explicitly
462-
try_num_bool : bool, default True
463-
try to cast values to numeric (first preference) or boolean
464-
465-
Returns
466-
-------
467-
converted : ndarray or ExtensionArray
468-
na_count : int
469-
"""
470-
na_count = 0
471-
if issubclass(values.dtype.type, (np.number, np.bool_)):
472-
# If our array has numeric dtype, we don't have to check for strings in isin
473-
na_values = np.array([val for val in na_values if not isinstance(val, str)])
474-
mask = algorithms.isin(values, na_values)
475-
na_count = mask.astype("uint8", copy=False).sum()
476-
if na_count > 0:
477-
if is_integer_dtype(values):
478-
values = values.astype(np.float64)
479-
np.putmask(values, mask, np.nan)
480-
return values, na_count
481-
482-
dtype_backend = self.dtype_backend
483-
non_default_dtype_backend = (
484-
no_dtype_specified and dtype_backend is not lib.no_default
485-
)
486-
result: ArrayLike
487-
488-
if try_num_bool and is_object_dtype(values.dtype):
489-
# exclude e.g. DatetimeIndex here
490-
try:
491-
result, result_mask = lib.maybe_convert_numeric(
492-
values,
493-
na_values,
494-
False,
495-
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
496-
)
497-
except (ValueError, TypeError):
498-
# e.g. encountering datetime string gets ValueError
499-
# TypeError can be raised in floatify
500-
na_count = parsers.sanitize_objects(values, na_values)
501-
result = values
502-
else:
503-
if non_default_dtype_backend:
504-
if result_mask is None:
505-
result_mask = np.zeros(result.shape, dtype=np.bool_)
506-
507-
if result_mask.all():
508-
result = IntegerArray(
509-
np.ones(result_mask.shape, dtype=np.int64), result_mask
510-
)
511-
elif is_integer_dtype(result):
512-
result = IntegerArray(result, result_mask)
513-
elif is_bool_dtype(result):
514-
result = BooleanArray(result, result_mask)
515-
elif is_float_dtype(result):
516-
result = FloatingArray(result, result_mask)
517-
518-
na_count = result_mask.sum()
519-
else:
520-
na_count = isna(result).sum()
521-
else:
522-
result = values
523-
if values.dtype == np.object_:
524-
na_count = parsers.sanitize_objects(values, na_values)
525-
526-
if result.dtype == np.object_ and try_num_bool:
527-
result, bool_mask = libops.maybe_convert_bool(
528-
np.asarray(values),
529-
true_values=self.true_values,
530-
false_values=self.false_values,
531-
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
532-
)
533-
if result.dtype == np.bool_ and non_default_dtype_backend:
534-
if bool_mask is None:
535-
bool_mask = np.zeros(result.shape, dtype=np.bool_)
536-
result = BooleanArray(result, bool_mask)
537-
elif result.dtype == np.object_ and non_default_dtype_backend:
538-
# read_excel sends array of datetime objects
539-
if not lib.is_datetime_array(result, skipna=True):
540-
dtype = StringDtype()
541-
cls = dtype.construct_array_type()
542-
result = cls._from_sequence(values, dtype=dtype)
543-
544-
if dtype_backend == "pyarrow":
545-
pa = import_optional_dependency("pyarrow")
546-
if isinstance(result, np.ndarray):
547-
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
548-
elif isinstance(result, BaseMaskedArray):
549-
if result._mask.all():
550-
# We want an arrow null array here
551-
result = ArrowExtensionArray(pa.array([None] * len(result)))
552-
else:
553-
result = ArrowExtensionArray(
554-
pa.array(result._data, mask=result._mask)
555-
)
556-
else:
557-
result = ArrowExtensionArray(
558-
pa.array(result.to_numpy(), from_pandas=True)
559-
)
560-
561-
return result, na_count
562-
563433
@overload
564434
def _do_date_conversions(
565435
self,

pandas/io/parsers/python_parser.py

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@
2020

2121
import numpy as np
2222

23-
from pandas._libs import lib
23+
from pandas._libs import (
24+
lib,
25+
parsers,
26+
)
27+
import pandas._libs.ops as libops
28+
from pandas.compat._optional import import_optional_dependency
2429
from pandas.errors import (
2530
EmptyDataError,
2631
ParserError,
@@ -33,7 +38,9 @@
3338
from pandas.core.dtypes.common import (
3439
is_bool_dtype,
3540
is_extension_array_dtype,
41+
is_float_dtype,
3642
is_integer,
43+
is_integer_dtype,
3744
is_numeric_dtype,
3845
is_object_dtype,
3946
is_string_dtype,
@@ -44,13 +51,20 @@
4451
ExtensionDtype,
4552
)
4653
from pandas.core.dtypes.inference import is_dict_like
54+
from pandas.core.dtypes.missing import isna
4755

4856
from pandas.core import algorithms
4957
from pandas.core.arrays import (
58+
ArrowExtensionArray,
59+
BaseMaskedArray,
60+
BooleanArray,
5061
Categorical,
5162
ExtensionArray,
63+
FloatingArray,
64+
IntegerArray,
5265
)
5366
from pandas.core.arrays.boolean import BooleanDtype
67+
from pandas.core.arrays.string_ import StringDtype
5468
from pandas.core.indexes.api import Index
5569

5670
from pandas.io.common import (
@@ -549,6 +563,119 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
549563
) from err
550564
return values
551565

566+
@final
567+
def _infer_types(
568+
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
569+
) -> tuple[ArrayLike, int]:
570+
"""
571+
Infer types of values, possibly casting
572+
573+
Parameters
574+
----------
575+
values : ndarray
576+
na_values : set
577+
no_dtype_specified: Specifies if we want to cast explicitly
578+
try_num_bool : bool, default True
579+
try to cast values to numeric (first preference) or boolean
580+
581+
Returns
582+
-------
583+
converted : ndarray or ExtensionArray
584+
na_count : int
585+
"""
586+
na_count = 0
587+
if issubclass(values.dtype.type, (np.number, np.bool_)):
588+
# If our array has numeric dtype, we don't have to check for strings in isin
589+
na_values = np.array([val for val in na_values if not isinstance(val, str)])
590+
mask = algorithms.isin(values, na_values)
591+
na_count = mask.astype("uint8", copy=False).sum()
592+
if na_count > 0:
593+
if is_integer_dtype(values):
594+
values = values.astype(np.float64)
595+
np.putmask(values, mask, np.nan)
596+
return values, na_count
597+
598+
dtype_backend = self.dtype_backend
599+
non_default_dtype_backend = (
600+
no_dtype_specified and dtype_backend is not lib.no_default
601+
)
602+
result: ArrayLike
603+
604+
if try_num_bool and is_object_dtype(values.dtype):
605+
# exclude e.g. DatetimeIndex here
606+
try:
607+
result, result_mask = lib.maybe_convert_numeric(
608+
values,
609+
na_values,
610+
False,
611+
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
612+
)
613+
except (ValueError, TypeError):
614+
# e.g. encountering datetime string gets ValueError
615+
# TypeError can be raised in floatify
616+
na_count = parsers.sanitize_objects(values, na_values)
617+
result = values
618+
else:
619+
if non_default_dtype_backend:
620+
if result_mask is None:
621+
result_mask = np.zeros(result.shape, dtype=np.bool_)
622+
623+
if result_mask.all():
624+
result = IntegerArray(
625+
np.ones(result_mask.shape, dtype=np.int64), result_mask
626+
)
627+
elif is_integer_dtype(result):
628+
result = IntegerArray(result, result_mask)
629+
elif is_bool_dtype(result):
630+
result = BooleanArray(result, result_mask)
631+
elif is_float_dtype(result):
632+
result = FloatingArray(result, result_mask)
633+
634+
na_count = result_mask.sum()
635+
else:
636+
na_count = isna(result).sum()
637+
else:
638+
result = values
639+
if values.dtype == np.object_:
640+
na_count = parsers.sanitize_objects(values, na_values)
641+
642+
if result.dtype == np.object_ and try_num_bool:
643+
result, bool_mask = libops.maybe_convert_bool(
644+
np.asarray(values),
645+
true_values=self.true_values,
646+
false_values=self.false_values,
647+
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
648+
)
649+
if result.dtype == np.bool_ and non_default_dtype_backend:
650+
if bool_mask is None:
651+
bool_mask = np.zeros(result.shape, dtype=np.bool_)
652+
result = BooleanArray(result, bool_mask)
653+
elif result.dtype == np.object_ and non_default_dtype_backend:
654+
# read_excel sends array of datetime objects
655+
if not lib.is_datetime_array(result, skipna=True):
656+
dtype = StringDtype()
657+
cls = dtype.construct_array_type()
658+
result = cls._from_sequence(values, dtype=dtype)
659+
660+
if dtype_backend == "pyarrow":
661+
pa = import_optional_dependency("pyarrow")
662+
if isinstance(result, np.ndarray):
663+
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
664+
elif isinstance(result, BaseMaskedArray):
665+
if result._mask.all():
666+
# We want an arrow null array here
667+
result = ArrowExtensionArray(pa.array([None] * len(result)))
668+
else:
669+
result = ArrowExtensionArray(
670+
pa.array(result._data, mask=result._mask)
671+
)
672+
else:
673+
result = ArrowExtensionArray(
674+
pa.array(result.to_numpy(), from_pandas=True)
675+
)
676+
677+
return result, na_count
678+
552679
@cache_readonly
553680
def _have_mi_columns(self) -> bool:
554681
if self.header is None:

0 commit comments

Comments
 (0)