|
20 | 20 |
|
21 | 21 | import numpy as np
|
22 | 22 |
|
23 |
| -from pandas._libs import lib |
| 23 | +from pandas._libs import ( |
| 24 | + lib, |
| 25 | + parsers, |
| 26 | +) |
| 27 | +import pandas._libs.ops as libops |
| 28 | +from pandas.compat._optional import import_optional_dependency |
24 | 29 | from pandas.errors import (
|
25 | 30 | EmptyDataError,
|
26 | 31 | ParserError,
|
|
33 | 38 | from pandas.core.dtypes.common import (
|
34 | 39 | is_bool_dtype,
|
35 | 40 | is_extension_array_dtype,
|
| 41 | + is_float_dtype, |
36 | 42 | is_integer,
|
| 43 | + is_integer_dtype, |
37 | 44 | is_numeric_dtype,
|
38 | 45 | is_object_dtype,
|
39 | 46 | is_string_dtype,
|
|
44 | 51 | ExtensionDtype,
|
45 | 52 | )
|
46 | 53 | from pandas.core.dtypes.inference import is_dict_like
|
| 54 | +from pandas.core.dtypes.missing import isna |
47 | 55 |
|
48 | 56 | from pandas.core import algorithms
|
49 | 57 | from pandas.core.arrays import (
|
| 58 | + ArrowExtensionArray, |
| 59 | + BaseMaskedArray, |
| 60 | + BooleanArray, |
50 | 61 | Categorical,
|
51 | 62 | ExtensionArray,
|
| 63 | + FloatingArray, |
| 64 | + IntegerArray, |
52 | 65 | )
|
53 | 66 | from pandas.core.arrays.boolean import BooleanDtype
|
| 67 | +from pandas.core.arrays.string_ import StringDtype |
54 | 68 | from pandas.core.indexes.api import Index
|
55 | 69 |
|
56 | 70 | from pandas.io.common import (
|
@@ -549,6 +563,119 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
|
549 | 563 | ) from err
|
550 | 564 | return values
|
551 | 565 |
|
@final
def _infer_types(
    self, values, na_values, no_dtype_specified, try_num_bool: bool = True
) -> tuple[ArrayLike, int]:
    """
    Infer the type of a parsed column, converting it where possible.

    Parameters
    ----------
    values : ndarray
    na_values : set
        Values that are treated as missing.
    no_dtype_specified : bool
        Masked / string extension arrays are only produced when this is
        truthy and ``self.dtype_backend`` is not ``lib.no_default``.
    try_num_bool : bool, default True
        Try to cast object values to numeric (first preference) or boolean.

    Returns
    -------
    converted : ndarray or ExtensionArray
    na_count : int
    """
    na_count = 0

    # Fast path: the array is already numeric/bool, so only NA replacement
    # is left to do.
    if issubclass(values.dtype.type, (np.number, np.bool_)):
        # Strings can never match inside a numeric array, so they are left
        # out of the isin lookup.
        non_string_na = [na for na in na_values if not isinstance(na, str)]
        na_mask = algorithms.isin(values, np.array(non_string_na))
        na_count = na_mask.astype("uint8", copy=False).sum()
        if na_count > 0:
            if is_integer_dtype(values):
                # integer arrays are upcast so that NaN can be stored
                values = values.astype(np.float64)
            np.putmask(values, na_mask, np.nan)
        return values, na_count

    backend = self.dtype_backend
    use_nullable = no_dtype_specified and backend is not lib.no_default
    converted: ArrayLike

    if not (try_num_bool and is_object_dtype(values.dtype)):
        # Nothing to convert (e.g. DatetimeIndex, or conversion not wanted).
        converted = values
        if values.dtype == np.object_:
            na_count = parsers.sanitize_objects(values, na_values)
    else:
        try:
            converted, numeric_mask = lib.maybe_convert_numeric(
                values,
                na_values,
                False,
                convert_to_masked_nullable=use_nullable,  # type: ignore[arg-type]
            )
        except (ValueError, TypeError):
            # ValueError: e.g. a datetime string was encountered
            # TypeError: can be raised in floatify
            na_count = parsers.sanitize_objects(values, na_values)
            converted = values
        else:
            if not use_nullable:
                na_count = isna(converted).sum()
            else:
                if numeric_mask is None:
                    numeric_mask = np.zeros(converted.shape, dtype=np.bool_)

                if numeric_mask.all():
                    # every entry is masked: use an int64 placeholder array
                    placeholder = np.ones(numeric_mask.shape, dtype=np.int64)
                    converted = IntegerArray(placeholder, numeric_mask)
                elif is_integer_dtype(converted):
                    converted = IntegerArray(converted, numeric_mask)
                elif is_bool_dtype(converted):
                    converted = BooleanArray(converted, numeric_mask)
                elif is_float_dtype(converted):
                    converted = FloatingArray(converted, numeric_mask)

                na_count = numeric_mask.sum()

    # Still object dtype after the numeric attempt: try booleans next.
    if converted.dtype == np.object_ and try_num_bool:
        converted, truth_mask = libops.maybe_convert_bool(
            np.asarray(values),
            true_values=self.true_values,
            false_values=self.false_values,
            convert_to_masked_nullable=use_nullable,  # type: ignore[arg-type]
        )
        if use_nullable:
            if converted.dtype == np.bool_:
                if truth_mask is None:
                    truth_mask = np.zeros(converted.shape, dtype=np.bool_)
                converted = BooleanArray(converted, truth_mask)
            elif converted.dtype == np.object_:
                # read_excel sends array of datetime objects; those are
                # left untouched, everything else becomes a string array
                if not lib.is_datetime_array(converted, skipna=True):
                    string_dtype = StringDtype()
                    string_cls = string_dtype.construct_array_type()
                    converted = string_cls._from_sequence(
                        values, dtype=string_dtype
                    )

    if backend == "pyarrow":
        pa = import_optional_dependency("pyarrow")
        if isinstance(converted, np.ndarray):
            arrow_data = pa.array(converted, from_pandas=True)
        elif isinstance(converted, BaseMaskedArray):
            if converted._mask.all():
                # We want an arrow null array here
                arrow_data = pa.array([None] * len(converted))
            else:
                arrow_data = pa.array(converted._data, mask=converted._mask)
        else:
            arrow_data = pa.array(converted.to_numpy(), from_pandas=True)
        converted = ArrowExtensionArray(arrow_data)

    return converted, na_count
| 678 | + |
552 | 679 | @cache_readonly
|
553 | 680 | def _have_mi_columns(self) -> bool:
|
554 | 681 | if self.header is None:
|
|
0 commit comments