API: Infer extension types in array (#29799)

TomAugspurger · jreback · commit 83812e1ba93b · 2019-12-02T12:38:48.000-05:00
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
@@ -25,8 +25,7 @@ numbers.
 
 Pandas can represent integer data with possibly missing values using
 :class:`arrays.IntegerArray`. This is an :ref:`extension types <extending.extension-types>`
-implemented within pandas. It is not the default dtype for integers, and will not be inferred;
-you must explicitly pass the dtype into :meth:`array` or :class:`Series`:
+implemented within pandas.
 
 .. ipython:: python
 
@@ -50,24 +49,43 @@ NumPy array.
 You can also pass the list-like object to the :class:`Series` constructor
 with the dtype.
 
-.. ipython:: python
+.. warning::
 
-   s = pd.Series([1, 2, np.nan], dtype="Int64")
-   s
+   Currently :meth:`pandas.array` and :class:`pandas.Series` use different
+   rules for dtype inference. :meth:`pandas.array` will infer a
+   nullable-integer dtype:
 
-By default (if you don't specify ``dtype``), NumPy is used, and you'll end
-up with a ``float64`` dtype Series:
+   .. ipython:: python
 
-.. ipython:: python
+      pd.array([1, None])
+      pd.array([1, 2])
+
+   For backwards compatibility, :class:`Series` infers these as either
+   integer or float dtype:
+
+   .. ipython:: python
+
+      pd.Series([1, None])
+      pd.Series([1, 2])
 
-   pd.Series([1, 2, np.nan])
+   We recommend explicitly providing the dtype to avoid confusion.
+
+   .. ipython:: python
+
+      pd.array([1, None], dtype="Int64")
+      pd.Series([1, None], dtype="Int64")
+
+   In the future, we may provide an option for :class:`Series` to infer a
+   nullable-integer dtype.
 
 Operations involving an integer array will behave similar to NumPy arrays.
 Missing values will be propagated, and the data will be coerced to another
 dtype if needed.
 
 .. ipython:: python
 
+   s = pd.Series([1, 2, None], dtype="Int64")
+
    # arithmetic
    s + 1
 
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -303,6 +303,38 @@ The following methods now also correctly output values for unobserved categories
 
    df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
 
+:meth:`pandas.array` inference changes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`):
+
+1. String data (including missing values) now returns a :class:`arrays.StringArray`.
+2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`.
+3. Boolean data (including missing values) now returns the new :class:`arrays.BooleanArray`.
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+   >>> pd.array(["a", None])
+   <PandasArray>
+   ['a', None]
+   Length: 2, dtype: object
+
+   >>> pd.array([1, None])
+   <PandasArray>
+   [1, None]
+   Length: 2, dtype: object
+
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   pd.array(["a", None])
+   pd.array([1, None])
+
+As a reminder, you can specify the ``dtype`` to disable all inference.
 
 By default :meth:`Categorical.min` now returns the minimum instead of np.nan
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -408,7 +440,6 @@ Other API changes
 - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter.
   Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`)
 - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`)
--
 
 
 .. _whatsnew_1000.api.documentation:
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1313,7 +1313,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
 
     elif isinstance(val, str):
         if is_string_array(values, skipna=skipna):
-            return 'string'
+            return "string"
 
     elif isinstance(val, bytes):
         if is_bytes_array(values, skipna=skipna):
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -94,10 +94,19 @@ def array(
         :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
         :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
         :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
+        :class:`int`                   :class:`pandas.arrays.IntegerArray`
+        :class:`str`                   :class:`pandas.arrays.StringArray`
+        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
         ============================== =====================================
 
         For all other cases, NumPy's usual inference rules will be used.
 
+        .. versionchanged:: 1.0.0
+
+           Pandas infers nullable-integer dtype for integer data,
+           string dtype for string data, and nullable-boolean dtype
+           for boolean data.
+
     copy : bool, default True
         Whether to copy the data, even if not necessary. Depending
         on the type of `data`, creating the new array may require
@@ -154,14 +163,6 @@ def array(
     ['a', 'b']
     Length: 2, dtype: str32
 
-    Or use the dedicated constructor for the array you're expecting, and
-    wrap that in a PandasArray
-
-    >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
-    <PandasArray>
-    ['a', 'b']
-    Length: 2, dtype: str32
-
     Finally, Pandas has arrays that mostly overlap with NumPy
 
       * :class:`arrays.DatetimeArray`
@@ -184,20 +185,28 @@ def array(
 
     Examples
     --------
-    If a dtype is not specified, `data` is passed through to
-    :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.
+    If a dtype is not specified, pandas will infer the best dtype from the values.
+    See the description of `dtype` for the types pandas can infer.
 
     >>> pd.array([1, 2])
-    <PandasArray>
+    <IntegerArray>
     [1, 2]
-    Length: 2, dtype: int64
+    Length: 2, dtype: Int64
 
-    Or the NumPy dtype can be specified
+    >>> pd.array([1, 2, np.nan])
+    <IntegerArray>
+    [1, 2, NaN]
+    Length: 3, dtype: Int64
 
-    >>> pd.array([1, 2], dtype=np.dtype("int32"))
-    <PandasArray>
-    [1, 2]
-    Length: 2, dtype: int32
+    >>> pd.array(["a", None, "c"])
+    <StringArray>
+    ['a', nan, 'c']
+    Length: 3, dtype: string
+
+    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
+    <PeriodArray>
+    ['2000-01-01', '2000-01-01']
+    Length: 2, dtype: period[D]
 
     You can use the string alias for `dtype`
 
@@ -212,29 +221,24 @@ def array(
     [a, b, a]
     Categories (3, object): [a < b < c]
 
-    Because omitting the `dtype` passes the data through to NumPy,
-    a mixture of valid integers and NA will return a floating-point
-    NumPy array.
+    If pandas does not infer a dedicated extension type, a
+    :class:`arrays.PandasArray` is returned.
 
-    >>> pd.array([1, 2, np.nan])
+    >>> pd.array([1.1, 2.2])
     <PandasArray>
-    [1.0,  2.0, nan]
-    Length: 3, dtype: float64
-
-    To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
-    the dtype:
+    [1.1, 2.2]
+    Length: 2, dtype: float64
 
-    >>> pd.array([1, 2, np.nan], dtype='Int64')
-    <IntegerArray>
-    [1, 2, NaN]
-    Length: 3, dtype: Int64
+    As mentioned in the "Notes" section, new extension types may be added
+    in the future (by pandas or 3rd party libraries), causing the return
+    value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
+    as a NumPy dtype if you need to ensure there's no future change in
+    behavior.
 
-    Pandas will infer an ExtensionArray for some types of data:
-
-    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
-    <PeriodArray>
-    ['2000-01-01', '2000-01-01']
-    Length: 2, dtype: period[D]
+    >>> pd.array([1, 2], dtype=np.dtype("int32"))
+    <PandasArray>
+    [1, 2]
+    Length: 2, dtype: int32
 
     `data` must be 1-dimensional. A ValueError is raised when the input
     has the wrong dimensionality.
@@ -246,21 +250,26 @@ def array(
     """
     from pandas.core.arrays import (
         period_array,
+        BooleanArray,
+        IntegerArray,
         IntervalArray,
         PandasArray,
         DatetimeArray,
         TimedeltaArray,
+        StringArray,
     )
 
     if lib.is_scalar(data):
         msg = "Cannot pass scalar '{}' to 'pandas.array'."
         raise ValueError(msg.format(data))
 
-    data = extract_array(data, extract_numpy=True)
-
-    if dtype is None and isinstance(data, ABCExtensionArray):
+    if dtype is None and isinstance(
+        data, (ABCSeries, ABCIndexClass, ABCExtensionArray)
+    ):
         dtype = data.dtype
 
+    data = extract_array(data, extract_numpy=True)
+
     # this returns None for not-found dtypes.
     if isinstance(dtype, str):
         dtype = registry.find(dtype) or dtype
@@ -270,7 +279,7 @@ def array(
         return cls._from_sequence(data, dtype=dtype, copy=copy)
 
     if dtype is None:
-        inferred_dtype = lib.infer_dtype(data, skipna=False)
+        inferred_dtype = lib.infer_dtype(data, skipna=True)
         if inferred_dtype == "period":
             try:
                 return period_array(data, copy=copy)
@@ -298,7 +307,14 @@ def array(
             # timedelta, timedelta64
             return TimedeltaArray._from_sequence(data, copy=copy)
 
-        # TODO(BooleanArray): handle this type
+        elif inferred_dtype == "string":
+            return StringArray._from_sequence(data, copy=copy)
+
+        elif inferred_dtype == "integer":
+            return IntegerArray._from_sequence(data, copy=copy)
+
+        elif inferred_dtype == "boolean":
+            return BooleanArray._from_sequence(data, copy=copy)
 
     # Pandas overrides NumPy for
     #   1. datetime64[ns]
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
@@ -19,14 +19,18 @@
     "data, dtype, expected",
     [
         # Basic NumPy defaults.
-        ([1, 2], None, PandasArray(np.array([1, 2]))),
+        ([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])),
         ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
         (
             [1, 2],
             np.dtype("float32"),
             PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
         ),
-        (np.array([1, 2]), None, PandasArray(np.array([1, 2]))),
+        (
+            np.array([1, 2], dtype="int64"),
+            None,
+            pd.arrays.IntegerArray._from_sequence([1, 2]),
+        ),
         # String alias passes through to NumPy
         ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
         # Period alias
@@ -113,6 +117,20 @@
         # IntegerNA
         ([1, None], "Int16", integer_array([1, None], dtype="Int16")),
         (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
+        # String
+        (["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])),
+        (
+            ["a", None],
+            pd.StringDtype(),
+            pd.arrays.StringArray._from_sequence(["a", None]),
+        ),
+        # Boolean
+        ([True, None], "boolean", pd.arrays.BooleanArray._from_sequence([True, None])),
+        (
+            [True, None],
+            pd.BooleanDtype(),
+            pd.arrays.BooleanArray._from_sequence([True, None]),
+        ),
         # Index
         (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
         # Series[EA] returns the EA
@@ -139,15 +157,15 @@ def test_array(data, dtype, expected):
 def test_array_copy():
     a = np.array([1, 2])
     # default is to copy
-    b = pd.array(a)
+    b = pd.array(a, dtype=a.dtype)
     assert np.shares_memory(a, b._ndarray) is False
 
     # copy=True
-    b = pd.array(a, copy=True)
+    b = pd.array(a, dtype=a.dtype, copy=True)
     assert np.shares_memory(a, b._ndarray) is False
 
     # copy=False
-    b = pd.array(a, copy=False)
+    b = pd.array(a, dtype=a.dtype, copy=False)
     assert np.shares_memory(a, b._ndarray) is True
 
 
@@ -211,6 +229,15 @@ def test_array_copy():
             np.array([1, 2], dtype="m8[us]"),
             pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")),
         ),
+        # integer
+        ([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])),
+        ([1, None], pd.arrays.IntegerArray._from_sequence([1, None])),
+        # string
+        (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])),
+        (["a", None], pd.arrays.StringArray._from_sequence(["a", None])),
+        # Boolean
+        ([True, False], pd.arrays.BooleanArray._from_sequence([True, False])),
+        ([True, None], pd.arrays.BooleanArray._from_sequence([True, None])),
     ],
 )
 def test_array_inference(data, expected):
@@ -241,7 +268,7 @@ def test_array_inference_fails(data):
 @pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]])
 def test_nd_raises(data):
     with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"):
-        pd.array(data)
+        pd.array(data, dtype="int64")
 
 
 def test_scalar_raises():
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -732,12 +732,17 @@ def test_string(self):
     def test_unicode(self):
         arr = ["a", np.nan, "c"]
         result = lib.infer_dtype(arr, skipna=False)
+        # This currently returns "mixed", but it's not clear that's optimal.
+        # This could also return "string" or "mixed-string"
         assert result == "mixed"
 
         arr = ["a", np.nan, "c"]
         result = lib.infer_dtype(arr, skipna=True)
-        expected = "string"
-        assert result == expected
+        assert result == "string"
+
+        arr = ["a", "c"]
+        result = lib.infer_dtype(arr, skipna=False)
+        assert result == "string"
 
     @pytest.mark.parametrize(
         "dtype, missing, skipna, expected",
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py