From ca6c97924eb1ec288422da59cd031f650d037b5d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 16:31:55 -0500 Subject: [PATCH 01/21] BUG: Fix FloatingArray output formatting --- pandas/io/formats/format.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b9d41f142c2b5..7df50b815f32d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1311,7 +1311,7 @@ def _format_strings(self) -> List[str]: float_format = get_option("display.float_format") if float_format is None: precision = get_option("display.precision") - float_format = lambda x: f"{x: .{precision:d}g}" + float_format = lambda x: f"{x: .{precision:d}f}" else: float_format = self.float_format @@ -1372,6 +1372,8 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) + fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") + return fmt_values @@ -1473,9 +1475,9 @@ def format_values_with(float_format): if self.fixed_width: if is_complex: - result = _trim_zeros_complex(values, self.decimal, na_rep) + result = _trim_zeros_complex(values, self.decimal) else: - result = _trim_zeros_float(values, self.decimal, na_rep) + result = _trim_zeros_float(values, self.decimal) return np.asarray(result, dtype="object") return values @@ -1855,21 +1857,19 @@ def just(x): return result -def _trim_zeros_complex( - str_complexes: np.ndarray, decimal: str = ".", na_rep: str = "NaN" -) -> List[str]: +def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. """ return [ - "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal, na_rep)) + "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal)) for x in str_complexes ] def _trim_zeros_float( - str_floats: Union[np.ndarray, List[str]], decimal: str = ".", na_rep: str = "NaN" + str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: """ Trims zeros, leaving just one before the decimal points if need be. @@ -1877,7 +1877,7 @@ def _trim_zeros_float( trimmed = str_floats def _is_number(x): - return x != na_rep and not x.endswith("inf") + return re.match(r"-?[0-9]+(\.[0-9]+)?", x.strip()) is not None def _cond(values): finite = [x for x in values if _is_number(x)] From adcc26c355b3cec087514e8354d855c9ebad2c2c Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 16:34:41 -0500 Subject: [PATCH 02/21] fixup --- pandas/tests/io/formats/test_format.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 68f5386fff7be..5125606845eaf 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3432,3 +3432,7 @@ def test_format_remove_leading_space_dataframe(input_array, expected): # GH: 24980 df = pd.DataFrame(input_array).to_string(index=False) assert df == expected + + +def test_to_string_nullable_float(): + assert False From 4663e1076d177a953347c0e45d7d08919f095918 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 18:29:00 -0500 Subject: [PATCH 03/21] fixup --- pandas/io/formats/format.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7df50b815f32d..488f5ac57bf6c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1862,6 +1862,7 @@ def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[s Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. """ + # somehow pad to same length? return [ "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal)) for x in str_complexes @@ -1877,7 +1878,7 @@ def _trim_zeros_float( trimmed = str_floats def _is_number(x): - return re.match(r"-?[0-9]+(\.[0-9]+)?", x.strip()) is not None + return re.match(r"\s*-?[0-9]+(\.[0-9]+)?\s*", x) is not None def _cond(values): finite = [x for x in values if _is_number(x)] From 406ed4d41e65ad408c0d0ca357751adaa5b16437 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 19:56:19 -0500 Subject: [PATCH 04/21] CLN: Clean float / complex string formatting --- pandas/io/formats/format.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b9d41f142c2b5..52c3acf4d52e2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1473,9 +1473,9 @@ def format_values_with(float_format): if self.fixed_width: if is_complex: - result = _trim_zeros_complex(values, self.decimal, na_rep) + result = values else: - result = _trim_zeros_float(values, self.decimal, na_rep) + result = _trim_zeros_float(values, self.decimal) return np.asarray(result, dtype="object") return values @@ -1855,21 +1855,8 @@ def just(x): return result -def _trim_zeros_complex( - str_complexes: np.ndarray, decimal: str = ".", na_rep: str = "NaN" -) -> List[str]: - """ - Separates the real and imaginary parts from the complex number, and - executes the _trim_zeros_float method on each of those. - """ - return [ - "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal, na_rep)) - for x in str_complexes - ] - - def _trim_zeros_float( - str_floats: Union[np.ndarray, List[str]], decimal: str = ".", na_rep: str = "NaN" + str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: """ Trims zeros, leaving just one before the decimal points if need be. @@ -1877,7 +1864,7 @@ def _trim_zeros_float( trimmed = str_floats def _is_number(x): - return x != na_rep and not x.endswith("inf") + return re.match(r"\s*-?[0-9]+(\.[0-9]+)?", x) is not None def _cond(values): finite = [x for x in values if _is_number(x)] From a1228c9f9d50d05dce435da2f27cd55903b547ef Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 20:22:05 -0500 Subject: [PATCH 05/21] Fix --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 52c3acf4d52e2..653aec2f2925d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1864,7 +1864,7 @@ def _trim_zeros_float( trimmed = str_floats def _is_number(x): - return re.match(r"\s*-?[0-9]+(\.[0-9]+)?", x) is not None + return re.match(r"\s*-?[0-9]+(\.[0-9]*)?", x) is not None def _cond(values): finite = [x for x in values if _is_number(x)] From 77b88fc5b4c39da04339f6348b5a848459b7173f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 20:33:58 -0500 Subject: [PATCH 06/21] Test --- pandas/tests/io/formats/test_format.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 5125606845eaf..104b21f6edad3 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3434,5 +3434,11 @@ def test_format_remove_leading_space_dataframe(input_array, expected): assert df == expected -def test_to_string_nullable_float(): - assert False +@pytest.mark.parametrize("dtype", ["Float32", "Float64"]) +def test_to_string_nullable_float(dtype): + s = pd.Series([0.0, 1.0, None], dtype=dtype) + result = s.to_string() + expected = """0 0.0 +1 1.0 +2 """ + assert result == expected From 8743b5bd27944885ec86f3427a4f7640d49ea8ed Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 1 Oct 2020 20:35:04 -0500 Subject: [PATCH 07/21] Test --- pandas/tests/io/formats/test_format.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 104b21f6edad3..d57ed96effe37 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3435,7 +3435,8 @@ def test_format_remove_leading_space_dataframe(input_array, expected): @pytest.mark.parametrize("dtype", ["Float32", "Float64"]) -def test_to_string_nullable_float(dtype): +def test_nullable_float_to_string(dtype): + # https://github.com/pandas-dev/pandas/issues/36775 s = pd.Series([0.0, 1.0, None], dtype=dtype) result = s.to_string() expected = """0 0.0 From a235611d64bb1118965e7297c3e16ed06d36eba6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 2 Oct 2020 09:07:28 -0500 Subject: [PATCH 08/21] Fixture --- pandas/tests/io/formats/test_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d57ed96effe37..4344ef9a856b3 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3434,9 +3434,9 @@ def test_format_remove_leading_space_dataframe(input_array, expected): assert df == expected -@pytest.mark.parametrize("dtype", ["Float32", "Float64"]) -def test_nullable_float_to_string(dtype): +def test_nullable_float_to_string(float_ea_dtype): # https://github.com/pandas-dev/pandas/issues/36775 + dtype = float_ea_dtype s = pd.Series([0.0, 1.0, None], dtype=dtype) result = s.to_string() expected = """0 0.0 From d775c33f45a3b9391a36c7c3446b9c89a23c44d8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 09:46:46 -0500 Subject: [PATCH 09/21] Update format.py --- pandas/io/formats/format.py | 38 +++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 515ff168464de..f5908cf7ca7bb 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1311,7 +1311,7 @@ def _format_strings(self) -> List[str]: float_format = get_option("display.float_format") if float_format is None: precision = get_option("display.precision") - float_format = lambda x: f"{x: .{precision:d}f}" + float_format = lambda x: f"{x: .{precision:d}g}" else: float_format = self.float_format @@ -1372,8 +1372,6 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) - fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") - return fmt_values @@ -1409,6 +1407,7 @@ def _value_formatter( if float_format: def base_formatter(v): + assert float_format is not None # for mypy return float_format(value=v) if notna(v) else self.na_rep else: @@ -1475,7 +1474,7 @@ def format_values_with(float_format): if self.fixed_width: if is_complex: - result = values + result = _trim_zeros_complex(values, self.decimal) else: result = _trim_zeros_float(values, self.decimal) return np.asarray(result, dtype="object") @@ -1679,7 +1678,8 @@ def is_dates_only( values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] ) -> bool: # return a boolean if we are only dates (and don't have a timezone) - values = values.ravel() + if not isinstance(values, Index): + values = values.ravel() values = DatetimeIndex(values) if values.tz is not None: @@ -1857,6 +1857,31 @@ def just(x): return result +def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[str]: + """ + Separates the real and imaginary parts from the complex number, and + executes the _trim_zeros_float method on each of those. + """ + trimmed = [ + "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal)) + for x in str_complexes + ] + + # pad strings to the length of the longest trimmed string for alignment + lengths = [len(s) for s in trimmed] + max_length = max(lengths) + padded = [ + s[: -((k - 1) // 2 + 1)] # real part + + (max_length - k) // 2 * "0" + + s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / - + + s[-((k - 1) // 2) : -1] # imaginary part + + (max_length - k) // 2 * "0" + + s[-1] + for s, k in zip(trimmed, lengths) + ] + return padded + + def _trim_zeros_float( str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: @@ -1864,9 +1889,10 @@ def _trim_zeros_float( Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats + number_regex = re.compile(fr"\s*[\+-]?[0-9]+(\{decimal}[0-9]*)?") def _is_number(x): - return re.match(r"\s*-?[0-9]+(\.[0-9]*)?", x) is not None + return re.match(number_regex, x) is not None def _cond(values): finite = [x for x in values if _is_number(x)] From 00c15ba017e857dc21ae844363648a26327b6e54 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 09:51:09 -0500 Subject: [PATCH 10/21] Reapply --- pandas/io/formats/format.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f5908cf7ca7bb..bf44bfe61921c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1311,7 +1311,7 @@ def _format_strings(self) -> List[str]: float_format = get_option("display.float_format") if float_format is None: precision = get_option("display.precision") - float_format = lambda x: f"{x: .{precision:d}g}" + float_format = lambda x: f"{x: .{precision:d}f}" else: float_format = self.float_format @@ -1372,6 +1372,8 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) + fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") + return fmt_values From 06ef3376a38edef54e6d12649f58ec31996b6b89 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 09:59:13 -0500 Subject: [PATCH 11/21] Update --- pandas/core/frame.py | 248 ++++++++++++++++++++++--------------------- 1 file changed, 127 insertions(+), 121 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9b2540a1ce043..f8b8a2c6b6d10 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -83,11 +83,11 @@ infer_dtype_from_scalar, invalidate_string_dtypes, maybe_cast_to_datetime, + maybe_casted_values, maybe_convert_platform, maybe_downcast_to_dtype, maybe_infer_to_datetimelike, maybe_upcast, - maybe_upcast_putmask, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -100,6 +100,7 @@ is_dict_like, is_dtype_equal, is_extension_array_dtype, + is_float, is_float_dtype, is_hashable, is_integer, @@ -113,21 +114,23 @@ needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna +from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import reconstruct_func, relabel_result, transform +from pandas.core.aggregation import ( + aggregate, + reconstruct_func, + relabel_result, + transform, +) from pandas.core.arrays import Categorical, ExtensionArray -from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences -from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import MultiIndex, maybe_droplevels -from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( @@ -630,7 +633,6 @@ def _is_homogeneous_type(self) -> bool: if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: - # Note: consolidates inplace return not self._is_mixed_type @property @@ -638,10 +640,10 @@ def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. """ - if self._data.any_extension_types: + if self._mgr.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False - return len(self._data.blocks) == 1 + return len(self._mgr.blocks) == 1 # ---------------------------------------------------------------------- # Rendering Methods @@ -1216,7 +1218,14 @@ def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - return self.T.dot(np.transpose(other)).T + try: + return self.T.dot(np.transpose(other)).T + except ValueError as err: + if "shape mismatch" not in str(err): + raise + # GH#21581 give exception message for original shapes + msg = f"shapes {np.shape(other)} and {self.shape} not aligned" + raise ValueError(msg) from err # ---------------------------------------------------------------------- # IO methods (to / from other formats) @@ -2337,7 +2346,7 @@ def to_parquet( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -2650,11 +2659,11 @@ def memory_usage(self, index=True, deep=False) -> Series: >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool - 0 1 1.0 1.000000+0.000000j 1 True - 1 1 1.0 1.000000+0.000000j 1 True - 2 1 1.0 1.000000+0.000000j 1 True - 3 1 1.0 1.000000+0.000000j 1 True - 4 1 1.0 1.000000+0.000000j 1 True + 0 1 1.0 1.0+0.0j 1 True + 1 1 1.0 1.0+0.0j 1 True + 2 1 1.0 1.0+0.0j 1 True + 3 1 1.0 1.0+0.0j 1 True + 4 1 1.0 1.0+0.0j 1 True >>> df.memory_usage() Index 128 @@ -2872,7 +2881,7 @@ def _get_column_array(self, i: int) -> ArrayLike: Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) """ - return self._data.iget_values(i) + return self._mgr.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ @@ -3244,8 +3253,9 @@ def query(self, expr, inplace=False, **kwargs): Returns ------- - DataFrame - DataFrame resulting from the provided query expression. + DataFrame or None + DataFrame resulting from the provided query expression or + None if ``inplace=True``. See Also -------- @@ -3392,8 +3402,8 @@ def eval(self, expr, inplace=False, **kwargs): Returns ------- - ndarray, scalar, or pandas object - The result of the evaluation. + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. See Also -------- @@ -4110,8 +4120,9 @@ def drop( Returns ------- - DataFrame - DataFrame without the removed index or column labels. + DataFrame or None + DataFrame without the removed index or column labels or + None if ``inplace=True``. Raises ------ @@ -4265,8 +4276,8 @@ def rename( Returns ------- - DataFrame - DataFrame with the renamed axis labels. + DataFrame or None + DataFrame with the renamed axis labels or None if ``inplace=True``. Raises ------ @@ -4316,7 +4327,7 @@ def rename( Traceback (most recent call last): KeyError: ['C'] not found in axis - Using axis-style parameters + Using axis-style parameters: >>> df.rename(str.lower, axis='columns') a b @@ -4458,7 +4469,34 @@ def _replace_columnwise( return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> DataFrame: + def shift( + self, periods=1, freq=None, axis=0, fill_value=lib.no_default + ) -> DataFrame: + axis = self._get_axis_number(axis) + + ncols = len(self.columns) + if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: + # We will infer fill_value to match the closest column + + if periods > 0: + result = self.iloc[:, :-periods] + for col in range(min(ncols, abs(periods))): + # TODO(EA2D): doing this in a loop unnecessary with 2D EAs + # Define filler inside loop so we get a copy + filler = self.iloc[:, 0].shift(len(self)) + result.insert(0, col, filler, allow_duplicates=True) + else: + result = self.iloc[:, -periods:] + for col in range(min(ncols, abs(periods))): + # Define filler inside loop so we get a copy + filler = self.iloc[:, -1].shift(len(self)) + result.insert( + len(result.columns), col, filler, allow_duplicates=True + ) + + result.columns = self.columns.copy() + return result + return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4494,8 +4532,8 @@ def set_index( Returns ------- - DataFrame - Changed row labels. + DataFrame or None + Changed row labels or None if ``inplace=True``. See Also -------- @@ -4808,46 +4846,6 @@ class max type else: new_obj = self.copy() - def _maybe_casted_values(index, labels=None): - values = index._values - if not isinstance(index, (PeriodIndex, DatetimeIndex)): - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) - - # if we have the labels, extract the values with a mask - if labels is not None: - mask = labels == -1 - - # we can have situations where the whole mask is -1, - # meaning there is nothing found in labels, so make all nan's - if mask.size > 0 and mask.all(): - dtype = index.dtype - fill_value = na_value_for_dtype(dtype) - values = construct_1d_arraylike_from_scalar( - fill_value, len(mask), dtype - ) - else: - values = values.take(labels) - - # TODO(https://github.com/pandas-dev/pandas/issues/24206) - # Push this into maybe_upcast_putmask? - # We can't pass EAs there right now. Looks a bit - # complicated. - # So we unbox the ndarray_values, op, re-box. - values_type = type(values) - values_dtype = values.dtype - - if issubclass(values_type, DatetimeLikeArray): - values = values._data # TODO: can we de-kludge yet? - - if mask.any(): - values, _ = maybe_upcast_putmask(values, mask, np.nan) - - if issubclass(values_type, DatetimeLikeArray): - values = values_type(values, dtype=values_dtype) - - return values - new_index = ibase.default_index(len(new_obj)) if level is not None: if not isinstance(level, (tuple, list)): @@ -4890,7 +4888,7 @@ def _maybe_casted_values(index, labels=None): name_lst += [col_fill] * missing name = tuple(name_lst) # to ndarray and maybe infer different dtype - level_values = _maybe_casted_values(lev, lab) + level_values = maybe_casted_values(lev, lab) new_obj.insert(0, name, level_values) new_obj.index = new_index @@ -4904,7 +4902,7 @@ def _maybe_casted_values(index, labels=None): @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> DataFrame: - result = self._constructor(self._data.isna(func=isna)) + result = self._constructor(self._mgr.isna(func=isna)) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) @@ -4957,8 +4955,8 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): Returns ------- - DataFrame - DataFrame with NA entries dropped from it. + DataFrame or None + DataFrame with NA entries dropped from it or None if ``inplace=True``. See Also -------- @@ -5093,7 +5091,7 @@ def drop_duplicates( Returns ------- - DataFrame + DataFrame or None DataFrame with duplicates removed or None if ``inplace=True``. See Also @@ -5416,8 +5414,8 @@ def sort_index( Returns ------- - DataFrame - The original DataFrame sorted by the labels. + DataFrame or None + The original DataFrame sorted by the labels or None if ``inplace=True``. See Also -------- @@ -7208,13 +7206,13 @@ def melt( Difference with previous column >>> df.diff(axis=1) - a b c - 0 NaN 0.0 0.0 - 1 NaN -1.0 3.0 - 2 NaN -1.0 7.0 - 3 NaN -1.0 13.0 - 4 NaN 0.0 20.0 - 5 NaN 2.0 28.0 + a b c + 0 NaN 0 0 + 1 NaN -1 3 + 2 NaN -1 7 + 3 NaN -1 13 + 4 NaN 0 20 + 5 NaN 2 28 Difference with 3rd previous row @@ -7248,12 +7246,15 @@ def melt( ), ) def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: + if not isinstance(periods, int): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) bm_axis = self._get_block_manager_axis(axis) - self._consolidate_inplace() if bm_axis == 0 and periods != 0: - return self.T.diff(periods, axis=0).T + return self - self.shift(periods, axis=axis) # type: ignore[operator] new_data = self._mgr.diff(n=periods, axis=bm_axis) return self._constructor(new_data) @@ -7263,7 +7264,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: def _gotitem( self, - key: Union[str, List[str]], + key: Union[Label, List[Label]], ndim: int, subset: Optional[FrameOrSeriesUnion] = None, ) -> FrameOrSeriesUnion: @@ -7369,7 +7370,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): result = None try: - result, how = self._aggregate(func, axis=axis, *args, **kwargs) + result, how = self._aggregate(func, axis, *args, **kwargs) except TypeError as err: exc = TypeError( "DataFrame constructor called with " @@ -7397,10 +7398,10 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose # only result - result, how = self.T._aggregate(arg, *args, **kwargs) + result, how = aggregate(self.T, arg, *args, **kwargs) result = result.T if result is not None else result return result, how - return super()._aggregate(arg, *args, **kwargs) + return aggregate(self, arg, *args, **kwargs) agg = aggregate @@ -8568,6 +8569,7 @@ def _reduce( ): assert filter_type is None or filter_type == "bool", filter_type + out_dtype = "bool" if filter_type == "bool" else None dtype_is_dt = np.array( [ @@ -8582,15 +8584,15 @@ def _reduce( "will include datetime64 and datetime64tz columns in a " "future version.", FutureWarning, - stacklevel=3, + stacklevel=5, ) cols = self.columns[~dtype_is_dt] self = self[cols] - # TODO: Make other agg func handle axis=None properly + any_object = self.dtypes.apply(is_object_dtype).any() + # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) labels = self._get_agg_axis(axis) - constructor = self._constructor assert axis in [0, 1] def func(values): @@ -8599,21 +8601,32 @@ def func(values): else: return op(values, axis=axis, skipna=skipna, **kwds) + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) + def _get_data() -> DataFrame: if filter_type is None: data = self._get_numeric_data() - elif filter_type == "bool": + else: # GH#25101, GH#24434 + assert filter_type == "bool" data = self._get_bool_data() - else: # pragma: no cover - msg = ( - f"Generating numeric_only data with filter_type {filter_type} " - "not supported." - ) - raise NotImplementedError(msg) return data - if numeric_only is not None: + if numeric_only is not None or ( + numeric_only is None + and axis == 0 + and not any_object + and not self._mgr.any_extension_types + ): + # For numeric_only non-None and axis non-None, we know + # which blocks to use and no try/except is needed. + # For numeric_only=None only the case with axis==0 and no object + # dtypes are unambiguous can be handled with BlockManager.reduce + # Case with EAs see GH#35881 df = self if numeric_only is True: df = _get_data() @@ -8621,22 +8634,18 @@ def _get_data() -> DataFrame: df = df.T axis = 0 - out_dtype = "bool" if filter_type == "bool" else None - - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=1, skipna=skipna, **kwds) + ignore_failures = numeric_only is None # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce - res = df._mgr.reduce(blk_func) - out = df._constructor(res).iloc[0].rename(None) + res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) + out = df._constructor(res).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): - out[:] = coerce_to_dtypes(out.values, df.dtypes) + # GH#35865 careful to cast explicitly to object + nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) + out[:] = np.array(nvs, dtype=object) return out assert numeric_only is None @@ -8644,11 +8653,10 @@ def blk_func(values): if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call - if filter_type is None and axis == 0 and len(self) > 0: + if filter_type is None and axis == 0: # operate column-wise # numeric_only must be None here, as other cases caught above - # require len(self) > 0 bc frame_apply messes up empty prod/sum # this can end up with a non-reduction # but not always. if the types are mixed @@ -8684,19 +8692,17 @@ def blk_func(values): with np.errstate(all="ignore"): result = func(values) - if is_object_dtype(result.dtype): + if filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + elif filter_type is None and is_object_dtype(result.dtype): try: - if filter_type is None: - result = result.astype(np.float64) - elif filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) + result = result.astype(np.float64) except (ValueError, TypeError): # try to coerce to the original dtypes item by item if we can if axis == 0: result = coerce_to_dtypes(result, data.dtypes) - if constructor is not None: - result = self._constructor_sliced(result, index=labels) + result = self._constructor_sliced(result, index=labels) return result def nunique(self, axis=0, dropna=True) -> Series: @@ -9279,8 +9285,8 @@ def _AXIS_NAMES(self) -> Dict[int, str]: ops.add_special_arithmetic_methods(DataFrame) -def _from_nested_dict(data): - new_data = collections.defaultdict(dict) +def _from_nested_dict(data) -> collections.defaultdict: + new_data: collections.defaultdict = collections.defaultdict(dict) for index, s in data.items(): for col, v in s.items(): new_data[col][index] = v From 32264e94c15de704423d8a30460cde5c9ebb7629 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 20:04:08 -0500 Subject: [PATCH 12/21] Hack --- pandas/io/formats/format.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index bf44bfe61921c..7ce2ddc504ff9 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1891,27 +1891,31 @@ def _trim_zeros_float( Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats - number_regex = re.compile(fr"\s*[\+-]?[0-9]+(\{decimal}[0-9]*)?") + number_regex = re.compile(r"\s*[\+-]?[0-9]+[\.,]?([0-9]*)?") - def _is_number(x): - return re.match(number_regex, x) is not None - - def _cond(values): - finite = [x for x in values if _is_number(x)] - has_decimal = [decimal in x for x in finite] + def is_number_with_decimal(x): + return re.match(number_regex, x) is not None and decimal in x + def should_trim(values): + numbers = [x for x in values if is_number_with_decimal(x)] return ( - len(finite) > 0 - and all(has_decimal) - and all(x.endswith("0") for x in finite) - and not (any(("e" in x) or ("E" in x) for x in finite)) + len(numbers) > 0 + and all(x.endswith("0") for x in numbers) + and not any(("e" in x) or ("E" in x) for x in numbers) ) - while _cond(trimmed): - trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] + while should_trim(trimmed): + trimmed = [ + x[:-1] if is_number_with_decimal(x) and x.endswith("0") else x + for x in trimmed + ] # leave one 0 after the decimal points if need be. - return [x + "0" if x.endswith(decimal) and _is_number(x) else x for x in trimmed] + result = [ + x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x + for x in trimmed + ] + return result def _has_names(index: Index) -> bool: From ba3ac468479edae5eb9db99771924d68099c026b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 20:31:05 -0500 Subject: [PATCH 13/21] Simplify --- pandas/io/formats/format.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7ce2ddc504ff9..f5d5974f27761 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1891,18 +1891,14 @@ def _trim_zeros_float( Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats - number_regex = re.compile(r"\s*[\+-]?[0-9]+[\.,]?([0-9]*)?") + number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}([0-9]*)?$") def is_number_with_decimal(x): - return re.match(number_regex, x) is not None and decimal in x + return re.match(number_regex, x) is not None def should_trim(values): numbers = [x for x in values if is_number_with_decimal(x)] - return ( - len(numbers) > 0 - and all(x.endswith("0") for x in numbers) - and not any(("e" in x) or ("E" in x) for x in numbers) - ) + return len(numbers) > 0 and all(x.endswith("0") for x in numbers) while should_trim(trimmed): trimmed = [ From f8a3216fa990d3530b2b7bf85bc0e9742bfac62a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 20:39:45 -0500 Subject: [PATCH 14/21] Fix regex --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f5d5974f27761..476844de7360e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1891,7 +1891,7 @@ def _trim_zeros_float( Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats - number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}([0-9]*)?$") + number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") def is_number_with_decimal(x): return re.match(number_regex, x) is not None From 546b44e69dc33253de29dcf7f24f8b0cf25fa07d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 22:03:42 -0500 Subject: [PATCH 15/21] Fix doc --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f8b8a2c6b6d10..64d7cd98dc421 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2654,7 +2654,7 @@ def memory_usage(self, index=True, deep=False) -> Series: Examples -------- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000).astype(t)) + >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) ... for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() @@ -2788,7 +2788,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: >>> df2_transposed 0 1 name Alice Bob - score 9.5 8 + score 9.5 8.0 employed False True kids 0 0 From 99d69059d144970a04e485afe0cf9c6261e02ffb Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 22:09:06 -0500 Subject: [PATCH 16/21] Fix --- pandas/io/formats/format.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 476844de7360e..688b398521a06 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1901,10 +1901,7 @@ def should_trim(values): return len(numbers) > 0 and all(x.endswith("0") for x in numbers) while should_trim(trimmed): - trimmed = [ - x[:-1] if is_number_with_decimal(x) and x.endswith("0") else x - for x in trimmed - ] + trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. result = [ From 319529feb9c0960a4db735921a3ad43ac04f3eeb Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 11 Oct 2020 22:32:41 -0500 Subject: [PATCH 17/21] Again --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 64d7cd98dc421..1920d0aa38539 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2689,7 +2689,7 @@ def memory_usage(self, index=True, deep=False) -> Series: int64 40000 float64 40000 complex128 80000 - object 160000 + object 180000 bool 5000 dtype: int64 From 7a477dc59865b0cc77230ff50ff3ff74efd82151 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 14 Oct 2020 09:55:14 -0500 Subject: [PATCH 18/21] Fix merge --- pandas/io/formats/format.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 688b398521a06..b346d6f8ea22a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1837,9 +1837,11 @@ def _make_fixed_width( return strings if adj is None: - adj = get_adjustment() + adjustment = get_adjustment() + else: + adjustment = adj - max_len = max(adj.len(x) for x in strings) + max_len = max(adjustment.len(x) for x in strings) if minimum is not None: max_len = max(minimum, max_len) @@ -1848,14 +1850,14 @@ def _make_fixed_width( if conf_max is not None and max_len > conf_max: max_len = conf_max - def just(x): + def just(x: str) -> str: if conf_max is not None: - if (conf_max > 3) & (adj.len(x) > max_len): + if (conf_max > 3) & (adjustment.len(x) > max_len): x = x[: max_len - 3] + "..." return x strings = [just(x) for x in strings] - result = adj.justify(strings, max_len, mode=justify) + result = adjustment.justify(strings, max_len, mode=justify) return result From 7d4452acb4a7dceaaeb8a049ca706d7507b06224 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 14 Oct 2020 20:25:13 -0500 Subject: [PATCH 19/21] Type and doc --- bisect.sh | 4 ++++ pandas/io/formats/format.py | 9 ++++++++- sample.py | 8 ++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100755 bisect.sh create mode 100644 sample.py diff --git a/bisect.sh b/bisect.sh new file mode 100755 index 0000000000000..5b2c9f77ebbc3 --- /dev/null +++ b/bisect.sh @@ -0,0 +1,4 @@ +git bisect start ${1} ${2} +git bisect run bash -c "python setup.py build_ext --inplace -j 4; / + python -m pip install -e . --no-build-isolation --no-use-pep517; / + python ${3}" diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b346d6f8ea22a..dcd91b3a12294 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1898,7 +1898,14 @@ def _trim_zeros_float( def is_number_with_decimal(x): return re.match(number_regex, x) is not None - def should_trim(values): + def should_trim(values: Union[np.ndarray, List[str]]) -> bool: + """ + Determine if an array of strings should be trimmed. + + Returns True if all numbers containing decimals (defined by the + above regular expression) within the array end in a zero, otherwise + returns False. + """ numbers = [x for x in values if is_number_with_decimal(x)] return len(numbers) > 0 and all(x.endswith("0") for x in numbers) diff --git a/sample.py b/sample.py new file mode 100644 index 0000000000000..620079d84c5eb --- /dev/null +++ b/sample.py @@ -0,0 +1,8 @@ +import pandas as pd + +df = pd.DataFrame( + {"A": ["x", "x", "y"], "B": ["a", "b", "b"], "C": [1, 1, 1]} +).set_index(["A", "B"]) +df["D"] = 2 +result = df.unstack("B").fillna(0) +assert result.notna().all().all() From 80b0103c3b2db15bee54b04c074e7db19d7d9a03 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 14 Oct 2020 20:26:32 -0500 Subject: [PATCH 20/21] Oops --- bisect.sh | 4 ---- sample.py | 8 -------- 2 files changed, 12 deletions(-) delete mode 100755 bisect.sh delete mode 100644 sample.py diff --git a/bisect.sh b/bisect.sh deleted file mode 100755 index 5b2c9f77ebbc3..0000000000000 --- a/bisect.sh +++ /dev/null @@ -1,4 +0,0 @@ -git bisect start ${1} ${2} -git bisect run bash -c "python setup.py build_ext --inplace -j 4; / - python -m pip install -e . --no-build-isolation --no-use-pep517; / - python ${3}" diff --git a/sample.py b/sample.py deleted file mode 100644 index 620079d84c5eb..0000000000000 --- a/sample.py +++ /dev/null @@ -1,8 +0,0 @@ -import pandas as pd - -df = pd.DataFrame( - {"A": ["x", "x", "y"], "B": ["a", "b", "b"], "C": [1, 1, 1]} -).set_index(["A", "B"]) -df["D"] = 2 -result = df.unstack("B").fillna(0) -assert result.notna().all().all() From 8fd5a9fa2549c0b26f50bbaa83bdaa79aa1e3ffe Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 15 Oct 2020 13:50:30 -0500 Subject: [PATCH 21/21] Add tests --- pandas/tests/io/formats/test_format.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index eb30f5a344eae..78cb8ccc05077 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3434,6 +3434,13 @@ def test_format_remove_leading_space_dataframe(input_array, expected): assert df == expected +def test_to_string_complex_number_trims_zeros(): + s = pd.Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) + result = s.to_string() + expected = "0 1.00+1.00j\n1 1.00+1.00j\n2 1.05+1.00j" + assert result == expected + + def test_nullable_float_to_string(float_ea_dtype): # https://github.com/pandas-dev/pandas/issues/36775 dtype = float_ea_dtype @@ -3445,8 +3452,12 @@ def test_nullable_float_to_string(float_ea_dtype): assert result == expected -def test_to_string_complex_number_trims_zeros(): - s = pd.Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) +def test_nullable_int_to_string(any_nullable_int_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = any_nullable_int_dtype + s = pd.Series([0, 1, None], dtype=dtype) result = s.to_string() - expected = "0 1.00+1.00j\n1 1.00+1.00j\n2 1.05+1.00j" + expected = """0 0 +1 1 +2 """ assert result == expected