From c43d4e3bf3fbe6ee0a472042b6382bd946dcbce3 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 13 Feb 2024 22:38:26 +0000 Subject: [PATCH 1/7] feat: add `DataFrame.corr()` method --- bigframes/core/blocks.py | 33 +++++++++++++++++ bigframes/dataframe.py | 21 +++++++++++ tests/system/small/test_dataframe.py | 29 +++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 35 +++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 2 +- 5 files changed, 119 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 34df7231cc..ee6871731b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1089,6 +1089,39 @@ def summarize( labels = self._get_labels_for_columns(column_ids) return Block(expr, column_labels=labels, index_columns=[label_col_id]) + def corr(self): + """Returns a block object to compute the self-correlation on this block.""" + aggregations = [ + ( + ex.BinaryAggregation( + agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col) + ), + f"{left_col}-{right_col}", + ) + for left_col in self.value_columns + for right_col in self.value_columns + ] + expr = self.expr.aggregate(aggregations) + + label_col_id = guid.generate_guid() + input_count = len(self.value_columns) + unpivot_columns = tuple( + ( + guid.generate_guid(), + tuple(expr.column_ids[input_count * i : input_count * (i + 1)]), + ) + for i in range(input_count) + ) + labels = self._get_labels_for_columns(self.value_columns) + + expr = expr.unpivot( + row_labels=labels, + index_col_ids=[label_col_id], + unpivot_columns=unpivot_columns, + ) + + return Block(expr, column_labels=labels, index_columns=[label_col_id]) + def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]: """ Gets a standard set of stats to preemptively fetch for a column if diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9db567a497..6f30e546bd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py 
@@ -1017,6 +1017,27 @@ def combine( def combine_first(self, other: DataFrame): return self._apply_dataframe_binop(other, ops.fillna_op) + def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFrame: + if method != "pearson": + raise NotImplementedError( + f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}" + ) + if min_periods: + raise NotImplementedError( + f"min_periods not yet supported. {constants.FEEDBACK_LINK}" + ) + # TODO(chelsealin): Support non-numeric columns correlation. + if not numeric_only: + raise NotImplementedError( + f"Only numeric columns' correlation is currently supported. {constants.FEEDBACK_LINK}" + ) + if len(self.columns) > 30: + raise NotImplementedError( + f"Only work with dataframes containing fewer than 30 columns. Current: {self.columns}. {constants.FEEDBACK_LINK}" + ) + # TODO(chelsealin): Support multi-index dataframes' correlation. + return DataFrame(self._block.corr()) + def to_pandas( self, max_download_size: Optional[int] = None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3d31253021..32ceb68539 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1783,6 +1783,35 @@ def test_combine_first( pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) +def test_corr_w_numeric_only(scalars_dfs): + columns = ["int64_too", "int64_col", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[columns].corr(numeric_only=True).to_pandas() + pd_result = scalars_pandas_df[columns].corr(numeric_only=True) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. 
+ pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_corr_w_invalid_parameters(scalars_dfs): + columns = ["int64_too", "int64_col", "float64_col"] + scalars_df, _ = scalars_dfs + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(method="kendall") + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(min_periods=1) + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(numeric_only=False) + + @pytest.mark.parametrize( ("op"), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 05f4167838..1ea8d78973 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2805,6 +2805,41 @@ def combine_first(self, other) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr(self, method, min_periods, numeric_only) -> DataFrame: + """ + Compute pairwise correlation of columns, excluding NA/null values. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600], + ... 'C': [0.8, 0.4, 0.9]}) + >>> df.corr(numeric_only=True) + >>> df + A B C + A 1.0 1.0 0.188982 + B 1.0 1.0 0.188982 + C 0.188982 0.188982 1.0 + + [3 rows x 3 columns] + + Args: + method (string, default "pearson"): + Correlation method to use - currently only "pearson" is supported. + min_periods (int, default None): + The minimum number of observations needed to return a result. Non-default values + are not yet supported, so a result will be returned for at least two observations. + numeric_only(bool, default False): + Include only float, int or boolean data. - currently numeric only is supported + + Returns: + DataFrame: Correlation matrix. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None ) -> DataFrame: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b203471606..6c01a6dd0c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -840,7 +840,7 @@ def corr(self, other, method="pearson", min_periods=None) -> float: float: Will return NaN if there are fewer than two numeric pairs, either series has a variance or covariance of zero, or any input value is infinite. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cov( self, From bdcfabc21621ca6bec3b2e4d2b36e5e880b3cdc0 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 14 Feb 2024 00:49:34 +0000 Subject: [PATCH 2/7] support multi-indices --- bigframes/core/blocks.py | 17 ++++++++++++----- tests/system/small/test_multiindex.py | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ee6871731b..9408809f48 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -102,11 +102,11 @@ def __init__( ): """Construct a block object, will create default index if no index columns specified.""" index_columns = list(index_columns) - if index_labels: + if index_labels is not None: index_labels = list(index_labels) if len(index_labels) != len(index_columns): raise ValueError( - "'index_columns' and 'index_labels' must have equal length" + f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length" ) if len(index_columns) == 0: new_index_col_id = guid.generate_guid() @@ -1103,7 +1103,9 @@ def corr(self): ] expr = self.expr.aggregate(aggregations) - label_col_id = 
guid.generate_guid() + index_col_ids = [ + guid.generate_guid() for i in range(self.column_labels.nlevels) + ] input_count = len(self.value_columns) unpivot_columns = tuple( ( @@ -1116,11 +1118,16 @@ def corr(self): expr = expr.unpivot( row_labels=labels, - index_col_ids=[label_col_id], + index_col_ids=index_col_ids, unpivot_columns=unpivot_columns, ) - return Block(expr, column_labels=labels, index_columns=[label_col_id]) + return Block( + expr, + column_labels=self.column_labels, + index_columns=index_col_ids, + index_labels=self.column_labels.names, + ) def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]: """ diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index c5e8b45b8e..e0b9164315 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -880,6 +880,27 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) +def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "float64_col", "int64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2])) + + bf = scalars_df_index[columns].copy() + bf.columns = multi_columns + + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf.corr(numeric_only=True).to_pandas() + pd_result = pd_df.corr(numeric_only=True) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. 
+ pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("index_names",), [ From 57dc1cf68c872d75ee9bc689068ce16dc530b7f4 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 14 Feb 2024 21:54:55 +0000 Subject: [PATCH 3/7] fix mypy --- bigframes/core/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9408809f48..993f2caa47 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1929,7 +1929,7 @@ def to_pandas(self) -> pd.Index: df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index - index.names = list(self._block._index_labels) + index.names = list(self._block._index_labels) # type:ignore return index def resolve_level(self, level: LevelsType) -> typing.Sequence[str]: From ec810fde06e98edb1430364379b192bdacc08e5b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 14 Feb 2024 23:29:08 +0000 Subject: [PATCH 4/7] support non-numeric col --- bigframes/dataframe.py | 16 ++++++++-------- tests/system/small/test_dataframe.py | 25 ++++++++++++++++++------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6f30e546bd..ccbf68ebb5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1026,17 +1026,17 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr raise NotImplementedError( f"min_periods not yet supported. {constants.FEEDBACK_LINK}" ) - # TODO(chelsealin): Support non-numeric columns correlation. - if not numeric_only: - raise NotImplementedError( - f"Only numeric columns' correlation is currently supported. {constants.FEEDBACK_LINK}" - ) if len(self.columns) > 30: raise NotImplementedError( - f"Only work with dataframes containing fewer than 30 columns. Current: {self.columns}. 
{constants.FEEDBACK_LINK}" + f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}" ) - # TODO(chelsealin): Support multi-index dataframes' correlation. - return DataFrame(self._block.corr()) + + if not numeric_only: + frame = self._raise_on_non_numeric("corr") + else: + frame = self._drop_non_numeric() + + return DataFrame(frame._block.corr()) def to_pandas( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 32ceb68539..1b4594766e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1783,12 +1783,26 @@ def test_combine_first( pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) -def test_corr_w_numeric_only(scalars_dfs): - columns = ["int64_too", "int64_col", "float64_col"] +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col", "numeric_col"], True), + (["bool_col", "int64_col", "float64_col", "numeric_col"], False), + (["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_corr_w_numeric_only(scalars_dfs, columns, numeric_only): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].corr(numeric_only=True).to_pandas() - pd_result = scalars_pandas_df[columns].corr(numeric_only=True) + bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only) # BigFrames and Pandas differ in their data type handling: # - Column types: BigFrames uses Float64, Pandas uses float64. 
@@ -1808,9 +1822,6 @@ def test_corr_w_invalid_parameters(scalars_dfs): with pytest.raises(NotImplementedError): scalars_df[columns].corr(min_periods=1) - with pytest.raises(NotImplementedError): - scalars_df[columns].corr(numeric_only=False) - @pytest.mark.parametrize( ("op"), From f63d7f511fd5f72a21fa3237c5c711e98c74b211 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 15 Feb 2024 19:19:35 +0000 Subject: [PATCH 5/7] fix doc --- third_party/bigframes_vendored/pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 1ea8d78973..36221e92f0 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2833,7 +2833,7 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: The minimum number of observations needed to return a result. Non-default values are not yet supported, so a result will be returned for at least two observations. numeric_only(bool, default False): - Include only float, int or boolean data. - currently numeric only is supported + Include only float, int, boolean, decimal data. Returns: DataFrame: Correlation matrix. 
From d7b741cdb741bf2124fda69c6c964516a27bef03 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 16 Feb 2024 01:41:39 +0000 Subject: [PATCH 6/7] fix system 3.9 --- tests/system/small/test_dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1b4594766e..8f75534fc6 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1786,11 +1786,11 @@ def test_combine_first( @pytest.mark.parametrize( ("columns", "numeric_only"), [ - (["bool_col", "int64_col", "float64_col", "numeric_col"], True), - (["bool_col", "int64_col", "float64_col", "numeric_col"], False), - (["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], True), + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), pytest.param( - ["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], + ["bool_col", "int64_col", "float64_col", "string_col"], False, marks=pytest.mark.xfail( raises=NotImplementedError, From 860a3423ee26f05fb20addbe0c760616043b86ed Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 16 Feb 2024 03:59:21 +0000 Subject: [PATCH 7/7] fix doctest --- third_party/bigframes_vendored/pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 36221e92f0..84d2aa7fcb 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2818,7 +2818,6 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: ... 'B': [400, 500, 600], ... 'C': [0.8, 0.4, 0.9]}) >>> df.corr(numeric_only=True) - >>> df A B C A 1.0 1.0 0.188982 B 1.0 1.0 0.188982