From c43d4e3bf3fbe6ee0a472042b6382bd946dcbce3 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 13 Feb 2024 22:38:26 +0000
Subject: [PATCH 1/7] feat: add `DataFrame.corr()` method

---
 bigframes/core/blocks.py                      | 33 +++++++++++++++++
 bigframes/dataframe.py                        | 21 +++++++++++
 tests/system/small/test_dataframe.py          | 29 +++++++++++++++
 .../bigframes_vendored/pandas/core/frame.py   | 35 +++++++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  |  2 +-
 5 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 34df7231cc..ee6871731b 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -1089,6 +1089,39 @@ def summarize(
         labels = self._get_labels_for_columns(column_ids)
         return Block(expr, column_labels=labels, index_columns=[label_col_id])
 
+    def corr(self):
+        """Returns a block object to compute the self-correlation on this block."""
+        aggregations = [
+            (
+                ex.BinaryAggregation(
+                    agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col)
+                ),
+                f"{left_col}-{right_col}",
+            )
+            for left_col in self.value_columns
+            for right_col in self.value_columns
+        ]
+        expr = self.expr.aggregate(aggregations)
+
+        label_col_id = guid.generate_guid()
+        input_count = len(self.value_columns)
+        unpivot_columns = tuple(
+            (
+                guid.generate_guid(),
+                tuple(expr.column_ids[input_count * i : input_count * (i + 1)]),
+            )
+            for i in range(input_count)
+        )
+        labels = self._get_labels_for_columns(self.value_columns)
+
+        expr = expr.unpivot(
+            row_labels=labels,
+            index_col_ids=[label_col_id],
+            unpivot_columns=unpivot_columns,
+        )
+
+        return Block(expr, column_labels=labels, index_columns=[label_col_id])
+
     def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]:
         """
         Gets a standard set of stats to preemptively fetch for a column if
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 9db567a497..6f30e546bd 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1017,6 +1017,27 @@ def combine(
     def combine_first(self, other: DataFrame):
         return self._apply_dataframe_binop(other, ops.fillna_op)
 
+    def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFrame:
+        if method != "pearson":
+            raise NotImplementedError(
+                f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}"
+            )
+        if min_periods:
+            raise NotImplementedError(
+                f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
+            )
+        # TODO(chelsealin): Support non-numeric columns correlation.
+        if not numeric_only:
+            raise NotImplementedError(
+                f"Only numeric columns' correlation is currently supported. {constants.FEEDBACK_LINK}"
+            )
+        if len(self.columns) > 30:
+            raise NotImplementedError(
+                f"Only work with dataframes containing fewer than 30 columns. Current: {self.columns}. {constants.FEEDBACK_LINK}"
+            )
+        # TODO(chelsealin): Support multi-index dataframes' correlation.
+        return DataFrame(self._block.corr())
+
     def to_pandas(
         self,
         max_download_size: Optional[int] = None,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 3d31253021..32ceb68539 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1783,6 +1783,35 @@ def test_combine_first(
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
 
+def test_corr_w_numeric_only(scalars_dfs):
+    columns = ["int64_too", "int64_col", "float64_col"]
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df[columns].corr(numeric_only=True).to_pandas()
+    pd_result = scalars_pandas_df[columns].corr(numeric_only=True)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_corr_w_invalid_parameters(scalars_dfs):
+    columns = ["int64_too", "int64_col", "float64_col"]
+    scalars_df, _ = scalars_dfs
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(method="kendall")
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(min_periods=1)
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(numeric_only=False)
+
+
 @pytest.mark.parametrize(
     ("op"),
     [
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 05f4167838..1ea8d78973 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2805,6 +2805,41 @@ def combine_first(self, other) -> DataFrame:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def corr(self, method, min_periods, numeric_only) -> DataFrame:
+        """
+        Compute pairwise correlation of columns, excluding NA/null values.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'A': [1, 2, 3],
+            ...                    'B': [400, 500, 600],
+            ...                    'C': [0.8, 0.4, 0.9]})
+            >>> df.corr(numeric_only=True)
+            >>> df
+                      A         B         C
+            A       1.0       1.0  0.188982
+            B       1.0       1.0  0.188982
+            C  0.188982  0.188982       1.0
+            <BLANKLINE>
+            [3 rows x 3 columns]
+
+        Args:
+            method (string, default "pearson"):
+                Correlation method to use - currently only "pearson" is supported.
+            min_periods (int, default None):
+                The minimum number of observations needed to return a result.  Non-default values
+                are not yet supported, so a result will be returned for at least two observations.
+            numeric_only(bool, default False):
+                Include only float, int or boolean data. - currently numeric only is supported
+
+        Returns:
+            DataFrame:  Correlation matrix.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def update(
         self, other, join: str = "left", overwrite: bool = True, filter_func=None
     ) -> DataFrame:
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index b203471606..6c01a6dd0c 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -840,7 +840,7 @@ def corr(self, other, method="pearson", min_periods=None) -> float:
             float:  Will return NaN if there are fewer than two numeric pairs, either series has a
                 variance or covariance of zero, or any input value is infinite.
         """
-        raise NotImplementedError("abstract method")
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     def cov(
         self,

From bdcfabc21621ca6bec3b2e4d2b36e5e880b3cdc0 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Wed, 14 Feb 2024 00:49:34 +0000
Subject: [PATCH 2/7] support multi-indices

---
 bigframes/core/blocks.py              | 17 ++++++++++++-----
 tests/system/small/test_multiindex.py | 21 +++++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index ee6871731b..9408809f48 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -102,11 +102,11 @@ def __init__(
     ):
         """Construct a block object, will create default index if no index columns specified."""
         index_columns = list(index_columns)
-        if index_labels:
+        if index_labels is not None:
             index_labels = list(index_labels)
             if len(index_labels) != len(index_columns):
                 raise ValueError(
-                    "'index_columns' and 'index_labels' must have equal length"
+                    f"'index_columns' (size {len(index_columns)}) and 'index_labels' (size {len(index_labels)}) must have equal length"
                 )
         if len(index_columns) == 0:
             new_index_col_id = guid.generate_guid()
@@ -1103,7 +1103,9 @@ def corr(self):
         ]
         expr = self.expr.aggregate(aggregations)
 
-        label_col_id = guid.generate_guid()
+        index_col_ids = [
+            guid.generate_guid() for i in range(self.column_labels.nlevels)
+        ]
         input_count = len(self.value_columns)
         unpivot_columns = tuple(
             (
@@ -1116,11 +1118,16 @@ def corr(self):
 
         expr = expr.unpivot(
             row_labels=labels,
-            index_col_ids=[label_col_id],
+            index_col_ids=index_col_ids,
             unpivot_columns=unpivot_columns,
         )
 
-        return Block(expr, column_labels=labels, index_columns=[label_col_id])
+        return Block(
+            expr,
+            column_labels=self.column_labels,
+            index_columns=index_col_ids,
+            index_labels=self.column_labels.names,
+        )
 
     def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]:
         """
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index c5e8b45b8e..e0b9164315 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -880,6 +880,27 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index
     pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
 
+def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
+    columns = ["int64_too", "float64_col", "int64_col"]
+    multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2]))
+
+    bf = scalars_df_index[columns].copy()
+    bf.columns = multi_columns
+
+    pd_df = scalars_pandas_df_index[columns].copy()
+    pd_df.columns = multi_columns
+
+    bf_result = bf.corr(numeric_only=True).to_pandas()
+    pd_result = pd_df.corr(numeric_only=True)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pandas.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     ("index_names",),
     [

From 57dc1cf68c872d75ee9bc689068ce16dc530b7f4 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Wed, 14 Feb 2024 21:54:55 +0000
Subject: [PATCH 3/7] fix mypy

---
 bigframes/core/blocks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 9408809f48..993f2caa47 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -1929,7 +1929,7 @@ def to_pandas(self) -> pd.Index:
         df = expr.session._rows_to_dataframe(results, dtypes)
         df = df.set_index(index_columns)
         index = df.index
-        index.names = list(self._block._index_labels)
+        index.names = list(self._block._index_labels)  # type:ignore
         return index
 
     def resolve_level(self, level: LevelsType) -> typing.Sequence[str]:

From ec810fde06e98edb1430364379b192bdacc08e5b Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Wed, 14 Feb 2024 23:29:08 +0000
Subject: [PATCH 4/7] support non-numeric col

---
 bigframes/dataframe.py               | 16 ++++++++--------
 tests/system/small/test_dataframe.py | 25 ++++++++++++++++++-------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 6f30e546bd..ccbf68ebb5 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1026,17 +1026,17 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
             raise NotImplementedError(
                 f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
             )
-        # TODO(chelsealin): Support non-numeric columns correlation.
-        if not numeric_only:
-            raise NotImplementedError(
-                f"Only numeric columns' correlation is currently supported. {constants.FEEDBACK_LINK}"
-            )
         if len(self.columns) > 30:
             raise NotImplementedError(
-                f"Only work with dataframes containing fewer than 30 columns. Current: {self.columns}. {constants.FEEDBACK_LINK}"
+                f"Only works with dataframes containing at most 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
             )
-        # TODO(chelsealin): Support multi-index dataframes' correlation.
-        return DataFrame(self._block.corr())
+
+        if not numeric_only:
+            frame = self._raise_on_non_numeric("corr")
+        else:
+            frame = self._drop_non_numeric()
+
+        return DataFrame(frame._block.corr())
 
     def to_pandas(
         self,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 32ceb68539..1b4594766e 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1783,12 +1783,26 @@ def test_combine_first(
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
 
-def test_corr_w_numeric_only(scalars_dfs):
-    columns = ["int64_too", "int64_col", "float64_col"]
+@pytest.mark.parametrize(
+    ("columns", "numeric_only"),
+    [
+        (["bool_col", "int64_col", "float64_col", "numeric_col"], True),
+        (["bool_col", "int64_col", "float64_col", "numeric_col"], False),
+        (["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], True),
+        pytest.param(
+            ["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"],
+            False,
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+    ],
+)
+def test_corr_w_numeric_only(scalars_dfs, columns, numeric_only):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    bf_result = scalars_df[columns].corr(numeric_only=True).to_pandas()
-    pd_result = scalars_pandas_df[columns].corr(numeric_only=True)
+    bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas()
+    pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only)
 
     # BigFrames and Pandas differ in their data type handling:
     # - Column types: BigFrames uses Float64, Pandas uses float64.
@@ -1808,9 +1822,6 @@ def test_corr_w_invalid_parameters(scalars_dfs):
     with pytest.raises(NotImplementedError):
         scalars_df[columns].corr(min_periods=1)
 
-    with pytest.raises(NotImplementedError):
-        scalars_df[columns].corr(numeric_only=False)
-
 
 @pytest.mark.parametrize(
     ("op"),

From f63d7f511fd5f72a21fa3237c5c711e98c74b211 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Thu, 15 Feb 2024 19:19:35 +0000
Subject: [PATCH 5/7] fix doc

---
 third_party/bigframes_vendored/pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 1ea8d78973..36221e92f0 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2833,7 +2833,7 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
                 The minimum number of observations needed to return a result.  Non-default values
                 are not yet supported, so a result will be returned for at least two observations.
             numeric_only(bool, default False):
-                Include only float, int or boolean data. - currently numeric only is supported
+                Include only float, int, boolean, or decimal data.
 
         Returns:
             DataFrame:  Correlation matrix.

From d7b741cdb741bf2124fda69c6c964516a27bef03 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Fri, 16 Feb 2024 01:41:39 +0000
Subject: [PATCH 6/7] fix system 3.9

---
 tests/system/small/test_dataframe.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 1b4594766e..8f75534fc6 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1786,11 +1786,11 @@ def test_combine_first(
 @pytest.mark.parametrize(
     ("columns", "numeric_only"),
     [
-        (["bool_col", "int64_col", "float64_col", "numeric_col"], True),
-        (["bool_col", "int64_col", "float64_col", "numeric_col"], False),
-        (["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], True),
+        (["bool_col", "int64_col", "float64_col"], True),
+        (["bool_col", "int64_col", "float64_col"], False),
+        (["bool_col", "int64_col", "float64_col", "string_col"], True),
         pytest.param(
-            ["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"],
+            ["bool_col", "int64_col", "float64_col", "string_col"],
             False,
             marks=pytest.mark.xfail(
                 raises=NotImplementedError,

From 860a3423ee26f05fb20addbe0c760616043b86ed Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Fri, 16 Feb 2024 03:59:21 +0000
Subject: [PATCH 7/7] fix doctest

---
 third_party/bigframes_vendored/pandas/core/frame.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 36221e92f0..84d2aa7fcb 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2818,7 +2818,6 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
             ...                    'B': [400, 500, 600],
             ...                    'C': [0.8, 0.4, 0.9]})
             >>> df.corr(numeric_only=True)
-            >>> df
                       A         B         C
             A       1.0       1.0  0.188982
             B       1.0       1.0  0.188982