Skip to content

Commit ec810fd

Browse files
committed
support non-numeric col
1 parent 57dc1cf commit ec810fd

File tree

2 files changed

+26
-15
lines changed

2 files changed

+26
-15
lines changed

bigframes/dataframe.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1026,17 +1026,17 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
10261026
raise NotImplementedError(
10271027
f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
10281028
)
1029-
# TODO(chelsealin): Support non-numeric columns correlation.
1030-
if not numeric_only:
1031-
raise NotImplementedError(
1032-
f"Only numeric columns' correlation is currently supported. {constants.FEEDBACK_LINK}"
1033-
)
10341029
if len(self.columns) > 30:
10351030
raise NotImplementedError(
1036-
f"Only work with dataframes containing fewer than 30 columns. Current: {self.columns}. {constants.FEEDBACK_LINK}"
1031+
f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
10371032
)
1038-
# TODO(chelsealin): Support multi-index dataframes' correlation.
1039-
return DataFrame(self._block.corr())
1033+
1034+
if not numeric_only:
1035+
frame = self._raise_on_non_numeric("corr")
1036+
else:
1037+
frame = self._drop_non_numeric()
1038+
1039+
return DataFrame(frame._block.corr())
10401040

10411041
def to_pandas(
10421042
self,

tests/system/small/test_dataframe.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1783,12 +1783,26 @@ def test_combine_first(
17831783
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
17841784

17851785

1786-
def test_corr_w_numeric_only(scalars_dfs):
1787-
columns = ["int64_too", "int64_col", "float64_col"]
1786+
@pytest.mark.parametrize(
1787+
("columns", "numeric_only"),
1788+
[
1789+
(["bool_col", "int64_col", "float64_col", "numeric_col"], True),
1790+
(["bool_col", "int64_col", "float64_col", "numeric_col"], False),
1791+
(["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"], True),
1792+
pytest.param(
1793+
["bool_col", "int64_col", "float64_col", "numeric_col", "string_col"],
1794+
False,
1795+
marks=pytest.mark.xfail(
1796+
raises=NotImplementedError,
1797+
),
1798+
),
1799+
],
1800+
)
1801+
def test_corr_w_numeric_only(scalars_dfs, columns, numeric_only):
17881802
scalars_df, scalars_pandas_df = scalars_dfs
17891803

1790-
bf_result = scalars_df[columns].corr(numeric_only=True).to_pandas()
1791-
pd_result = scalars_pandas_df[columns].corr(numeric_only=True)
1804+
bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas()
1805+
pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only)
17921806

17931807
# BigFrames and Pandas differ in their data type handling:
17941808
# - Column types: BigFrames uses Float64, Pandas uses float64.
@@ -1808,9 +1822,6 @@ def test_corr_w_invalid_parameters(scalars_dfs):
18081822
with pytest.raises(NotImplementedError):
18091823
scalars_df[columns].corr(min_periods=1)
18101824

1811-
with pytest.raises(NotImplementedError):
1812-
scalars_df[columns].corr(numeric_only=False)
1813-
18141825

18151826
@pytest.mark.parametrize(
18161827
("op"),

0 commit comments

Comments
 (0)