-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
REG: fix regression in df.corrwith on tied data when method is spearman #49032
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
d228d7b
e230225
059effd
6a898f0
8fd679d
3528e40
12e9232
8578581
93879df
8ae9403
2f3c952
cef9e9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10593,23 +10593,30 @@ def corrwith( | |
corrs = {} | ||
if numeric_only: | ||
cols = self.select_dtypes(include=np.number).columns | ||
ndf = self[cols].values.transpose() | ||
else: | ||
cols = self.columns | ||
ndf = self.values.transpose() | ||
k = other.values | ||
k_mask = ~other.isna() | ||
if isinstance(k, BaseMaskedArray): | ||
k = k._data | ||
Review comment: Why are we special-casing here?
Reply: Nullable (masked) arrays are not supported by `rank_1d`.
Reply: You can use the mask too in that case.
Reply: Do you mean we still cast the type, but write the rank step as follows? `libalgos.rank_1d(k, mask=nonnull_mask)`
Reply: I meant something more like `k_mask = k._mask`. |
||
if method == "pearson": | ||
for i, r in enumerate(ndf): | ||
nonnull_mask = ~np.isnan(r) & ~np.isnan(k) | ||
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ | ||
0, 1 | ||
] | ||
for col in cols: | ||
val = self[col].values | ||
nonnull_mask = ~self[col].isna() & k_mask | ||
if isinstance(val, BaseMaskedArray): | ||
val = val._data | ||
corrs[col] = np.corrcoef( | ||
val[nonnull_mask], k[nonnull_mask] | ||
)[0, 1] | ||
else: | ||
for i, r in enumerate(ndf): | ||
nonnull_mask = ~np.isnan(r) & ~np.isnan(k) | ||
corrs[cols[i]] = np.corrcoef( | ||
r[nonnull_mask].argsort().argsort(), | ||
k[nonnull_mask].argsort().argsort(), | ||
for col in cols: | ||
val = self[col].values | ||
nonnull_mask = ~self[col].isna() & k_mask | ||
if isinstance(val, BaseMaskedArray): | ||
val = val._data | ||
corrs[col] = np.corrcoef( | ||
libalgos.rank_1d(val[nonnull_mask]), | ||
libalgos.rank_1d(k[nonnull_mask]), | ||
)[0, 1] | ||
return Series(corrs) | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only): | |
expected = Series(data=corrs, index=["a", "b"]) | ||
tm.assert_series_equal(result, expected) | ||
else: | ||
with pytest.raises(TypeError, match="not supported for the input types"): | ||
with pytest.raises( | ||
TypeError, | ||
match=r"unsupported operand type\(s\) for /: 'str' and 'int'", | ||
Review comment: This error is now raised by `np.corrcoef`. |
||
): | ||
df.corrwith(s, numeric_only=numeric_only) | ||
|
||
def test_corrwith_index_intersection(self): | ||
|
@@ -406,3 +409,86 @@ def test_corrwith_kendall(self): | |
result = df.corrwith(df**2, method="kendall") | ||
expected = Series(np.ones(len(result))) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"numeric_only, ser, expected", | ||
[ | ||
( | ||
True, | ||
Series([0, 1, 1, 0]), | ||
Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), | ||
), | ||
( | ||
True, | ||
Series([0.0, 1.0, 1.0, 0.0]), | ||
Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), | ||
), | ||
( | ||
True, | ||
Series([False, True, True, False]), | ||
Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), | ||
), | ||
( | ||
False, | ||
Series([0, 1, 1, 0]), | ||
Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), | ||
), | ||
( | ||
False, | ||
Series([0.0, 1.0, 1.0, 0.0]), | ||
Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), | ||
), | ||
( | ||
False, | ||
Series([False, True, True, False]), | ||
Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), | ||
), | ||
( | ||
True, | ||
Series([0, pd.NA, 1, 0], dtype="Int64"), | ||
Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), | ||
), | ||
( | ||
True, | ||
Series([0.0, pd.NA, 1.0, 0.0], dtype="Float64"), | ||
Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), | ||
GYHHAHA marked this conversation as resolved.
Show resolved
Hide resolved
|
||
), | ||
( | ||
True, | ||
Series([False, pd.NA, True, False], dtype="boolean"), | ||
Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), | ||
), | ||
( | ||
False, | ||
Series([0, pd.NA, 1, 0], dtype="Int64"), | ||
Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), | ||
), | ||
( | ||
False, | ||
Series([0, pd.NA, 1, 0], dtype="Int64"), | ||
Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), | ||
), | ||
( | ||
False, | ||
Series([False, pd.NA, True, False], dtype="boolean"), | ||
Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), | ||
), | ||
], | ||
) | ||
def test_corrwith_spearman_with_tied_data(self, ser, numeric_only, expected): | ||
GYHHAHA marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# GH#21925 | ||
df = DataFrame( | ||
{ | ||
"A": [2, 5, 8, 9], | ||
"B": [2, np.nan, 8, 9], | ||
"C": [2, np.nan, 8, 9], | ||
"D": [0, 1, 1, 0], | ||
"E": [0, np.nan, 1, 0], | ||
"F": [0, np.nan, 1, 0], | ||
"G": [False, True, True, False], | ||
"H": [False, pd.NA, True, False], | ||
}, | ||
).astype({"C": "Int64", "F": "Float64", "H": "boolean"}) | ||
GYHHAHA marked this conversation as resolved.
Show resolved
Hide resolved
|
||
s = Series([0, 1, 1, 0]) | ||
result = df.corrwith(s, method="spearman", numeric_only=numeric_only) | ||
tm.assert_series_equal(result, expected) |
Uh oh!
There was an error while loading. Please reload this page.