diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da2a9bdada469..2b437734a451a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -673,6 +673,7 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`) - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`) - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 60ee73ef6b43f..9dfa4a9486558 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -353,10 +353,9 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): float64_t[:, ::1] result uint8_t[:, :] mask int64_t nobs = 0 - float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy + float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val N, K = (mat).shape - if minp is None: minpv = 1 else: @@ -389,8 +388,14 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): else: divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + # clip `covxy / divisor` to ensure coeff is within bounds if divisor != 0: - result[xi, yi] = result[yi, xi] = covxy / divisor + val = covxy / divisor + if val > 1.0: + val = 1.0 + elif val < -1.0: + val = -1.0 + result[xi, yi] = result[yi, xi] = val else: result[xi, yi] = result[yi, xi] = NaN diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index d5e94382b8314..6dfbc325aafa4 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -485,3 +485,15 @@ def test_corrwith_min_periods_boolean(self): result = df_bool.corrwith(ser_bool, min_periods=3) expected = Series([0.57735, 0.57735], index=["A", "B"]) tm.assert_series_equal(result, expected) + + def test_corr_within_bounds(self): + df1 = DataFrame({"x": [0, 1], "y": [1.35951, 1.3595100000000007]}) + result1 = df1.corr().max().max() + expected1 = 1.0 + tm.assert_equal(result1, expected1) + + rng = np.random.default_rng(seed=42) + df2 = DataFrame(rng.random((100, 4))) + corr_matrix = df2.corr() + assert corr_matrix.min().min() >= -1.0 + assert corr_matrix.max().max() <= 1.0