From d63760a40e62c651fa35463dae86900a6969e6fd Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Thu, 20 Mar 2025 08:20:19 -0400 Subject: [PATCH 1/8] clip correlation coefficient between -1 and 1 --- pandas/core/frame.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8f65277f660f7..079755e9a8a99 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11233,6 +11233,9 @@ def corr( f"'{method}' was supplied" ) + # clip coefficient to ensure it is within theoretical bounds + correl = np.clip(correl, -1, 1) + result = self._constructor(correl, index=idx, columns=cols, copy=False) return result.__finalize__(self, method="corr") From 1ffedbe76be93f3a893fd933061e160fd15c4c3e Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Thu, 20 Mar 2025 09:06:41 -0400 Subject: [PATCH 2/8] Added test to check if corr within bounds --- pandas/tests/frame/methods/test_cov_corr.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index d5e94382b8314..befc7ec3757e3 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -485,3 +485,15 @@ def test_corrwith_min_periods_boolean(self): result = df_bool.corrwith(ser_bool, min_periods=3) expected = Series([0.57735, 0.57735], index=["A", "B"]) tm.assert_series_equal(result, expected) + + def test_corr_within_bounds(self): + df1 = DataFrame({"x": [0, 1], "y": [1.35951, 1.3595100000000007]}) + result1 = df1.corr().max().max() + expected1 = 1.0 + tm.assert_equal(result1, expected1) + + rng = np.random.default_rng(seed=42) + df2 = DataFrame(rng.random(100, 4)) + corr_matrix = df2.corr() + assert corr_matrix.min().min() >= -1.0 + assert corr_matrix.max().max() <= 1.0 From a997140a4b318e53fcaf9b61e1235c92b85ee4c1 Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Thu, 20 Mar 2025 09:11:52 -0400 Subject: [PATCH 3/8] Added tuple to 
mistyped parameter --- pandas/tests/frame/methods/test_cov_corr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index befc7ec3757e3..6dfbc325aafa4 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -493,7 +493,7 @@ def test_corr_within_bounds(self): tm.assert_equal(result1, expected1) rng = np.random.default_rng(seed=42) - df2 = DataFrame(rng.random(100, 4)) + df2 = DataFrame(rng.random((100, 4))) corr_matrix = df2.corr() assert corr_matrix.min().min() >= -1.0 assert corr_matrix.max().max() <= 1.0 From 7f2fdaa953b939495ac9c4b3709273cde03971b7 Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Thu, 20 Mar 2025 12:42:17 -0400 Subject: [PATCH 4/8] Transferred np.clip to algos.nancorr --- pandas/_libs/algos.pyx | 3 +++ pandas/core/frame.py | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 60ee73ef6b43f..a81e4b7d92d55 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -394,6 +394,9 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): else: result[xi, yi] = result[yi, xi] = NaN + # clip coefficient to ensure it is within theoretical bounds + result = np.clip(result, -1, 1) + return result.base # ---------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 079755e9a8a99..8f65277f660f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11233,9 +11233,6 @@ def corr( f"'{method}' was supplied" ) - # clip coefficient to ensure it is within theoretical bounds - correl = np.clip(correl, -1, 1) - result = self._constructor(correl, index=idx, columns=cols, copy=False) return result.__finalize__(self, method="corr") From 34eb7016da21919827ee1f3c1ea20dff7f472a89 Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Thu, 20
Mar 2025 13:12:22 -0400 Subject: [PATCH 5/8] Clip covxy / divisor instead of result --- pandas/_libs/algos.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a81e4b7d92d55..a7e791b37eea0 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -389,14 +389,13 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): else: divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + # clip `covxy / divisor` to ensure coeff is within bounds if divisor != 0: - result[xi, yi] = result[yi, xi] = covxy / divisor + val = np.clip(covxy / divisor, -1, 1) + result[xi, yi] = result[yi, xi] = val else: result[xi, yi] = result[yi, xi] = NaN - # clip coefficient to ensure it is within theoretical bounds - result = np.clip(result, -1, 1) - return result.base # ---------------------------------------------------------------------- From fc5ccae13cf6fd08d51f64ab8499b753980e2b8c Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Thu, 20 Mar 2025 13:25:04 -0400 Subject: [PATCH 6/8] Clip covxy / divisor within nogil --- pandas/_libs/algos.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a7e791b37eea0..9dfa4a9486558 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -353,10 +353,9 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): float64_t[:, ::1] result uint8_t[:, :] mask int64_t nobs = 0 - float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy + float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val N, K = (mat).shape - if minp is None: minpv = 1 else: @@ -391,7 +390,11 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): # clip `covxy / divisor` to ensure coeff is within bounds if divisor != 0: - val = np.clip(covxy / divisor, -1, 1) + val = covxy / divisor + if val > 1.0: + val = 1.0 + elif val < -1.0: + val =
-1.0 result[xi, yi] = result[yi, xi] = val else: result[xi, yi] = result[yi, xi] = NaN From ff2adc6073fd6d6f62c57981e74bc860d225c051 Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Mon, 24 Mar 2025 22:48:36 -0400 Subject: [PATCH 7/8] Added whatsnew note --- doc/source/whatsnew/v3.0.0.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da2a9bdada469..c0fa30b06133d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -151,10 +151,23 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) -.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_300.notable_bug_fixes.corr_bounded: -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +Improved handling of numerical precision errors in ``DataFrame.corr`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Fixed an issue in :meth:`.DataFrame.corr` where numerical precision errors could cause correlation values to +exceed 1.0 when dealing with data having very small variances. Correlation coefficients are now properly +bounded to the valid range of [-1.0, 1.0] (:issue:`61120`). + +Previously, the code below would return ``1.1547005383792517``. It now returns ``1.0`` as expected. + +.. ipython:: python + + data = pd.DataFrame(dict( + x=[0, 1], + y=[1.35951, 1.3595100000000007] + )) + data.corr().max().max() .. --------------------------------------------------------------------------- .. 
_whatsnew_300.api_breaking: From f0573e1878b51b6045f4015e8a11d4357e7503d0 Mon Sep 17 00:00:00 2001 From: John Hendricks Date: Tue, 25 Mar 2025 12:33:21 -0400 Subject: [PATCH 8/8] Replaced long entry with single entry --- doc/source/whatsnew/v3.0.0.rst | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c0fa30b06133d..2b437734a451a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -151,23 +151,10 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) -.. _whatsnew_300.notable_bug_fixes.corr_bounded: +.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: -Improved handling of numerical precision errors in ``DataFrame.corr`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Fixed an issue in :meth:`.DataFrame.corr` where numerical precision errors could cause correlation values to -exceed 1.0 when dealing with data having very small variances. Correlation coefficients are now properly -bounded to the valid range of [-1.0, 1.0] (:issue:`61120`). - -Previously, the code below would return ``1.1547005383792517``. It now returns ``1.0`` as expected. - -.. ipython:: python - - data = pd.DataFrame(dict( - x=[0, 1], - y=[1.35951, 1.3595100000000007] - )) - data.corr().max().max() +notable_bug_fix2 +^^^^^^^^^^^^^^^^ .. --------------------------------------------------------------------------- .. 
_whatsnew_300.api_breaking: @@ -686,6 +673,7 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`) - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`) - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)