From 72427e6ba014e7ba785f2f9b968bad0efb344586 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 12 Jan 2021 08:43:29 -0800 Subject: [PATCH 1/7] PERF: cythonize kendall correlation --- pandas/_libs/algos.pyx | 71 ++++++++++++++++++++++++++++++++++++++++++ pandas/core/frame.py | 4 ++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 76bfb001cea81..057891a470398 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -393,6 +393,77 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr return result +# ---------------------------------------------------------------------- +# Kendall correlation + + +@cython.boundscheck(False) +@cython.wraparound(False) +def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: + cdef: + Py_ssize_t i, j, xi, yi, N, K + ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=2] ranked_mat + ndarray[uint8_t, ndim=2] mask + float64_t currj + ndarray[uint8_t, ndim=1] valid + ndarray[float64_t, ndim=2] valid_cols + ndarray[float64_t, ndim=1] col + int64_t n_concordant + int64_t total_concordant = 0 + int64_t total_discordant = 0 + float64_t kendall_tau + int64_t n_obs + const int64_t[:] labels_n + + N, K = (mat).shape + + result = np.empty((K, K), dtype=np.float64) + mask = np.isfinite(mat).view(np.uint8) + + ranked_mat = np.empty((N, K), dtype=np.float64) + # For compatibility when calling rank_1d + labels_n = np.zeros(N, dtype=np.int64) + + for i in range(K): + ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) + + for xi in range(K): + for yi in range(xi + 1, K): + valid = mask[:, xi] & mask[:, yi] + if valid.sum() < minp: + result[xi, yi] = NaN + result[yi, xi] = NaN + else: + # Get columns and order second column using 1st column ranks + if not valid.all(): + valid_cols = ranked_mat[valid.nonzero()][:, [xi, yi]] + else: + valid_cols = ranked_mat[:, [xi, yi]] + # Unfortunately we have to sort here, since we can have tied indices + col = valid_cols[:, 1][valid_cols[:, 0].argsort()] + n_obs = valid_cols.shape[0] + total_concordant = 0 + total_discordant = 0 + for j in range(n_obs - 1): + currj = col[j] + # Count num concordant and discordant pairs + n_concordant = np.sum(col[j+1:]>=currj) + total_concordant += n_concordant + total_discordant += (n_obs-1-j-n_concordant) + kendall_tau = (total_concordant - total_discordant) / \ + (total_concordant + total_discordant) + result[xi, yi] = kendall_tau + result[yi, xi] = kendall_tau + + if mask[:, xi].sum() > minp: + result[xi, xi] = 1 + else: + result[xi, xi] = NaN + + return result + + # ---------------------------------------------------------------------- ctypedef fused algos_t: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e65e9302dd4d5..69734a40c84f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8458,7 +8458,9 @@ def corr(self, method="pearson", min_periods=1) -> DataFrame: correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": correl = libalgos.nancorr_spearman(mat, minp=min_periods) - elif method == "kendall" or callable(method): + elif method == "kendall": + correl = libalgos.nancorr_kendall(mat, minp=min_periods) + elif callable(method): if min_periods is None: min_periods = 1 mat = mat.T From 7ded648c0c6011c5a829c02851be715340d0089b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 13 Jan 2021 14:40:54 -0800 Subject: [PATCH 2/7] Maybe improve perf --- pandas/_libs/algos.pyx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 057891a470398..5f39a20ef276a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -407,7 +407,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra ndarray[uint8_t, ndim=2] mask float64_t currj ndarray[uint8_t, ndim=1] valid - ndarray[float64_t, ndim=2] valid_cols + ndarray[int64_t] sorted_idxs ndarray[float64_t, ndim=1] col int64_t n_concordant int64_t total_concordant = 0 @@ -419,7 +419,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra N, K = (mat).shape result = np.empty((K, K), dtype=np.float64) - mask = np.isfinite(mat).view(np.uint8) + mask = np.isfinite(mat) ranked_mat = np.empty((N, K), dtype=np.float64) # For compatibility when calling rank_1d @@ -429,6 +429,9 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) for xi in range(K): + sorted_idxs = ranked_mat[:, xi].argsort() + ranked_mat = ranked_mat[sorted_idxs] + mask = mask[sorted_idxs] for yi in range(xi + 1, K): valid = mask[:, xi] & mask[:, yi] if valid.sum() < minp: @@ -437,12 +440,11 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra else: # Get columns and order second column using 1st column ranks if not valid.all(): - valid_cols = ranked_mat[valid.nonzero()][:, [xi, yi]] + col = ranked_mat[valid.nonzero()][:, yi] else: - valid_cols = ranked_mat[:, [xi, yi]] + col = ranked_mat[:, yi] # Unfortunately we have to sort here, since we can have tied indices - col = valid_cols[:, 1][valid_cols[:, 0].argsort()] - n_obs = valid_cols.shape[0] + n_obs = col.shape[0] total_concordant = 0 total_discordant = 0 for j in range(n_obs - 1): From ef1dc872f5857a46ff4452e4fbb0e689dcc6ed3c Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 19 Jan 2021 16:01:20 -0800 Subject: [PATCH 3/7] Use count_nonzero instead of sum for perf improvement --- pandas/_libs/algos.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5f39a20ef276a..7087f56044b40 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -443,14 +443,13 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra col = ranked_mat[valid.nonzero()][:, yi] else: col = ranked_mat[:, yi] - # Unfortunately we have to sort here, since we can have tied indices n_obs = col.shape[0] total_concordant = 0 total_discordant = 0 for j in range(n_obs - 1): currj = col[j] # Count num concordant and discordant pairs - n_concordant = np.sum(col[j+1:]>=currj) + n_concordant = np.count_nonzero(col[j+1:]>=currj) total_concordant += n_concordant total_discordant += (n_obs-1-j-n_concordant) kendall_tau = (total_concordant - total_discordant) / \ From b2384bf8fe29ae16eb4095c68cc958b430337cb0 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 19 Jan 2021 19:37:07 -0800 Subject: [PATCH 4/7] More perf improvements --- pandas/_libs/algos.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7087f56044b40..c465d312ccd48 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -449,7 +449,10 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra for j in range(n_obs - 1): currj = col[j] # Count num concordant and discordant pairs - n_concordant = np.count_nonzero(col[j+1:]>=currj) + n_concordant = 0 + for k in range(j, n_obs): + if col[k] > currj: + n_concordant += 1 total_concordant += n_concordant total_discordant += (n_obs-1-j-n_concordant) kendall_tau = (total_concordant - total_discordant) / \ From 26f4771917a57abf585d1f880b22fb5feb03a227 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 19 Jan 2021 19:48:43 -0800 Subject: [PATCH 5/7] Update whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8a1544801d0cf..23afc82a0345b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -174,7 +174,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) -- +- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) .. --------------------------------------------------------------------------- From 85a33c8435bdf02309384e43ece78a50f9742437 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 20 Jan 2021 08:37:02 -0800 Subject: [PATCH 6/7] Docstring & comments --- pandas/_libs/algos.pyx | 22 ++++++++++++++++++++-- pandas/core/frame.py | 3 +-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c465d312ccd48..7dc73e0cb0fd6 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -395,13 +395,28 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr # ---------------------------------------------------------------------- # Kendall correlation - +# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient @cython.boundscheck(False) @cython.wraparound(False) def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: + """ + Perform kendall correlation on a 2d array + + Parameters + ---------- + mat : Array to compute kendall correlation on + minp : int, default 1 + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ------- + numpy.ndarray + Correlation matrix + """ cdef: - Py_ssize_t i, j, xi, yi, N, K + Py_ssize_t i, j, k, xi, yi, N, K ndarray[float64_t, ndim=2] result ndarray[float64_t, ndim=2] ranked_mat ndarray[uint8_t, ndim=2] mask @@ -455,6 +470,9 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra n_concordant += 1 total_concordant += n_concordant total_discordant += (n_obs-1-j-n_concordant) + # Note: we do total_concordant+total_discordant here which is + # equivalent to the C(n, 2), the total # of pairs, + # listed on wikipedia kendall_tau = (total_concordant - total_discordant) / \ (total_concordant + total_discordant) result[xi, yi] = kendall_tau diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c43942ee500cc..288292589e940 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8463,8 +8463,7 @@ def corr(self, method="pearson", min_periods=1) -> DataFrame: min_periods : int, optional Minimum number of observations required per pair of columns - to have a valid result. Currently only available for Pearson - and Spearman correlation. + to have a valid result. Returns ------- From 012b5a1b015e38a6948621174cdea41dadf62f2d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 20 Jan 2021 09:57:07 -0800 Subject: [PATCH 7/7] Changes from code review --- pandas/_libs/algos.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7dc73e0cb0fd6..080a84bef1e58 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -405,14 +405,15 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra Parameters ---------- - mat : Array to compute kendall correlation on + mat : np.ndarray[float64_t, ndim=2] + Array to compute kendall correlation on minp : int, default 1 Minimum number of observations required per pair of columns to have a valid result. Returns ------- - numpy.ndarray + numpy.ndarray[float64_t, ndim=2] Correlation matrix """ cdef: @@ -469,7 +470,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra if col[k] > currj: n_concordant += 1 total_concordant += n_concordant - total_discordant += (n_obs-1-j-n_concordant) + total_discordant += (n_obs - 1 - j - n_concordant) # Note: we do total_concordant+total_discordant here which is # equivalent to the C(n, 2), the total # of pairs, # listed on wikipedia