From 72427e6ba014e7ba785f2f9b968bad0efb344586 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Tue, 12 Jan 2021 08:43:29 -0800
Subject: [PATCH 1/7] PERF: cythonize kendall correlation

---
 pandas/_libs/algos.pyx | 71 ++++++++++++++++++++++++++++++++++++++++++
 pandas/core/frame.py   |  4 ++-
 2 files changed, 74 insertions(+), 1 deletion(-)
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 76bfb001cea81..057891a470398 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -393,6 +393,77 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
     return result
 
 
+# ----------------------------------------------------------------------
+# Kendall correlation
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
+    cdef:
+        Py_ssize_t i, j, xi, yi, N, K
+        ndarray[float64_t, ndim=2] result
+        ndarray[float64_t, ndim=2] ranked_mat
+        ndarray[uint8_t, ndim=2] mask
+        float64_t currj
+        ndarray[uint8_t, ndim=1] valid
+        ndarray[float64_t, ndim=2] valid_cols
+        ndarray[float64_t, ndim=1] col
+        int64_t n_concordant
+        int64_t total_concordant = 0
+        int64_t total_discordant = 0
+        float64_t kendall_tau
+        int64_t n_obs
+        const int64_t[:] labels_n
+
+    N, K = (<object>mat).shape
+
+    result = np.empty((K, K), dtype=np.float64)
+    mask = np.isfinite(mat).view(np.uint8)
+
+    ranked_mat = np.empty((N, K), dtype=np.float64)
+    # For compatibility when calling rank_1d
+    labels_n = np.zeros(N, dtype=np.int64)
+
+    for i in range(K):
+        ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
+
+    for xi in range(K):
+        for yi in range(xi + 1, K):
+            valid = mask[:, xi] & mask[:, yi]
+            if valid.sum() < minp:
+                result[xi, yi] = NaN
+                result[yi, xi] = NaN
+            else:
+                # Get columns and order second column using 1st column ranks
+                if not valid.all():
+                    valid_cols = ranked_mat[valid.nonzero()][:, [xi, yi]]
+                else:
+                    valid_cols = ranked_mat[:, [xi, yi]]
+                # Unfortunately we have to sort here, since we can have tied indices
+                col = valid_cols[:, 1][valid_cols[:, 0].argsort()]
+                n_obs = valid_cols.shape[0]
+                total_concordant = 0
+                total_discordant = 0
+                for j in range(n_obs - 1):
+                    currj = col[j]
+                    # Count num concordant and discordant pairs
+                    n_concordant = np.sum(col[j+1:]>=currj)
+                    total_concordant += n_concordant
+                    total_discordant += (n_obs-1-j-n_concordant)
+                kendall_tau = (total_concordant - total_discordant) / \
+                              (total_concordant + total_discordant)
+                result[xi, yi] = kendall_tau
+                result[yi, xi] = kendall_tau
+
+        if mask[:, xi].sum() >= minp:  # inclusive, matching the pairwise check
+            result[xi, xi] = 1
+        else:
+            result[xi, xi] = NaN
+
+    return result
+
+
 # ----------------------------------------------------------------------
 
 ctypedef fused algos_t:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e65e9302dd4d5..69734a40c84f5 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8458,7 +8458,9 @@ def corr(self, method="pearson", min_periods=1) -> DataFrame:
             correl = libalgos.nancorr(mat, minp=min_periods)
         elif method == "spearman":
             correl = libalgos.nancorr_spearman(mat, minp=min_periods)
-        elif method == "kendall" or callable(method):
+        elif method == "kendall":
+            correl = libalgos.nancorr_kendall(mat, minp=min_periods)
+        elif callable(method):
             if min_periods is None:
                 min_periods = 1
             mat = mat.T

From 7ded648c0c6011c5a829c02851be715340d0089b Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 13 Jan 2021 14:40:54 -0800
Subject: [PATCH 2/7] Maybe improve perf

---
 pandas/_libs/algos.pyx | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 057891a470398..5f39a20ef276a 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -407,7 +407,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
         ndarray[uint8_t, ndim=2] mask
         float64_t currj
         ndarray[uint8_t, ndim=1] valid
-        ndarray[float64_t, ndim=2] valid_cols
+        ndarray[int64_t] sorted_idxs
         ndarray[float64_t, ndim=1] col
         int64_t n_concordant
         int64_t total_concordant = 0
@@ -419,7 +419,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
     N, K = (<object>mat).shape
 
     result = np.empty((K, K), dtype=np.float64)
-    mask = np.isfinite(mat).view(np.uint8)
+    mask = np.isfinite(mat)
 
     ranked_mat = np.empty((N, K), dtype=np.float64)
     # For compatibility when calling rank_1d
@@ -429,6 +429,9 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
         ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
 
     for xi in range(K):
+        sorted_idxs = ranked_mat[:, xi].argsort()
+        ranked_mat = ranked_mat[sorted_idxs]
+        mask = mask[sorted_idxs]
         for yi in range(xi + 1, K):
             valid = mask[:, xi] & mask[:, yi]
             if valid.sum() < minp:
@@ -437,12 +440,11 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
             else:
                 # Get columns and order second column using 1st column ranks
                 if not valid.all():
-                    valid_cols = ranked_mat[valid.nonzero()][:, [xi, yi]]
+                    col = ranked_mat[valid.nonzero()][:, yi]
                 else:
-                    valid_cols = ranked_mat[:, [xi, yi]]
+                    col = ranked_mat[:, yi]
                 # Unfortunately we have to sort here, since we can have tied indices
-                col = valid_cols[:, 1][valid_cols[:, 0].argsort()]
-                n_obs = valid_cols.shape[0]
+                n_obs = col.shape[0]
                 total_concordant = 0
                 total_discordant = 0
                 for j in range(n_obs - 1):

From ef1dc872f5857a46ff4452e4fbb0e689dcc6ed3c Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Tue, 19 Jan 2021 16:01:20 -0800
Subject: [PATCH 3/7] Use count_nonzero instead of sum for perf improvement

---
 pandas/_libs/algos.pyx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 5f39a20ef276a..7087f56044b40 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -443,14 +443,13 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
                     col = ranked_mat[valid.nonzero()][:, yi]
                 else:
                     col = ranked_mat[:, yi]
-                # Unfortunately we have to sort here, since we can have tied indices
                 n_obs = col.shape[0]
                 total_concordant = 0
                 total_discordant = 0
                 for j in range(n_obs - 1):
                     currj = col[j]
                     # Count num concordant and discordant pairs
-                    n_concordant = np.sum(col[j+1:]>=currj)
+                    n_concordant = np.count_nonzero(col[j+1:]>=currj)
                     total_concordant += n_concordant
                     total_discordant += (n_obs-1-j-n_concordant)
                 kendall_tau = (total_concordant - total_discordant) / \

From b2384bf8fe29ae16eb4095c68cc958b430337cb0 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Tue, 19 Jan 2021 19:37:07 -0800
Subject: [PATCH 4/7] More perf improvements

---
 pandas/_libs/algos.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 7087f56044b40..c465d312ccd48 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -449,7 +449,10 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
                 for j in range(n_obs - 1):
                     currj = col[j]
                     # Count num concordant and discordant pairs
-                    n_concordant = np.count_nonzero(col[j+1:]>=currj)
+                    n_concordant = 0
+                    for k in range(j, n_obs):
+                        if col[k] > currj:
+                            n_concordant += 1
                     total_concordant += n_concordant
                     total_discordant += (n_obs-1-j-n_concordant)
                 kendall_tau = (total_concordant - total_discordant) / \

From 26f4771917a57abf585d1f880b22fb5feb03a227 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 19 Jan 2021 19:48:43 -0800
Subject: [PATCH 5/7] Update whatsnew

---
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 8a1544801d0cf..23afc82a0345b 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -174,7 +174,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`)
 - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`)
--
+- Performance improvement in :meth:`DataFrame.corr` for ``method="kendall"`` (:issue:`28329`)
 
 .. ---------------------------------------------------------------------------
 

From 85a33c8435bdf02309384e43ece78a50f9742437 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 20 Jan 2021 08:37:02 -0800
Subject: [PATCH 6/7] Docstring & comments

---
 pandas/_libs/algos.pyx | 22 ++++++++++++++++++++--
 pandas/core/frame.py   |  3 +--
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index c465d312ccd48..7dc73e0cb0fd6 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -395,13 +395,28 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
 
 # ----------------------------------------------------------------------
 # Kendall correlation
-
+# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
+    """
+    Perform kendall correlation on a 2d array
+
+    Parameters
+    ----------
+    mat : Array to compute kendall correlation on
+    minp : int, default 1
+        Minimum number of observations required per pair of columns
+        to have a valid result.
+
+    Returns
+    -------
+    numpy.ndarray
+        Correlation matrix
+    """
     cdef:
-        Py_ssize_t i, j, xi, yi, N, K
+        Py_ssize_t i, j, k, xi, yi, N, K
         ndarray[float64_t, ndim=2] result
         ndarray[float64_t, ndim=2] ranked_mat
         ndarray[uint8_t, ndim=2] mask
@@ -455,6 +470,9 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
                             n_concordant += 1
                     total_concordant += n_concordant
                     total_discordant += (n_obs-1-j-n_concordant)
+                # Note: we do total_concordant+total_discordant here which is
+                # equivalent to the C(n, 2), the total # of pairs,
+                # listed on wikipedia
                 kendall_tau = (total_concordant - total_discordant) / \
                               (total_concordant + total_discordant)
                 result[xi, yi] = kendall_tau
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c43942ee500cc..288292589e940 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8463,8 +8463,7 @@ def corr(self, method="pearson", min_periods=1) -> DataFrame:
 
         min_periods : int, optional
             Minimum number of observations required per pair of columns
-            to have a valid result. Currently only available for Pearson
-            and Spearman correlation.
+            to have a valid result.
 
         Returns
         -------

From 012b5a1b015e38a6948621174cdea41dadf62f2d Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 20 Jan 2021 09:57:07 -0800
Subject: [PATCH 7/7] Changes from code review

---
 pandas/_libs/algos.pyx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 7dc73e0cb0fd6..080a84bef1e58 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -405,14 +405,15 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
 
     Parameters
     ----------
-    mat : Array to compute kendall correlation on
+    mat : np.ndarray[float64_t, ndim=2]
+        Array to compute kendall correlation on
     minp : int, default 1
         Minimum number of observations required per pair of columns
         to have a valid result.
 
     Returns
     -------
-    numpy.ndarray
+    numpy.ndarray[float64_t, ndim=2]
         Correlation matrix
     """
     cdef:
@@ -469,7 +470,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
                         if col[k] > currj:
                             n_concordant += 1
                     total_concordant += n_concordant
-                    total_discordant += (n_obs-1-j-n_concordant)
+                    total_discordant += (n_obs - 1 - j - n_concordant)
                 # Note: we do total_concordant+total_discordant here which is
                 # equivalent to the C(n, 2), the total # of pairs,
                 # listed on wikipedia