Skip to content

Commit 37d0683

Browse files
committed
ENH: Cython nancorr speeds up DataFrame.corr with method='pearson' by > 100x
1 parent 73e4cc6 commit 37d0683

File tree

2 files changed

+67
-15
lines changed

2 files changed

+67
-15
lines changed

pandas/core/frame.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3806,22 +3806,26 @@ def corr(self, method='pearson'):
38063806
y : DataFrame
38073807
"""
38083808
numeric_df = self._get_numeric_data()
3809-
mat = numeric_df.values.T
38103809
cols = numeric_df.columns
3810+
mat = numeric_df.values
38113811

3812-
corrf = nanops.get_corr_func(method)
3813-
K = len(cols)
3814-
correl = np.empty((K, K), dtype=float)
3815-
mask = np.isfinite(mat)
3816-
for i, ac in enumerate(mat):
3817-
for j, bc in enumerate(mat):
3818-
valid = mask[i] & mask[j]
3819-
if not valid.all():
3820-
c = corrf(ac[valid], bc[valid])
3821-
else:
3822-
c = corrf(ac, bc)
3823-
correl[i, j] = c
3824-
correl[j, i] = c
3812+
if method == 'pearson':
3813+
correl = lib.nancorr(mat)
3814+
else:
3815+
mat = mat.T
3816+
corrf = nanops.get_corr_func(method)
3817+
K = len(cols)
3818+
correl = np.empty((K, K), dtype=float)
3819+
mask = np.isfinite(mat)
3820+
for i, ac in enumerate(mat):
3821+
for j, bc in enumerate(mat):
3822+
valid = mask[i] & mask[j]
3823+
if not valid.all():
3824+
c = corrf(ac[valid], bc[valid])
3825+
else:
3826+
c = corrf(ac, bc)
3827+
correl[i, j] = c
3828+
correl[j, i] = c
38253829

38263830
return self._constructor(correl, index=cols, columns=cols)
38273831

pandas/src/moments.pyx

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,55 @@ def ewma(ndarray[double_t] input, double_t com):
247247

248248
return output
249249

250-
#-------------------------------------------------------------------------------
250+
#----------------------------------------------------------------------
251+
# Pairwise correlation
252+
253+
@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr(ndarray[float64_t, ndim=2] mat):
    """
    Compute the pairwise Pearson correlation between the columns of ``mat``.

    For every pair of columns only the rows in which both entries are
    finite (per ``np.isfinite``) contribute to that pair's correlation.

    Parameters
    ----------
    mat : ndarray[float64, ndim=2]
        Array of shape (N, K); each of the K columns is one variable.

    Returns
    -------
    result : ndarray[float64, ndim=2]
        Symmetric (K, K) array of correlations. An entry is NaN when the
        two columns share no finite rows, or when either column has zero
        variance over the shared rows.
    """
    cdef:
        Py_ssize_t i, xi, yi, N, K
        ndarray[float64_t, ndim=2] result
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor
        float64_t nan = np.nan

    N, K = (<object> mat).shape

    result = np.empty((K, K), dtype=np.float64)
    # Reinterpret the boolean mask as uint8 so it can be indexed through a
    # typed buffer.
    mask = np.isfinite(mat).view(np.uint8)

    for xi in range(K):
        # Only the lower triangle is computed; each value is mirrored.
        for yi in range(xi + 1):
            # First pass: count the jointly valid rows and sum each column.
            nobs = 0
            sumx = sumy = 0
            for i in range(N):
                if mask[i, xi] and mask[i, yi]:
                    nobs += 1
                    sumx += mat[i, xi]
                    sumy += mat[i, yi]

            if nobs == 0:
                # No overlapping observations: the correlation is undefined.
                # Guard explicitly instead of dividing by zero below.
                result[xi, yi] = result[yi, xi] = nan
                continue

            meanx = sumx / nobs
            meany = sumy / nobs

            # Second pass: centered cross-product (covariance numerator,
            # accumulated in sumx) and centered sums of squares.
            sumx = sumxx = sumyy = 0
            for i in range(N):
                if mask[i, xi] and mask[i, yi]:
                    vx = mat[i, xi] - meanx
                    vy = mat[i, yi] - meany

                    sumx += vx * vy
                    sumxx += vx * vx
                    sumyy += vy * vy

            divisor = sqrt(sumxx * sumyy)
            if divisor != 0:
                result[xi, yi] = result[yi, xi] = sumx / divisor
            else:
                # A constant column has zero variance: report NaN rather
                # than dividing by zero.
                result[xi, yi] = result[yi, xi] = nan

    return result
297+
298+
#----------------------------------------------------------------------
251299
# Rolling variance
252300

253301
def _check_minp(minp, N):

0 commit comments

Comments
 (0)