Skip to content

Commit 5996d17

Browse files
committed
PERF: speed up tz-aware operations by making searchsorted call in bulk,
rather than piecewise
1 parent e935829 commit 5996d17

File tree

1 file changed

+19
-14
lines changed

1 file changed

+19
-14
lines changed

pandas/_libs/tslibs/conversion.pyx

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -636,34 +636,40 @@ cdef inline int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz,
636636
"""
637637
cdef:
638638
Py_ssize_t n = len(values)
639-
Py_ssize_t i, pos
639+
Py_ssize_t i
640+
int64_t[:] pos
640641
int64_t[:] result = np.empty(n, dtype=np.int64)
641642
ndarray[int64_t] trans
642643
int64_t[:] deltas
643644
int64_t v
645+
bint tz_is_local
644646

645-
if not is_tzlocal(tz):
647+
tz_is_local = is_tzlocal(tz)
648+
649+
if not tz_is_local:
646650
# get_dst_info cannot extract offsets from tzlocal because it is
647651
# dependent on a datetime
648652
trans, deltas, _ = get_dst_info(tz)
649653
if not to_utc:
650654
# We add `offset` below instead of subtracting it
651655
deltas = -1 * np.array(deltas, dtype='i8')
652656

657+
# Previously, this search was done pointwise to try and benefit
658+
# from getting to skip searches for iNaTs. However, it seems call
659+
# overhead dominates the search time so doing it once in bulk
660+
# appears to be substantially faster
661+
pos = trans.searchsorted(values, side='right') - 1
662+
653663
for i in range(n):
654664
v = values[i]
655665
if v == NPY_NAT:
656666
result[i] = v
657-
elif is_tzlocal(tz):
667+
elif tz_is_local:
658668
result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=to_utc)
659669
else:
660-
# TODO: Is it more efficient to call searchsorted pointwise or
661-
# on `values` outside the loop? We are not consistent about this.
662-
# relative efficiency of pointwise increases with number of iNaTs
663-
pos = trans.searchsorted(v, side='right') - 1
664-
if pos < 0:
670+
if pos[i] < 0:
665671
raise ValueError('First time before start of DST info')
666-
result[i] = v - deltas[pos]
672+
result[i] = v - deltas[pos[i]]
667673

668674
return result
669675

@@ -1252,9 +1258,9 @@ def is_date_array_normalized(int64_t[:] stamps, object tz=None):
12521258
is_normalized : bool True if all stamps are normalized
12531259
"""
12541260
cdef:
1255-
Py_ssize_t pos, i, n = len(stamps)
1261+
Py_ssize_t i, n = len(stamps)
12561262
ndarray[int64_t] trans
1257-
int64_t[:] deltas
1263+
int64_t[:] deltas, pos
12581264
npy_datetimestruct dts
12591265
int64_t local_val, delta
12601266
str typ
@@ -1283,11 +1289,10 @@ def is_date_array_normalized(int64_t[:] stamps, object tz=None):
12831289
return False
12841290

12851291
else:
1292+
pos = trans.searchsorted(stamps) - 1
12861293
for i in range(n):
12871294
# Adjust datetime64 timestamp, recompute datetimestruct
1288-
pos = trans.searchsorted(stamps[i]) - 1
1289-
1290-
dt64_to_dtstruct(stamps[i] + deltas[pos], &dts)
1295+
dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts)
12911296
if (dts.hour + dts.min + dts.sec + dts.us) > 0:
12921297
return False
12931298

0 commit comments

Comments
 (0)