Skip to content

Commit ba4859f

Browse files
jbrockmendel authored and feefladder committed
PERF: read_stata (pandas-dev#43059)
1 parent eb5978d commit ba4859f

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

doc/source/whatsnew/v1.4.0.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -222,6 +222,7 @@ Performance improvements
222222
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
223223
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
224224
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
225+
- Performance improvement in :func:`read_stata` (:issue:`43059`)
225226
-
226227

227228
.. ---------------------------------------------------------------------------

pandas/io/stata.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1757,7 +1757,10 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
17571757
fmt = cast(str, fmt) # only strs in VALID_RANGE
17581758
nmin, nmax = self.VALID_RANGE[fmt]
17591759
series = data[colname]
1760-
missing = np.logical_or(series < nmin, series > nmax)
1760+
1761+
# appreciably faster to do this with ndarray instead of Series
1762+
svals = series._values
1763+
missing = (svals < nmin) | (svals > nmax)
17611764

17621765
if not missing.any():
17631766
continue
@@ -1776,13 +1779,17 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
17761779
if dtype not in (np.float32, np.float64):
17771780
dtype = np.float64
17781781
replacement = Series(series, dtype=dtype)
1779-
replacement[missing] = np.nan
1782+
# Note: operating on ._values is much faster than directly
1783+
# TODO: can we fix that?
1784+
replacement._values[missing] = np.nan
17801785
replacements[colname] = replacement
17811786
if replacements:
17821787
columns = data.columns
1783-
replacement_df = DataFrame(replacements)
1788+
replacement_df = DataFrame(replacements, copy=False)
17841789
replaced = concat(
1785-
[data.drop(replacement_df.columns, axis=1), replacement_df], axis=1
1790+
[data.drop(replacement_df.columns, axis=1), replacement_df],
1791+
axis=1,
1792+
copy=False,
17861793
)
17871794
data = replaced[columns]
17881795
return data
@@ -1901,7 +1908,7 @@ def _do_convert_categoricals(
19011908
cat_converted_data.append((col, cat_series))
19021909
else:
19031910
cat_converted_data.append((col, data[col]))
1904-
data = DataFrame.from_dict(dict(cat_converted_data))
1911+
data = DataFrame(dict(cat_converted_data), copy=False)
19051912
return data
19061913

19071914
@property

0 commit comments

Comments
 (0)