@@ -1757,7 +1757,10 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
1757
1757
fmt = cast (str , fmt ) # only strs in VALID_RANGE
1758
1758
nmin , nmax = self .VALID_RANGE [fmt ]
1759
1759
series = data [colname ]
1760
- missing = np .logical_or (series < nmin , series > nmax )
1760
+
1761
+ # appreciably faster to do this with ndarray instead of Series
1762
+ svals = series ._values
1763
+ missing = (svals < nmin ) | (svals > nmax )
1761
1764
1762
1765
if not missing .any ():
1763
1766
continue
@@ -1776,13 +1779,17 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
1776
1779
if dtype not in (np .float32 , np .float64 ):
1777
1780
dtype = np .float64
1778
1781
replacement = Series (series , dtype = dtype )
1779
- replacement [missing ] = np .nan
1782
+ # Note: operating on ._values is much faster than assigning through the Series directly
1783
+ # TODO: can masked assignment on the Series itself be made comparably fast?
1784
+ replacement ._values [missing ] = np .nan
1780
1785
replacements [colname ] = replacement
1781
1786
if replacements :
1782
1787
columns = data .columns
1783
- replacement_df = DataFrame (replacements )
1788
+ replacement_df = DataFrame (replacements , copy = False )
1784
1789
replaced = concat (
1785
- [data .drop (replacement_df .columns , axis = 1 ), replacement_df ], axis = 1
1790
+ [data .drop (replacement_df .columns , axis = 1 ), replacement_df ],
1791
+ axis = 1 ,
1792
+ copy = False ,
1786
1793
)
1787
1794
data = replaced [columns ]
1788
1795
return data
@@ -1901,7 +1908,7 @@ def _do_convert_categoricals(
1901
1908
cat_converted_data .append ((col , cat_series ))
1902
1909
else :
1903
1910
cat_converted_data .append ((col , data [col ]))
1904
- data = DataFrame . from_dict (dict (cat_converted_data ))
1911
+ data = DataFrame (dict (cat_converted_data ), copy = False )
1905
1912
return data
1906
1913
1907
1914
@property
0 commit comments