Skip to content

Commit 8f21b97

Browse files
authored
SAS7BDAT parser: Speed up blank_missing (#48502)
* SAS7BDAT parser: Speed up blank_missing
* Add what's new
* Fix issue no
* Update v1.6.0.rst
* Update sas.pyx
1 parent 12dce19 commit 8f21b97

File tree

3 files changed

+9
-4
lines changed

3 files changed

+9
-4
lines changed

doc/source/whatsnew/v1.6.0.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,7 @@ Performance improvements
108108
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
109109
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
110110
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
111+
- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
111112
-
112113

113114
.. ---------------------------------------------------------------------------

pandas/io/sas/sas.pyx

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@ ctypedef signed long long int64_t
99
ctypedef unsigned char uint8_t
1010
ctypedef unsigned short uint16_t
1111

12+
cdef object np_nan = np.nan
13+
1214
# rle_decompress decompresses data using a Run Length Encoding
1315
# algorithm. It is partially documented here:
1416
#
@@ -220,6 +222,7 @@ cdef class Parser:
220222
int current_page_subheaders_count
221223
int current_row_in_chunk_index
222224
int current_row_in_file_index
225+
bint blank_missing
223226
int header_length
224227
int row_length
225228
int bit_offset
@@ -235,6 +238,7 @@ cdef class Parser:
235238
char[:] column_types
236239

237240
self.parser = parser
241+
self.blank_missing = parser.blank_missing
238242
self.header_length = self.parser.header_length
239243
self.column_count = parser.column_count
240244
self.lengths = parser.column_data_lengths()
@@ -428,7 +432,10 @@ cdef class Parser:
428432
# .rstrip(b"\x00 ") but without Python call overhead.
429433
while lngt > 0 and source[start+lngt-1] in b"\x00 ":
430434
lngt -= 1
431-
string_chunk[js, current_row] = (&source[start])[:lngt]
435+
if lngt == 0 and self.blank_missing:
436+
string_chunk[js, current_row] = np_nan
437+
else:
438+
string_chunk[js, current_row] = (&source[start])[:lngt]
432439
js += 1
433440

434441
self.current_row_on_page_index += 1

pandas/io/sas/sas7bdat.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -795,9 +795,6 @@ def _chunk_to_dataframe(self) -> DataFrame:
795795
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
796796
if self.convert_text and (self.encoding is not None):
797797
rslt[name] = self._decode_string(rslt[name].str)
798-
if self.blank_missing:
799-
ii = rslt[name].str.len() == 0
800-
rslt[name][ii] = np.nan
801798
js += 1
802799
else:
803800
self.close()

0 commit comments

Comments
 (0)