diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 649629714c3b1..949bc7b73af7e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -742,6 +742,8 @@ I/O - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) Plotting ^^^^^^^^ diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 221c07a0631d2..a5bfd5866a261 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -244,8 +244,8 @@ cdef class Parser(object): self.parser = parser self.header_length = self.parser.header_length self.column_count = parser.column_count - self.lengths = parser._column_data_lengths - self.offsets = parser._column_data_offsets + self.lengths = parser.column_data_lengths() + self.offsets = parser.column_data_offsets() self.byte_chunk = parser._byte_chunk self.string_chunk = parser._string_chunk self.row_length = parser.row_length @@ -257,7 +257,7 @@ cdef class Parser(object): # page indicators self.update_next_page() - column_types = parser.column_types + column_types = parser.column_types() # map column types for j in range(self.column_count): @@ -375,7 +375,7 @@ cdef class Parser(object): if done: return True return False - elif self.current_page_type == page_data_type: + elif self.current_page_type & page_data_type == page_data_type: self.process_byte_array_with_data( bit_offset + subheader_pointers_offset + self.current_row_on_page_index * self.row_length, @@ -437,7 +437,7 @@ cdef class Parser(object): elif column_types[j] == column_type_string: # string string_chunk[js, current_row] = np.array(source[start:( - start + lngt)]).tostring().rstrip() + start + lngt)]).tostring().rstrip(b"\x00 ") js += 1 self.current_row_on_page_index += 1 diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index efeb306b618d1..3582f538c16bf 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -82,7 +82,6 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self.compression = "" self.column_names_strings = [] self.column_names = [] - self.column_types = [] self.column_formats = [] self.columns = [] @@ -90,6 +89,8 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._cached_page = None self._column_data_lengths = [] self._column_data_offsets = [] + self._column_types = [] + self._current_row_in_file_index = 0 self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 @@ -102,6 +103,19 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._get_properties() self._parse_metadata() + def column_data_lengths(self): + """Return a numpy int64 array of the column data lengths""" + return np.asarray(self._column_data_lengths, dtype=np.int64) + + def column_data_offsets(self): + """Return a numpy int64 array of the column offsets""" + return np.asarray(self._column_data_offsets, dtype=np.int64) + + def column_types(self): + """Returns a numpy character array of the column types: + s (string) or d (double)""" + return np.asarray(self._column_types, dtype=np.dtype('S1')) + def close(self): try: self.handle.close() @@ -287,8 +301,10 @@ def _process_page_meta(self): pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types if self._current_page_type in pt: self._process_page_metadata() - return ((self._current_page_type in [256] + const.page_mix_types) or - (self._current_page_data_subheader_pointers is not None)) + is_data_page = self._current_page_type & const.page_data_type + is_mix_page = self._current_page_type in const.page_mix_types + return (is_data_page or is_mix_page + or self._current_page_data_subheader_pointers != []) def _read_page_header(self): bit_offset = self._page_bit_offset @@ -503,12 +519,6 @@ def _process_columnattributes_subheader(self, offset, length): int_len = self._int_length column_attributes_vectors_count = ( length - 2 * int_len - 12) // (int_len + 8) - self.column_types = np.empty( - column_attributes_vectors_count, dtype=np.dtype('S1')) - self._column_data_lengths = np.empty( - column_attributes_vectors_count, dtype=np.int64) - self._column_data_offsets = np.empty( - column_attributes_vectors_count, dtype=np.int64) for i in range(column_attributes_vectors_count): col_data_offset = (offset + int_len + const.column_data_offset_offset + @@ -520,16 +530,13 @@ def _process_columnattributes_subheader(self, offset, length): const.column_type_offset + i * (int_len + 8)) x = self._read_int(col_data_offset, int_len) - self._column_data_offsets[i] = x + self._column_data_offsets.append(x) x = self._read_int(col_data_len, const.column_data_length_length) - self._column_data_lengths[i] = x + self._column_data_lengths.append(x) x = self._read_int(col_types, const.column_type_length) - if x == 1: - self.column_types[i] = b'd' - else: - self.column_types[i] = b's' + self._column_types.append(b'd' if x == 1 else b's') def _process_columnlist_subheader(self, offset, length): # unknown purpose @@ -586,7 +593,7 @@ def _process_format_subheader(self, offset, length): col.name = self.column_names[current_column_number] col.label = column_label col.format = column_format - col.ctype = self.column_types[current_column_number] + col.ctype = self._column_types[current_column_number] col.length = self._column_data_lengths[current_column_number] self.column_formats.append(column_format) @@ -599,7 +606,7 @@ def read(self, nrows=None): elif nrows is None: nrows = self.row_count - if len(self.column_types) == 0: + if len(self._column_types) == 0: self.close() raise EmptyDataError("No columns to parse from file") @@ -610,8 +617,8 @@ def read(self, nrows=None): if nrows > m: nrows = m - nd = (self.column_types == b'd').sum() - ns = (self.column_types == b's').sum() + nd = self._column_types.count(b'd') + ns = self._column_types.count(b's') self._string_chunk = np.empty((ns, nrows), dtype=np.object) self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) @@ -639,11 +646,13 @@ def _read_next_page(self): self._page_length)) self._read_page_header() - if self._current_page_type == const.page_meta_type: + page_type = self._current_page_type + if page_type == const.page_meta_type: self._process_page_metadata() - pt = [const.page_meta_type, const.page_data_type] - pt += [const.page_mix_types] - if self._current_page_type not in pt: + + is_data_page = page_type & const.page_data_type + pt = [const.page_meta_type] + const.page_mix_types + if not is_data_page and self._current_page_type not in pt: return self._read_next_page() return False @@ -660,7 +669,7 @@ def _chunk_to_dataframe(self): name = self.column_names[j] - if self.column_types[j] == b'd': + if self._column_types[j] == b'd': rslt[name] = self._byte_chunk[jb, :].view( dtype=self.byte_order + 'd') rslt[name] = np.asarray(rslt[name], dtype=np.float64) @@ -674,7 +683,7 @@ def _chunk_to_dataframe(self): rslt[name] = pd.to_datetime(rslt[name], unit=unit, origin="1960-01-01") jb += 1 - elif self.column_types[j] == b's': + elif self._column_types[j] == b's': rslt[name] = self._string_chunk[js, :] if self.convert_text and (self.encoding is not None): rslt[name] = rslt[name].str.decode( @@ -686,6 +695,6 @@ def _chunk_to_dataframe(self): else: self.close() raise ValueError("unknown column type %s" % - self.column_types[j]) + self._column_types[j]) return rslt diff --git a/pandas/tests/io/sas/data/load_log.sas7bdat b/pandas/tests/io/sas/data/load_log.sas7bdat new file mode 100644 index 0000000000000..dc78925471baf Binary files /dev/null and b/pandas/tests/io/sas/data/load_log.sas7bdat differ diff --git a/pandas/tests/io/sas/data/many_columns.csv b/pandas/tests/io/sas/data/many_columns.csv new file mode 100644 index 0000000000000..307fc30f33b9f --- /dev/null +++ b/pandas/tests/io/sas/data/many_columns.csv @@ -0,0 +1,4 @@ +DATASRC,PDDOCID,age,agegt89,ASSESSA,ASSESS1,ASSESS3,ASSESS4,ASSESS5,ASSESS6,ASSESS7,week,BECK,conf1,conf2,conf3,demo3,demo4,demo5,demo6,demo7,demo11a,demo11b,demo11c,demo11d,derm1b,derm2,derm3,derm4,derm5a,derm5b,derm7,derm7a,derm7b,derm8,derm9,ECG3,ecgrtxt,ecgrhr,ecgrpr,ecgrqrs,ecgrqrsaxis,ecgrqt,ecgrqtc,ecgrrep,ecgrtime,mmse1,mmse2,mmse3,mmse4,mmse5,mmse6,mmse7,mmse8,mmse9,mmse10,mmse11,mmse12,mmse13,mmse14,mmse15,mmse16,mmse17,mmse18,mmse19,mmse20,mmse,mmsescor,mrf1,mrf2,mrf3,mrf4,mrf5,mrf6,mrf7,mrf8,mrf9,mrf10,mrf11,mrf12,mrf13,nvitl1s,nvitl1d,nvitl1r,nvitl2s,nvitl2d,nvitl2r,nvitl3s,nvitl3d,nvitl3r,nvitl4s,nvitl4d,nvitl4r,nvitl5,nvitl1,nvitl2,nvitl3,nvitl4,phys1,phys1a,phys14,phys15a,phys15b,phys15c,phys15d,phys16a,phys16b,phys16c,phys16d,phys17a,phys17b,phys17c,phys17d,phys18a,phys18b,phys18c,phys18d,phys19a,phys19b,phys20,phys22,phys24,phys26,phys28,PREG1,PREG2,updrsa,updrs1,updrs2,updrs3,updrs4,updrs5a,updrs6a,updrs7a,updrs8a,updrs9a,updrs10a,updrs11a,updrs12a,updrs13a,updrs14a,updrs15a,updrs16a,updrs17a,updrs18a,updrs19a,updrs20a1,updrs20b1,updrs20c1,updrs20d1,updrs20e1,updrs21a1,updrs21b1,updrs22a1,updrs22b1,updrs22c1,updrs22d1,updrs22e1,updrs23a1,updrs23b1,updrs24a1,updrs24b1,updrs25a1,updrs25b1,updrs26a1,updrs26b1,updrs26c1,updrs26d1,updrs27a,updrs28a,updrs29a,updrs30a,updrs31a,updrs32a,updrs33a,updrs34a,updrs35,updrs36,updrs37,updrs38,updrs39,updrs5b,updrs6b,updrs7b,updrs8b,updrs9b,updrs10b,updrs11b,updrs12b,updrs13b,updrs14b,updrs15b,updrs16b,updrs17b,updrs18b,updrs19b,updrs20a2,updrs20b2,updrs20c2,updrs20d2,updrs20e2,updrs21a2,updrs21b2,updrs22a2,updrs22b2,updrs22c2,updrs22d2,updrs22e2,updrs23a2,updrs23b2,updrs24a2,updrs24b2,updrs25a2,updrs25b2,updrs26a2,updrs26b2,updrs26c2,updrs26d2,updrs27b,updrs28b,updrs29b,updrs30b,updrs31b,updrs32b,updrs33b,updrs34b,updrs5c,updrs6c,updrs7c,updrs8c,updrs9c,updrs10c,updrs11c,updrs12c,updrs13c,updrs14c,updrs15c,updrs16c,updrs17c,updrs32c,updrs33c,updrs34c,updrsmental,updrsadl,updrsadlon,updrsadloff,updrsadlmin,updrstremor,updrstremortreat,updrstremormin,updrsrigid,updrsrigidtreat,updrsrigidmin,updrsmotor,updrsmotortreat,updrsmotormin,updrs,updrstrt,updrsmin,updrs4a,updrs41,updrs42,updrs43,updrs44,updrs45,updrs46,updrs47,updrs48,updrs49,updrs410,updrs411,vitl1s,vitl1d,vitl2,vitl3s,vitl3d,vitl4,vitl5,vitl6,assess,fbeck,conf,demo1,derm,ecg,ecgr,mrf,nvitl,fphys1,fpreg,fupdrs,fupdrs4,vitl,site,race,rImaged,rPD,rPDlt5,rAgeGt30,rHY,rMed,rMelanoma,rPreclude,rNeed,rEligible,gender,incsae,incsusp,incterm,increlated,inctermat,increason,incafter24,incendp,incres,disp2,disp3,disp4,disp6,inex1,inex2,inex3,inex4,inex5,inex6,inex7,inex8,inex9,inex10,inex11,inex12,inex13,inex14,inex15,inex16,inex17,inex18,inex19,inex20,inex21,inex22,inex23,inex24,inex25,inex26,inex27,inex28,treatment,treat,disp,inex,classify,enrollyr,demoyear,dob_yr,inexdays,demodays,onsetdays,diagdays,medstartdays,physdays,phys21dys,phys23dys,phys25dys,phys27dys,phys29dys,confdays,pregdays,nvitldays,nvitlscandays,vitldays,labdays,ecgdays,ecgtestdays,mrfdays,dermdays,dermexamdays,dermbiopdays,mmsedays,beckdays,updrdays,updr4days,assessdays,daystotherapy,dispdays,endpdys,termdys,SAEdys,resdys,lmeddys,wddays,VISIT_NO +a030,ab304,43.0,0.0,0.0,0.0,,,,,,-2.0,0.0,1.0,1.0,,2.0,1.0,19.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,,,,,,0.0,2.0,ABNORMAL,75.0,150.0,100.0,-3.0,410.0,460.0,2.0,1000.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,26.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,94.0,73.0,155.0,96.0,71.0,148.0,91.0,69.0,146.0,67.0,72.0,1.0,42840.0,46080.0,46980.0,30600.0,100.0,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,1.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,1.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.5,95.0,95.0,7.0,,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,1.5,,1.5,7.5,,7.5,20.0,,20.0,25.0,,25.0,,,,,,,,,,,,,138.0,86.0,72.0,130.0,80.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,abc,1.0,1.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,2002.0,1914.0,-28.0,-28.0,-404.0,-28.0,0.0,-28.0,,,,,-6.0,-28.0,-13.0,-13.0,-12.0,-28.0,-28.0,-28.0,-28.0,-28.0,-14.0,-14.0,,-28.0,-28.0,-28.0,,-28.0,,659.0,426.0,659.0,,,658.0,100.0,ab +a030,ab304,43.0,0.0,0.0,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,95.0,95.0,7.0,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,3.0,,,3.0,0.0,,0.0,3.0,,3.0,13.0,,13.0,16.0,,16.0,,,,,,,,,,,,,140.0,86.0,76.0,132.0,80.0,84.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,0.0,0.0,,,,,,,,,0.0,,0.0,,659.0,426.0,659.0,,,658.0,100.0,ab +a030,ab304,43.0,0.0,0.0,0.0,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.5,1.0,2.0,90.0,95.0,7.0,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,0.5,,0.5,2.0,,2.0,16.0,,16.0,21.0,,21.0,0.0,,,,,,,,,,,,149.0,88.0,80.0,136.0,90.0,82.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,1.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,29.0,29.0,,,,,,,,,29.0,29.0,29.0,,659.0,426.0,659.0,,,658.0,100.0,ab diff --git a/pandas/tests/io/sas/data/many_columns.sas7bdat b/pandas/tests/io/sas/data/many_columns.sas7bdat new file mode 100644 index 0000000000000..582316fc59e18 Binary files /dev/null and b/pandas/tests/io/sas/data/many_columns.sas7bdat differ diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index efde152a918bd..f4b14241ed80e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -199,6 +199,22 @@ def test_compact_numerical_values(datapath): tm.assert_series_equal(result, expected, check_exact=True) +def test_many_columns(datapath): + # Test for looking for column information in more places (PR #22628) + fname = datapath("io", "sas", "data", "many_columns.sas7bdat") + df = pd.read_sas(fname, encoding='latin-1') + fname = datapath("io", "sas", "data", "many_columns.csv") + df0 = pd.read_csv(fname, encoding='latin-1') + tm.assert_frame_equal(df, df0) + + +def test_inconsistent_number_of_rows(datapath): + # Regression test for issue #16615. (PR #22628) + fname = datapath("io", "sas", "data", "load_log.sas7bdat") + df = pd.read_sas(fname, encoding='latin-1') + assert len(df) == 2097 + + def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")