diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 7e01303477f8f..e906d9e461541 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -31,6 +31,10 @@ New features Other enhancements ^^^^^^^^^^^^^^^^^^ +- Handle truncated floats in SAS xport files (see + :class:`~pandas.io.sas.XportReader` and ``pd.read_sas``). + + .. _whatsnew_0180.enhancements.rounding: Datetimelike rounding diff --git a/pandas/io/sas.py b/pandas/io/sas.py index 5f55f861afb72..006c2aaf55ca8 100644 --- a/pandas/io/sas.py +++ b/pandas/io/sas.py @@ -1,5 +1,5 @@ """ -Tools for reading SAS XPort files into Pandas objects. +Read a SAS XPort format file into a Pandas DataFrame. Based on code from Jack Cushman (github.com/jcushman/xport). @@ -25,10 +25,6 @@ 'nifl', 'nifd', 'npos', '_'] -# TODO: Support for 4 byte floats, see https://github.com/jcushman/xport/pull/3 -# Need a test file - - _base_params_doc = """\ Parameters ---------- @@ -161,15 +157,33 @@ def _split_line(s, parts): return out +def _handle_truncated_float_vec(vec, nbytes): + # This feature is not well documented, but some SAS XPORT files + # have 2-7 byte "truncated" floats. To read these truncated + # floats, pad them with zeros on the right to make 8 byte floats. + # + # References: + # https://github.com/jcushman/xport/pull/3 + # The R "foreign" library + + if nbytes != 8: + vec1 = np.zeros(len(vec), np.dtype('S8')) + dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes)) + vec2 = vec1.view(dtype=dtype) + vec2['f0'] = vec + return vec2 + + return vec + + def _parse_float_vec(vec): """ - Parse a vector of 8-byte values representing IBM 8 byte floats - into native 8 byte floats. + Parse a vector of float values representing IBM 8 byte floats into + native 8 byte floats. 
""" dtype = np.dtype('>u4,>u4') vec1 = vec.view(dtype=dtype) - xport1 = vec1['f0'] xport2 = vec1['f1'] @@ -266,7 +280,8 @@ def _read_header(self): raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() - file_info = _split_line(line2, [ ['prefix',24], ['version',8], ['OS',8], ['_',24], ['created',16]]) + file_info = _split_line(line2, [['prefix', 24], ['version', 8], ['OS', 8], + ['_', 24], ['created', 16]]) if file_info['prefix'] != "SAS SAS SASLIB": raise ValueError("Header record has invalid prefix.") file_info['created'] = _parse_date(file_info['created']) @@ -283,11 +298,11 @@ def _read_header(self): fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135 # member info - member_info = _split_line(self._get_row(), [['prefix',8], ['set_name',8], - ['sasdata',8],['version',8], - ['OS',8],['_',24],['created',16]]) - member_info.update( _split_line(self._get_row(), [['modified',16], ['_',16], - ['label',40],['type',8]])) + member_info = _split_line(self._get_row(), [['prefix', 8], ['set_name', 8], + ['sasdata', 8],['version', 8], + ['OS', 8],['_', 24],['created', 16]]) + member_info.update( _split_line(self._get_row(), [['modified', 16], ['_', 16], + ['label', 40],['type', 8]])) member_info['modified'] = _parse_date(member_info['modified']) member_info['created'] = _parse_date(member_info['created']) self.member_info = member_info @@ -313,8 +328,9 @@ def _read_header(self): field = dict(zip(_fieldkeys, fieldstruct)) del field['_'] field['ntype'] = types[field['ntype']] - if field['ntype'] == 'numeric' and field['field_length'] != 8: - raise TypeError("Only 8-byte floats are currently implemented. Can't read field %s." % field) + fl = field['field_length'] + if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): + raise TypeError("Floating point field width %d is not between 2 and 8." % fw) for k, v in field.items(): try: @@ -339,11 +355,7 @@ def _read_header(self): # Setup the dtype. 
dtypel = [] for i,field in enumerate(self.fields): - ntype = field['ntype'] - if ntype == "numeric": - dtypel.append(('s' + str(i), ">u8")) - elif ntype == "char": - dtypel.append(('s' + str(i), "S" + str(field['field_length']))) + dtypel.append(('s' + str(i), "S" + str(field['field_length']))) dtype = np.dtype(dtypel) self._dtype = dtype @@ -416,8 +428,8 @@ def get_chunk(self, size=None): def _missing_double(self, vec): v = vec.view(dtype='u1,u1,u2,u4') miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0) - miss1 = ((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |\ - (v['f0'] == 0x5f) | (v['f0'] == 0x2e) + miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) | + (v['f0'] == 0x5f) | (v['f0'] == 0x2e)) miss &= miss1 return miss @@ -440,6 +452,7 @@ def read(self, nrows=None): vec = data['s%d' % j] ntype = self.fields[j]['ntype'] if ntype == "numeric": + vec = _handle_truncated_float_vec(vec, self.fields[j]['field_length']) miss = self._missing_double(vec) v = _parse_float_vec(vec) v[miss] = np.nan diff --git a/pandas/io/tests/data/paxraw_d_short.csv b/pandas/io/tests/data/paxraw_d_short.csv new file mode 100644 index 0000000000000..776799df5d8a2 --- /dev/null +++ b/pandas/io/tests/data/paxraw_d_short.csv @@ -0,0 +1,101 @@ +SEQN,PAXSTAT,PAXCAL,PAXDAY,PAXN,PAXHOUR,PAXMINUT,PAXINTEN,PAXSTEP +31128,1,1,1,1,0,0,166,4 +31128,1,1,1,2,0,1,27,0 +31128,1,1,1,3,0,2,0,0 +31128,1,1,1,4,0,3,276,4 +31128,1,1,1,5,0,4,0,0 +31128,1,1,1,6,0,5,0,0 +31128,1,1,1,7,0,6,0,0 +31128,1,1,1,8,0,7,0,0 +31128,1,1,1,9,0,8,0,0 +31128,1,1,1,10,0,9,0,0 +31128,1,1,1,11,0,10,0,0 +31128,1,1,1,12,0,11,0,0 +31128,1,1,1,13,0,12,0,0 +31128,1,1,1,14,0,13,0,0 +31128,1,1,1,15,0,14,0,0 +31128,1,1,1,16,0,15,0,0 +31128,1,1,1,17,0,16,0,0 +31128,1,1,1,18,0,17,0,0 +31128,1,1,1,19,0,18,0,0 +31128,1,1,1,20,0,19,0,0 +31128,1,1,1,21,0,20,260,3 +31128,1,1,1,22,0,21,0,0 +31128,1,1,1,23,0,22,0,0 +31128,1,1,1,24,0,23,19,0 +31128,1,1,1,25,0,24,34,1 +31128,1,1,1,26,0,25,47,4 +31128,1,1,1,27,0,26,4,0 +31128,1,1,1,28,0,27,11,0 
+31128,1,1,1,29,0,28,48,1 +31128,1,1,1,30,0,29,58,3 +31128,1,1,1,31,0,30,32,2 +31128,1,1,1,32,0,31,15,1 +31128,1,1,1,33,0,32,117,3 +31128,1,1,1,34,0,33,24,0 +31128,1,1,1,35,0,34,61,7 +31128,1,1,1,36,0,35,115,12 +31128,1,1,1,37,0,36,183,11 +31128,1,1,1,38,0,37,68,5 +31128,1,1,1,39,0,38,73,3 +31128,1,1,1,40,0,39,93,7 +31128,1,1,1,41,0,40,201,14 +31128,1,1,1,42,0,41,126,6 +31128,1,1,1,43,0,42,61,4 +31128,1,1,1,44,0,43,97,7 +31128,1,1,1,45,0,44,62,3 +31128,1,1,1,46,0,45,77,10 +31128,1,1,1,47,0,46,105,8 +31128,1,1,1,48,0,47,209,12 +31128,1,1,1,49,0,48,72,4 +31128,1,1,1,50,0,49,50,1 +31128,1,1,1,51,0,50,324,7 +31128,1,1,1,52,0,51,582,16 +31128,1,1,1,53,0,52,387,31 +31128,1,1,1,54,0,53,780,54 +31128,1,1,1,55,0,54,618,10 +31128,1,1,1,56,0,55,0,0 +31128,1,1,1,57,0,56,0,0 +31128,1,1,1,58,0,57,0,0 +31128,1,1,1,59,0,58,123,1 +31128,1,1,1,60,0,59,0,0 +31128,1,1,1,61,1,0,0,0 +31128,1,1,1,62,1,1,0,0 +31128,1,1,1,63,1,2,0,0 +31128,1,1,1,64,1,3,0,0 +31128,1,1,1,65,1,4,0,0 +31128,1,1,1,66,1,5,0,0 +31128,1,1,1,67,1,6,0,0 +31128,1,1,1,68,1,7,0,0 +31128,1,1,1,69,1,8,0,0 +31128,1,1,1,70,1,9,0,0 +31128,1,1,1,71,1,10,0,0 +31128,1,1,1,72,1,11,0,0 +31128,1,1,1,73,1,12,0,0 +31128,1,1,1,74,1,13,0,0 +31128,1,1,1,75,1,14,0,0 +31128,1,1,1,76,1,15,0,0 +31128,1,1,1,77,1,16,0,0 +31128,1,1,1,78,1,17,0,0 +31128,1,1,1,79,1,18,0,0 +31128,1,1,1,80,1,19,0,0 +31128,1,1,1,81,1,20,0,0 +31128,1,1,1,82,1,21,0,0 +31128,1,1,1,83,1,22,0,0 +31128,1,1,1,84,1,23,0,0 +31128,1,1,1,85,1,24,0,0 +31128,1,1,1,86,1,25,0,0 +31128,1,1,1,87,1,26,0,0 +31128,1,1,1,88,1,27,0,0 +31128,1,1,1,89,1,28,0,0 +31128,1,1,1,90,1,29,0,0 +31128,1,1,1,91,1,30,0,0 +31128,1,1,1,92,1,31,0,0 +31128,1,1,1,93,1,32,0,0 +31128,1,1,1,94,1,33,0,0 +31128,1,1,1,95,1,34,2,0 +31128,1,1,1,96,1,35,0,0 +31128,1,1,1,97,1,36,0,0 +31128,1,1,1,98,1,37,0,0 +31128,1,1,1,99,1,38,0,0 +31128,1,1,1,100,1,39,0,0 diff --git a/pandas/io/tests/data/paxraw_d_short.xpt b/pandas/io/tests/data/paxraw_d_short.xpt new file mode 100644 index 0000000000000..da5bf98244342 Binary 
files /dev/null and b/pandas/io/tests/data/paxraw_d_short.xpt differ diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/test_sas.py index 8d1041229bf3c..2691b3f8b9c5f 100644 --- a/pandas/io/tests/test_sas.py +++ b/pandas/io/tests/test_sas.py @@ -22,6 +22,7 @@ def setUp(self): self.file01 = os.path.join(self.dirpath, "DEMO_G.XPT") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.XPT") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.XPT") + self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") def test1(self): @@ -110,3 +111,21 @@ def test3(self): data = read_sas(self.file03) tm.assert_frame_equal(data, data_csv) + + + def test4(self): + # Test with paxraw_d_short.xpt, a shortened version of: + # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP + # This file has truncated floats (5 bytes in this case). + + data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv")) + + data = XportReader(self.file04).read() + for x in data: + data[x] = data[x].astype(np.int64) + tm.assert_frame_equal(data, data_csv) + + data = read_sas(self.file04) + for x in data: + data[x] = data[x].astype(np.int64) + tm.assert_frame_equal(data, data_csv)