ENH: Add support for reading value labels from 108-format and prior Stata dta files #58155

Merged Apr 9, 2024 · 13 commits
Changes from 9 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -33,6 +33,7 @@ Other enhancements
 - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
 - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
 - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
+- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 -
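
For orientation, here is a minimal usage sketch of what this entry describes. The file name is illustrative (it matches one of the test fixtures added in this PR); pandas.read_stata and pandas.io.stata.StataReader.value_labels are existing public pandas APIs.

    import pandas as pd
    from pandas.io.stata import StataReader

    # Value labels stored in a 108-format or older .dta file are now read, so
    # labelled columns come back as categoricals when convert_categoricals=True
    # (the default).
    df = pd.read_stata("stata4_105.dta")

    # The raw label mappings are also exposed on the reader object.
    with StataReader("stata4_105.dta") as reader:
        df = reader.read()
        value_labels = reader.value_labels()  # e.g. {"fulllab": {1: "one", 2: "two", ...}}
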
75 changes: 49 additions & 26 deletions pandas/io/stata.py
@@ -1502,36 +1502,18 @@ def _decode(self, s: bytes) -> str:
             )
             return s.decode("latin-1")
 
-    def _read_value_labels(self) -> None:
-        self._ensure_open()
-        if self._value_labels_read:
-            # Don't read twice
-            return
-        if self._format_version <= 108:
-            # Value labels are not supported in version 108 and earlier.
-            self._value_labels_read = True
-            self._value_label_dict: dict[str, dict[float, str]] = {}
-            return
-
-        if self._format_version >= 117:
-            self._path_or_buf.seek(self._seek_value_labels)
-        else:
-            assert self._dtype is not None
-            offset = self._nobs * self._dtype.itemsize
-            self._path_or_buf.seek(self._data_location + offset)
-
-        self._value_labels_read = True
-        self._value_label_dict = {}
-
+    def _read_new_value_labels(self) -> None:
         while True:
             if self._format_version >= 117:
                 if self._path_or_buf.read(5) == b"</val":  # <lbl>
                     break  # end of value label table
 
             slength = self._path_or_buf.read(4)
             if not slength:
-                break  # end of value label table (format < 117)
-            if self._format_version <= 117:
+                break  # end of value label table (format < 117), or end-of-file
+            if self._format_version == 108:
+                labname = self._decode(self._path_or_buf.read(9))
+            elif self._format_version <= 117:
                 labname = self._decode(self._path_or_buf.read(33))
             else:
                 labname = self._decode(self._path_or_buf.read(129))
@@ -1555,9 +1537,50 @@ def _read_value_labels(self) -> None:
                     self._value_label_dict[labname][val[i]] = self._decode(
                         txt[off[i] : end]
                     )
 
             if self._format_version >= 117:
                 self._path_or_buf.read(6)  # </lbl>
 
+    def _read_old_value_labels(self) -> None:
+        while True:
Contributor:
Small docstring (one-liner) here and above indicating the versions that this targets would help readability without having to dig deeper.

Contributor Author:
Good idea - I have now added this.

+            if not self._path_or_buf.read(2):
+                # end-of-file may have been reached, if so stop here
+                break
+
+            # otherwise back up and read again, taking byteorder into account
+            self._path_or_buf.seek(-2, os.SEEK_CUR)
+            n = self._read_uint16()
+            labname = self._decode(self._path_or_buf.read(9))
+            self._path_or_buf.read(1)  # padding
+            codes = np.frombuffer(
+                self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n
+            )
+            self._value_label_dict[labname] = {}
+            for i in range(n):
+                self._value_label_dict[labname][codes[i]] = self._decode(
+                    self._path_or_buf.read(8)
+                )
+
+    def _read_value_labels(self) -> None:
+        self._ensure_open()
+        if self._value_labels_read:
+            # Don't read twice
+            return
+
+        if self._format_version >= 117:
Contributor:
Should this logic move to the helper functions? It seems cleaner to me to move the cursor in the code that does the actual reading. I think the other lines that check whether the labels were already read can stay here.

Contributor Author:
I think I was trying to avoid duplicating the seek-location calculations between the two functions, but that does make sense (maybe, long term, the calculations could be moved to _read_old_header and stored in self._seek_value_labels, which would then match _read_new_header?). I have now made the suggested change here too (see the sketch after this hunk).

+            self._path_or_buf.seek(self._seek_value_labels)
+        else:
+            assert self._dtype is not None
+            offset = self._nobs * self._dtype.itemsize
+            self._path_or_buf.seek(self._data_location + offset)
+
+        self._value_labels_read = True
Contributor:
Perhaps move these to after the block that does the read. It makes a bit more sense there now that this is short.

Contributor Author:
I agree, I have now moved this down.

+        self._value_label_dict: dict[str, dict[int, str]] = {}
Contributor:
Perhaps one last one. Should this be moved to __init__? I prefer to declare all attributes there since it avoids late addition of attributes.

Contributor Author:
I have now moved this up as suggested. I put it in the "# State variables for the file" section as this seemed the closest fit, but I can shift it around if you like.


+        if self._format_version >= 108:
+            self._read_new_value_labels()
+        else:
+            self._read_old_value_labels()
+
     def _read_strls(self) -> None:
         self._path_or_buf.seek(self._seek_strls)
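
As a rough sketch of the refactor discussed in the thread above (and of the one-line docstrings suggested earlier), each helper might position the cursor itself before parsing. This mirrors the reviewer's suggestion rather than the exact code of the later commits, and assumes the surrounding StataReader attributes shown in the diff:

    def _read_new_value_labels(self) -> None:
        """Read value labels with variable-length strings (format 108 and later)."""
        if self._format_version >= 117:
            self._path_or_buf.seek(self._seek_value_labels)
        else:
            assert self._dtype is not None
            offset = self._nobs * self._dtype.itemsize
            self._path_or_buf.seek(self._data_location + offset)
        while True:
            ...  # same parsing loop as in the diff above

    def _read_old_value_labels(self) -> None:
        """Read value labels with fixed-length strings (format 105 and earlier)."""
        assert self._dtype is not None
        offset = self._nobs * self._dtype.itemsize
        self._path_or_buf.seek(self._data_location + offset)
        while True:
            ...  # same parsing loop as in the diff above
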
@@ -1729,7 +1752,7 @@ def read(
                     i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
                 )
 
-        if convert_categoricals and self._format_version > 108:
+        if convert_categoricals:
             data = self._do_convert_categoricals(
                 data, self._value_label_dict, self._lbllist, order_categoricals
             )
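
With the version guard removed, categorical conversion now also runs for 108-format and older files. Conceptually, applying a value-label mapping to a column of codes works like the following sketch built on public pandas APIs; it is an illustration, not the actual _do_convert_categoricals implementation, and the label set shown is hypothetical:

    import pandas as pd

    codes = pd.Series([1, 2, 1, 3], name="fulllab")
    value_labels = {1: "one", 2: "two", 3: "three"}  # hypothetical value-label mapping

    # Map each code to its label, then cast the result to a categorical dtype.
    labelled = codes.map(value_labels).astype("category")
    print(labelled.cat.categories.tolist())  # ['one', 'three', 'two']
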
@@ -1845,7 +1868,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
     def _do_convert_categoricals(
         self,
         data: DataFrame,
-        value_label_dict: dict[str, dict[float, str]],
+        value_label_dict: dict[str, dict[int, str]],
         lbllist: Sequence[str],
         order_categoricals: bool,
     ) -> DataFrame:
@@ -1983,7 +2006,7 @@ def variable_labels(self) -> dict[str, str]:
         self._ensure_open()
         return dict(zip(self._varlist, self._variable_labels))
 
-    def value_labels(self) -> dict[str, dict[float, str]]:
+    def value_labels(self) -> dict[str, dict[int, str]]:
         """
         Return a nested dict associating each variable name to its value and label.

Binary file added pandas/tests/io/data/stata/stata4_105.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_108.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_111.dta
Binary file not shown.
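
The three binary fixtures above exercise the pre-108 code path. As a self-contained illustration of the fixed-width record layout that the new _read_old_value_labels helper parses (the layout is inferred from that helper, not from Stata's file-format documentation; build_record and parse_record are hypothetical helpers for this sketch):

    import struct

    # Assumed layout, mirroring _read_old_value_labels: uint16 count, 9-byte
    # null-padded label name, 1 padding byte, then `count` int16 codes followed
    # by `count` 8-byte null-padded label strings.
    def build_record(labname: str, mapping: dict[int, str], byteorder: str = "<") -> bytes:
        codes = list(mapping)
        rec = struct.pack(f"{byteorder}H", len(codes))
        rec += labname.encode().ljust(9, b"\0") + b"\0"
        rec += struct.pack(f"{byteorder}{len(codes)}h", *codes)
        for code in codes:
            rec += mapping[code].encode().ljust(8, b"\0")
        return rec

    def parse_record(buf: bytes, byteorder: str = "<") -> tuple[str, dict[int, str]]:
        (n,) = struct.unpack_from(f"{byteorder}H", buf, 0)
        labname = buf[2:11].split(b"\0")[0].decode()
        codes = struct.unpack_from(f"{byteorder}{n}h", buf, 12)
        labels = [
            buf[12 + 2 * n + 8 * i : 12 + 2 * n + 8 * (i + 1)].split(b"\0")[0].decode()
            for i in range(n)
        ]
        return labname, dict(zip(codes, labels))

    record = build_record("fulllab", {1: "one", 2: "two", 3: "three"})
    print(parse_record(record))  # ('fulllab', {1: 'one', 2: 'two', 3: 'three'})
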
48 changes: 47 additions & 1 deletion pandas/tests/io/test_stata.py
@@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath):
         tm.assert_frame_equal(parsed, expected)
 
     @pytest.mark.parametrize(
-        "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"]
+        "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
     )
     def test_read_dta4(self, file, datapath):
         file = datapath("io", "data", "stata", f"{file}.dta")
@@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)
 
+    @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
+    def test_readold_dta4(self, file, datapath):
+        # This test is the same as test_read_dta4 above except that the columns
+        # had to be renamed to match the restrictions in older file format
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = self.read_dta(file)
+
+        expected = DataFrame.from_records(
+            [
+                ["one", "ten", "one", "one", "one"],
+                ["two", "nine", "two", "two", "two"],
+                ["three", "eight", "three", "three", "three"],
+                ["four", "seven", 4, "four", "four"],
+                ["five", "six", 5, np.nan, "five"],
+                ["six", "five", 6, np.nan, "six"],
+                ["seven", "four", 7, np.nan, "seven"],
+                ["eight", "three", 8, np.nan, "eight"],
+                ["nine", "two", 9, np.nan, "nine"],
+                ["ten", "one", "ten", np.nan, "ten"],
+            ],
+            columns=[
+                "fulllab",
+                "fulllab2",
+                "incmplab",
+                "misslab",
+                "floatlab",
+            ],
+        )
+
+        # these are all categoricals
+        for col in expected:
+            orig = expected[col].copy()
+
+            categories = np.asarray(expected["fulllab"][orig.notna()])
+            if col == "incmplab":
+                categories = orig
+
+            cat = orig.astype("category")._values
+            cat = cat.set_categories(categories, ordered=True)
+            cat.categories.rename(None, inplace=True)
+
+            expected[col] = cat
+
+        # stata doesn't save .category metadata
+        tm.assert_frame_equal(parsed, expected)
+
     # File containing strls
     def test_read_dta12(self, datapath):
         parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))