ENH: Add support for reading value labels from 108-format and prior Stata dta files #58155

Merged Apr 9, 2024 · 13 commits
Changes from 9 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -33,6 +33,7 @@ Other enhancements
 - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
 - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
 - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
+- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 -
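
For orientation, here is a minimal usage sketch of what this entry describes. The file name is illustrative (it matches one of the test fixtures added in this PR); pandas.read_stata and pandas.io.stata.StataReader.value_labels are existing public pandas APIs.

    import pandas as pd
    from pandas.io.stata import StataReader

    # Value labels stored in a 108-format or older .dta file are now read, so
    # labelled columns come back as categoricals when convert_categoricals=True
    # (the default).
    df = pd.read_stata("stata4_105.dta")

    # The raw label mappings are also exposed on the reader object.
    with StataReader("stata4_105.dta") as reader:
        df = reader.read()
        value_labels = reader.value_labels()  # e.g. {"fulllab": {1: "one", 2: "two", ...}}
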
75 changes: 49 additions & 26 deletions pandas/io/stata.py
@@ -1502,36 +1502,18 @@ def _decode(self, s: bytes) -> str:
             )
             return s.decode("latin-1")
 
-    def _read_value_labels(self) -> None:
-        self._ensure_open()
-        if self._value_labels_read:
-            # Don't read twice
-            return
-        if self._format_version <= 108:
-            # Value labels are not supported in version 108 and earlier.
-            self._value_labels_read = True
-            self._value_label_dict: dict[str, dict[float, str]] = {}
-            return
-
-        if self._format_version >= 117:
-            self._path_or_buf.seek(self._seek_value_labels)
-        else:
-            assert self._dtype is not None
-            offset = self._nobs * self._dtype.itemsize
-            self._path_or_buf.seek(self._data_location + offset)
-
-        self._value_labels_read = True
-        self._value_label_dict = {}
-
+    def _read_new_value_labels(self) -> None:
         while True:
             if self._format_version >= 117:
                 if self._path_or_buf.read(5) == b"</val":  # <lbl>
                     break  # end of value label table
 
             slength = self._path_or_buf.read(4)
             if not slength:
-                break  # end of value label table (format < 117)
-            if self._format_version <= 117:
+                break  # end of value label table (format < 117), or end-of-file
+            if self._format_version == 108:
+                labname = self._decode(self._path_or_buf.read(9))
+            elif self._format_version <= 117:
                 labname = self._decode(self._path_or_buf.read(33))
             else:
                 labname = self._decode(self._path_or_buf.read(129))
@@ -1555,9 +1537,50 @@ def _read_value_labels(self) -> None:
                     self._value_label_dict[labname][val[i]] = self._decode(
                         txt[off[i] : end]
                     )
 
             if self._format_version >= 117:
                 self._path_or_buf.read(6)  # </lbl>
 
+    def _read_old_value_labels(self) -> None:
+        while True:
Contributor:
Small docstring (one-liner) here and above indicating the versions that this targets would help readability without having to dig deeper.

Contributor Author:
Good idea - I have now added this.

+            if not self._path_or_buf.read(2):
+                # end-of-file may have been reached, if so stop here
+                break
+
+            # otherwise back up and read again, taking byteorder into account
+            self._path_or_buf.seek(-2, os.SEEK_CUR)
+            n = self._read_uint16()
+            labname = self._decode(self._path_or_buf.read(9))
+            self._path_or_buf.read(1)  # padding
+            codes = np.frombuffer(
+                self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n
+            )
+            self._value_label_dict[labname] = {}
+            for i in range(n):
+                self._value_label_dict[labname][codes[i]] = self._decode(
+                    self._path_or_buf.read(8)
+                )
+
+    def _read_value_labels(self) -> None:
+        self._ensure_open()
+        if self._value_labels_read:
+            # Don't read twice
+            return
+
+        if self._format_version >= 117:
Contributor:
Should this logic move to the helper functions? It seems cleaner to me to move the cursor in the code that does the actual reading. I think the other lines that check whether the labels were already read can stay here.

Contributor Author:
I think I was trying to avoid duplicating the seek-location calculations between the two functions, but that does make sense (maybe, long term, the calculations could be moved to _read_old_header and stored in self._seek_value_labels, which would then match _read_new_header?). I have now made the suggested change here too (see the sketch after this hunk).

+            self._path_or_buf.seek(self._seek_value_labels)
+        else:
+            assert self._dtype is not None
+            offset = self._nobs * self._dtype.itemsize
+            self._path_or_buf.seek(self._data_location + offset)
+
+        self._value_labels_read = True
Contributor:
Perhaps move these to after the block that does the read. It makes a bit more sense there now that this is short.

Contributor Author:
I agree, I have now moved this down.

+        self._value_label_dict: dict[str, dict[int, str]] = {}
Contributor:
Perhaps one last one. Should this be moved to __init__? I prefer to declare all attributes there since it avoids late addition of attributes.

Contributor Author:
I have now moved this up as suggested. I put it in the "# State variables for the file" section as this seemed the closest fit, but I can shift it around if you like.


+        if self._format_version >= 108:
+            self._read_new_value_labels()
+        else:
+            self._read_old_value_labels()
+
     def _read_strls(self) -> None:
         self._path_or_buf.seek(self._seek_strls)
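
As a rough sketch of the refactor discussed in the thread above (and of the one-line docstrings suggested earlier), each helper might position the cursor itself before parsing. This mirrors the reviewer's suggestion rather than the exact code of the later commits, and assumes the surrounding StataReader attributes shown in the diff:

    def _read_new_value_labels(self) -> None:
        """Read value labels with variable-length strings (format 108 and later)."""
        if self._format_version >= 117:
            self._path_or_buf.seek(self._seek_value_labels)
        else:
            assert self._dtype is not None
            offset = self._nobs * self._dtype.itemsize
            self._path_or_buf.seek(self._data_location + offset)
        while True:
            ...  # same parsing loop as in the diff above

    def _read_old_value_labels(self) -> None:
        """Read value labels with fixed-length strings (format 105 and earlier)."""
        assert self._dtype is not None
        offset = self._nobs * self._dtype.itemsize
        self._path_or_buf.seek(self._data_location + offset)
        while True:
            ...  # same parsing loop as in the diff above
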
@@ -1729,7 +1752,7 @@ def read(
                     i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
                 )
 
-        if convert_categoricals and self._format_version > 108:
+        if convert_categoricals:
             data = self._do_convert_categoricals(
                 data, self._value_label_dict, self._lbllist, order_categoricals
             )
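
With the version guard removed, categorical conversion now also runs for 108-format and older files. Conceptually, applying a value-label mapping to a column of codes works like the following sketch built on public pandas APIs; it is an illustration, not the actual _do_convert_categoricals implementation, and the label set shown is hypothetical:

    import pandas as pd

    codes = pd.Series([1, 2, 1, 3], name="fulllab")
    value_labels = {1: "one", 2: "two", 3: "three"}  # hypothetical value-label mapping

    # Map each code to its label, then cast the result to a categorical dtype.
    labelled = codes.map(value_labels).astype("category")
    print(labelled.cat.categories.tolist())  # ['one', 'three', 'two']
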
@@ -1845,7 +1868,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
     def _do_convert_categoricals(
         self,
         data: DataFrame,
-        value_label_dict: dict[str, dict[float, str]],
+        value_label_dict: dict[str, dict[int, str]],
         lbllist: Sequence[str],
         order_categoricals: bool,
     ) -> DataFrame:
@@ -1983,7 +2006,7 @@ def variable_labels(self) -> dict[str, str]:
         self._ensure_open()
         return dict(zip(self._varlist, self._variable_labels))
 
-    def value_labels(self) -> dict[str, dict[float, str]]:
+    def value_labels(self) -> dict[str, dict[int, str]]:
         """
         Return a nested dict associating each variable name to its value and label.

Binary file added pandas/tests/io/data/stata/stata4_105.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_108.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_111.dta
Binary file not shown.
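
The three binary fixtures above exercise the pre-108 code path. As a self-contained illustration of the fixed-width record layout that the new _read_old_value_labels helper parses (the layout is inferred from that helper, not from Stata's file-format documentation; build_record and parse_record are hypothetical helpers for this sketch):

    import struct

    # Assumed layout, mirroring _read_old_value_labels: uint16 count, 9-byte
    # null-padded label name, 1 padding byte, then `count` int16 codes followed
    # by `count` 8-byte null-padded label strings.
    def build_record(labname: str, mapping: dict[int, str], byteorder: str = "<") -> bytes:
        codes = list(mapping)
        rec = struct.pack(f"{byteorder}H", len(codes))
        rec += labname.encode().ljust(9, b"\0") + b"\0"
        rec += struct.pack(f"{byteorder}{len(codes)}h", *codes)
        for code in codes:
            rec += mapping[code].encode().ljust(8, b"\0")
        return rec

    def parse_record(buf: bytes, byteorder: str = "<") -> tuple[str, dict[int, str]]:
        (n,) = struct.unpack_from(f"{byteorder}H", buf, 0)
        labname = buf[2:11].split(b"\0")[0].decode()
        codes = struct.unpack_from(f"{byteorder}{n}h", buf, 12)
        labels = [
            buf[12 + 2 * n + 8 * i : 12 + 2 * n + 8 * (i + 1)].split(b"\0")[0].decode()
            for i in range(n)
        ]
        return labname, dict(zip(codes, labels))

    record = build_record("fulllab", {1: "one", 2: "two", 3: "three"})
    print(parse_record(record))  # ('fulllab', {1: 'one', 2: 'two', 3: 'three'})
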
48 changes: 47 additions & 1 deletion pandas/tests/io/test_stata.py
@@ -225,7 +225,7 @@ def test_read_dta3(self, file, datapath):
         tm.assert_frame_equal(parsed, expected)
 
     @pytest.mark.parametrize(
-        "file", ["stata4_113", "stata4_114", "stata4_115", "stata4_117"]
+        "file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
     )
     def test_read_dta4(self, file, datapath):
         file = datapath("io", "data", "stata", f"{file}.dta")
@@ -270,6 +270,52 @@ def test_read_dta4(self, file, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)
 
+    @pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
+    def test_readold_dta4(self, file, datapath):
+        # This test is the same as test_read_dta4 above except that the columns
+        # had to be renamed to match the restrictions in older file format
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = self.read_dta(file)
+
+        expected = DataFrame.from_records(
+            [
+                ["one", "ten", "one", "one", "one"],
+                ["two", "nine", "two", "two", "two"],
+                ["three", "eight", "three", "three", "three"],
+                ["four", "seven", 4, "four", "four"],
+                ["five", "six", 5, np.nan, "five"],
+                ["six", "five", 6, np.nan, "six"],
+                ["seven", "four", 7, np.nan, "seven"],
+                ["eight", "three", 8, np.nan, "eight"],
+                ["nine", "two", 9, np.nan, "nine"],
+                ["ten", "one", "ten", np.nan, "ten"],
+            ],
+            columns=[
+                "fulllab",
+                "fulllab2",
+                "incmplab",
+                "misslab",
+                "floatlab",
+            ],
+        )
+
+        # these are all categoricals
+        for col in expected:
+            orig = expected[col].copy()
+
+            categories = np.asarray(expected["fulllab"][orig.notna()])
+            if col == "incmplab":
+                categories = orig
+
+            cat = orig.astype("category")._values
+            cat = cat.set_categories(categories, ordered=True)
+            cat.categories.rename(None, inplace=True)
+
+            expected[col] = cat
+
+        # stata doesn't save .category metadata
+        tm.assert_frame_equal(parsed, expected)
+
     # File containing strls
     def test_read_dta12(self, datapath):
         parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))