pandas-dev · bashtage · Aug 9, 2016
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ Other enhancements
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
 - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`)
 - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
+- ``read_stata()`` and ``StataReader`` now raise a ``ValueError`` with a more explicit error message, listing the repeated labels, when reading Stata files that contain repeated value labels with ``convert_categoricals=True`` (:issue:`13923`)
 - ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`)
 - ``DataFrame.style`` will now show column level names (e.g. ``DataFrame.columns.names``) (:issue:`13775`)
 - ``DataFrame`` has gained support to re-order the columns based on the values

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1649,7 +1649,15 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                         categories.append(value_label_dict[label][category])
                     else:
                         categories.append(category)  # Partially labeled
-                cat_data.categories = categories
+                try:
+                    cat_data.categories = categories
+                except ValueError:  # setter rejects duplicate categories
+                    vc = Series(categories).value_counts()
+                    repeats = list(vc.index[vc > 1])
+                    repeats = '\n' + '-' * 80 + '\n' + '\n'.join(repeats)
+                    msg = 'Value labels for column {0} are not unique. The ' \
+                          'repeated labels are:\n{1}'.format(col, repeats)
+                    raise ValueError(msg)
                 # TODO: is the next line needed above in the data(...) method?
                 cat_data = Series(cat_data, index=data.index)
                 cat_converted_data.append((col, cat_data))

diff --git a/pandas/io/tests/data/stata15.dta b/pandas/io/tests/data/stata15.dta
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -80,6 +80,7 @@ def setUp(self):
         self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta')
 
         self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
+        self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
     def read_dta(self, file):
         # Legacy default reader configuration
@@ -1212,6 +1213,12 @@ def test_unsupported_datetype(self):
             with tm.ensure_clean() as path:
                 original.to_stata(path)
 
+    def test_repeated_column_labels(self):
+        # GH 13923: the error is expected to name the repeated label.
+        # The message is matched by the context manager, after the raise.
+        with tm.assertRaisesRegexp(ValueError, 'wolof'):
+            read_stata(self.dta23, convert_categoricals=True)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)