Patch summary (text before the first "diff --git" line is ignored by patch tools):
  * pandas/io/stata.py: when assigning the relabelled categories raises
    ValueError, re-raise a ValueError that names the column and lists the
    repeated value labels under an 80-character rule.
  * pandas/io/tests/test_stata.py: add a regression test (GH 13923) that checks
    the raised message mentions the repeated label 'wolof'.
  * doc/source/whatsnew/v0.19.0.txt: add the release note.

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 30a0d918b46ec..e4ecd090f882b 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ Other enhancements
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
 - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`)
 - ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
+- ``read_stata()`` and ``StataReader`` raise with a more explicit error message when reading Stata files with repeated value labels when ``convert_categoricals=True`` (:issue:`13923`)
 - ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`)
 - ``DataFrame.style`` will now show column level names (e.g. ``DataFrame.columns.names``) (:issue:`13775`)
 - ``DataFrame`` has gained support to re-order the columns based on the values
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 59bc24acac6f8..a67fb6651edd0 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1649,7 +1649,15 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                         categories.append(value_label_dict[label][category])
                     else:
                         categories.append(category)  # Partially labeled
-                cat_data.categories = categories
+                try:
+                    cat_data.categories = categories
+                except ValueError:
+                    vc = Series(categories).value_counts()
+                    repeats = list(vc.index[vc > 1])
+                    repeats = '-' * 80 + '\n' + '\n'.join(repeats)
+                    msg = 'Value labels for column {0} are not unique. The ' \
+                          'repeated labels are:\n{1}'.format(col, repeats)
+                    raise ValueError(msg)
                 # TODO: is the next line needed above in the data(...) method?
                 cat_data = Series(cat_data, index=data.index)
                 cat_converted_data.append((col, cat_data))
diff --git a/pandas/io/tests/data/stata15.dta b/pandas/io/tests/data/stata15.dta
new file mode 100644
index 0000000000000..d13e2fa337db3
Binary files /dev/null and b/pandas/io/tests/data/stata15.dta differ
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 2e3182b69acaf..7752fff5247c0 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -80,6 +80,7 @@ def setUp(self):
         self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta')
 
         self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
+        self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
     def read_dta(self, file):
         # Legacy default reader configuration
@@ -1212,6 +1213,11 @@ def test_unsupported_datetype(self):
         with tm.ensure_clean() as path:
             original.to_stata(path)
 
+    def test_repeated_column_labels(self):
+        # GH 13923
+        with tm.assertRaisesRegexp(ValueError, 'wolof'):
+            read_stata(self.dta23, convert_categoricals=True)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)