diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index b1fa6dbed442d..7041352971407 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -27,6 +27,7 @@ API changes Enhancements ~~~~~~~~~~~~ +- ``StataReader``: Properly support sorting categorical variables read from Stata files; they are now created as ordered categoricals. .. _whatsnew_0152.performance: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c2542594861c4..36ac705e4e3ed 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1139,12 +1139,17 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None, )[0] for i in cols: col = data.columns[i] - labeled_data = np.copy(data[col]) - labeled_data = labeled_data.astype(object) - for k, v in compat.iteritems( - self.value_label_dict[self.lbllist[i]]): - labeled_data[(data[col] == k).values] = v - data[col] = Categorical.from_array(labeled_data) + codes = np.nan_to_num(data[col]) + codes = codes.astype(int) + codes = codes-1 + categories = [] + labeldict = self.value_label_dict[self.lbllist[i]] + for j in range(max(labeldict.keys())): + try: + categories.append(labeldict[j+1]) + except: + categories.append(j+1) + data[col] = Categorical.from_codes(codes, categories, ordered=True) if not preserve_dtypes: retyped_data = [] diff --git a/pandas/io/tests/data/stata10_117.dta b/pandas/io/tests/data/stata10_117.dta new file mode 100644 index 0000000000000..79dfffd94483f Binary files /dev/null and b/pandas/io/tests/data/stata10_117.dta differ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 2cb7809166be5..10df5cc91558e 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -13,6 +13,7 @@ import pandas as pd from pandas.compat import iterkeys +from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, @@ -81,6 +82,8 @@ def 
setUp(self): self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta') + def read_dta(self, file): # Legacy default reader configuration @@ -744,6 +747,12 @@ def test_drop_column(self): columns = ['byte_', 'int_', 'long_', 'not_found'] read_stata(self.dta15_117, convert_dates=True, columns=columns) + def test_categorical_sorting(self): + dataset = read_stata(self.dta19_117) + dataset = dataset.sort("srh") + expected = Categorical.from_codes(codes=[-1, -1, 0, 1, 1, 1, 2, 2, 3, 4], categories=["Poor", "Fair", "Good", "Very good", "Excellent"]) + tm.assert_equal(True, (np.asarray(expected)==np.asarray(dataset["srh"])).all()) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)