From 8de37e96aafd688d99b1f20afb458c9daf6e9df2 Mon Sep 17 00:00:00 2001 From: taoufik Date: Mon, 6 Jan 2020 11:39:51 +0100 Subject: [PATCH 1/4] Fix read_json category dtype --- pandas/io/json/_json.py | 5 +++-- pandas/tests/io/json/test_pandas.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 7f2aab569ab71..4270f312d56a3 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -12,7 +12,7 @@ from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError -from pandas.core.dtypes.common import ensure_str, is_period_dtype +from pandas.core.dtypes.common import ensure_str, is_period_dtype, is_categorical_dtype from pandas import DataFrame, MultiIndex, Series, isna, to_datetime from pandas.core.construction import create_series_with_explicit_dtype @@ -892,7 +892,8 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): ) if dtype is not None: try: - dtype = np.dtype(dtype) + if not is_categorical_dtype(dtype): + dtype = np.dtype(dtype) return data.astype(dtype), True except (TypeError, ValueError): return data, False diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 09d8a1d3f10ea..16429890e0606 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1197,6 +1197,18 @@ def test_read_local_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + def test_read_json_category_dtype(self): + json = ( + '{"a": 0, "b": "A"}\n' + '{"a": 1, "b": "B"}\n' + '{"a": 2, "b": "B"}\n' + '{"a": 3, "b": "B"}\n' + ) + json = StringIO(json) + result = read_json(json, lines=True, dtype={"a": "category"}) + expected = DataFrame([[0, "foo"], [1, "bar"], [2, "foo"], [3, "bar"]]) + tm.assert_frame_equal(result, expected) + def test_read_jsonl_unicode_chars(self): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK From 3f99f04e18d00f5a98678c674eb7f712ec1868c7 Mon Sep 17 00:00:00 2001 From: taoufik Date: Mon, 6 Jan 2020 12:08:55 +0100 Subject: [PATCH 2/4] Use pandas_dtype instead of np.dtype --- pandas/io/json/_json.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 4270f312d56a3..5c95ef48c8e9b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -12,7 +12,12 @@ from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError -from pandas.core.dtypes.common import ensure_str, is_period_dtype, is_categorical_dtype +from pandas.core.dtypes.common import ( + ensure_str, + is_period_dtype, + is_categorical_dtype, + pandas_dtype, +) from pandas import DataFrame, MultiIndex, Series, isna, to_datetime from pandas.core.construction import create_series_with_explicit_dtype @@ -893,7 +898,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): if dtype is not None: try: if not is_categorical_dtype(dtype): - dtype = np.dtype(dtype) + dtype = pandas_dtype(dtype) return data.astype(dtype), True except (TypeError, ValueError): return data, False From e92b4cb9cba7abbc7450f253d1f0e0f7fbbe719b Mon Sep 17 00:00:00 2001 From: taoufik Date: Mon, 6 Jan 2020 12:23:15 +0100 Subject: [PATCH 3/4] Add tests --- pandas/tests/io/json/test_pandas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 16429890e0606..686a0ce99f9aa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -8,6 +8,7 @@ import pytest from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.core.dtypes.common import is_categorical import pandas.util._test_decorators as td import pandas as pd @@ -1201,13 +1202,12 @@ def test_read_json_category_dtype(self): json = ( '{"a": 0, "b": "A"}\n' '{"a": 1, "b": "B"}\n' - '{"a": 2, "b": "B"}\n' + '{"a": 2, "b": "A"}\n' '{"a": 3, "b": "B"}\n' ) json = StringIO(json) - result = read_json(json, lines=True, dtype={"a": "category"}) - expected = DataFrame([[0, "foo"], [1, "bar"], [2, "foo"], [3, "bar"]]) - tm.assert_frame_equal(result, expected) + result = read_json(json, lines=True, dtype={"b": "category"}) + assert is_categorical(result["b"]) def test_read_jsonl_unicode_chars(self): # GH15132: non-ascii unicode characters From de216026e8e24216a78cb6cbd96ff3fbb8ae10f8 Mon Sep 17 00:00:00 2001 From: taoufik Date: Mon, 6 Jan 2020 13:30:39 +0100 Subject: [PATCH 4/4] Sort imports --- pandas/io/json/_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 5c95ef48c8e9b..8d44a54fbd988 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -13,10 +13,10 @@ from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( + pandas_dtype, ensure_str, is_period_dtype, is_categorical_dtype, - pandas_dtype, ) from pandas import DataFrame, MultiIndex, Series, isna, to_datetime