From ecf941bd44cbc4a68b4f2d16b68bf16bbfeabe5c Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 09:25:25 -0700 Subject: [PATCH 1/8] Preserve index on json_normalize --- pandas/io/json/_normalize.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 49f95430d9bb9..2e35325b39d71 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -19,7 +19,7 @@ from pandas._libs.writers import convert_json_to_lines import pandas as pd -from pandas import DataFrame +from pandas import DataFrame, Series if TYPE_CHECKING: from collections.abc import Iterable @@ -266,7 +266,7 @@ def _simple_json_normalize( def json_normalize( - data: dict | list[dict], + data: dict | list[dict] | Series, record_path: str | list | None = None, meta: str | list[str | list[str]] | None = None, meta_prefix: str | None = None, @@ -455,6 +455,11 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: ) return result + if isinstance(data, Series): + index = data.index + else: + index = None + if isinstance(data, list) and not data: return DataFrame() elif isinstance(data, dict): @@ -477,7 +482,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: and record_prefix is None and max_level is None ): - return DataFrame(_simple_json_normalize(data, sep=sep)) + return DataFrame(_simple_json_normalize(data, sep=sep), index=index) if record_path is None: if any([isinstance(x, dict) for x in y.values()] for y in data): @@ -489,7 +494,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep, max_level=max_level) - return DataFrame(data) + return DataFrame(data, index=index) elif not isinstance(record_path, list): record_path = [record_path] @@ -564,4 +569,6 @@ def _recursive_extract(data, path, seen_meta, level: int = 0) -> None: values[i] = val result[k] = values.repeat(lengths) + if index is not None: + result.index = index.repeat(lengths) return result From f5cc0f6185628a1c8619299e182f278f73e33cd2 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 10:51:41 -0700 Subject: [PATCH 2/8] Update unit tests --- pandas/tests/io/json/test_normalize.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 0f33883feba3a..3b0e77d3f6dfa 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -560,6 +560,14 @@ def test_top_column_with_leading_underscore(self): expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"]) tm.assert_frame_equal(result, expected) + + def test_series_index(self, state_data): + idx = [7, 8] + series = Series(state_data, index=idx) + result = json_normalize(series) + assert (result.index == idx).all() + result = json_normalize(series, "counties") + assert (result.index == np.array(idx).repeat([3, 2])).all() class TestNestedToRecord: @@ -893,4 +901,5 @@ def test_series_non_zero_index(self): "elements.c": [np.nan, np.nan, 3.0], } ) + expected.index = [1, 2, 3] tm.assert_frame_equal(result, expected) From ec6aef228d5d4c13ad7e5d78202fd81b2813afbd Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 10:59:53 -0700 Subject: [PATCH 3/8] Update release notes --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 012fe47c476d1..f843d1648dc9b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -31,6 +31,7 @@ Other enhancements - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) +- Support passing a ``Series`` input to :func:`normalize_json` (:issue:`51452`) - .. --------------------------------------------------------------------------- From decdc23f625a553004d1d228dd5da2fe16b3abb3 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 11:01:14 -0700 Subject: [PATCH 4/8] Pass linter --- pandas/io/json/_normalize.py | 5 ++++- pandas/tests/io/json/test_normalize.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 2e35325b39d71..59585d7e3aca7 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -19,7 +19,10 @@ from pandas._libs.writers import convert_json_to_lines import pandas as pd -from pandas import DataFrame, Series +from pandas import ( + DataFrame, + Series, +) if TYPE_CHECKING: from collections.abc import Iterable diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3b0e77d3f6dfa..62d3f0628b0b3 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -560,7 +560,7 @@ def test_top_column_with_leading_underscore(self): expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"]) tm.assert_frame_equal(result, expected) - + def test_series_index(self, state_data): idx = [7, 8] series = Series(state_data, index=idx) From a326898a1b8547f484b8f09983c914886d615a67 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 13:02:54 -0700 Subject: [PATCH 5/8] Set index in constructor --- pandas/tests/io/json/test_normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 62d3f0628b0b3..8b1bf3b264ece 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -899,7 +899,7 @@ def test_series_non_zero_index(self): "elements.a": [1.0, np.nan, np.nan], "elements.b": [np.nan, 2.0, np.nan], "elements.c": [np.nan, np.nan, 3.0], - } + }, + index=[1, 2, 3], ) - expected.index = [1, 2, 3] tm.assert_frame_equal(result, expected) From 8461a3d45c68fadffcdccf2d4e25e31b43cd3a86 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 13:03:12 -0700 Subject: [PATCH 6/8] Use tm assert_index_equal in unit test --- pandas/tests/io/json/test_normalize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 8b1bf3b264ece..d83e7b4641e88 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -562,12 +562,12 @@ def test_top_column_with_leading_underscore(self): tm.assert_frame_equal(result, expected) def test_series_index(self, state_data): - idx = [7, 8] + idx = Index([7, 8]) series = Series(state_data, index=idx) result = json_normalize(series) - assert (result.index == idx).all() + tm.assert_index_equal(result.index, idx) result = json_normalize(series, "counties") - assert (result.index == np.array(idx).repeat([3, 2])).all() + tm.assert_index_equal(result.index, idx.repeat([3, 2])) class TestNestedToRecord: From 5ad9bf634624cee2fe56a0461553b1b339947ab9 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 13:08:57 -0700 Subject: [PATCH 7/8] Update docstring and examples --- pandas/io/json/_normalize.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 59585d7e3aca7..f784004487646 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -283,7 +283,7 @@ def json_normalize( Parameters ---------- - data : dict or list of dicts + data : dict, list of dicts, or Series of dicts Unserialized JSON objects. record_path : str or list of str, default None Path in each object to list of records. If not passed, data will be @@ -368,6 +368,26 @@ def json_normalize( 1 NaN Mark Reg 130 60 2 2.0 Faye Raker 130 60 + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] + >>> series = pd.Series(data, index=pd.Index(["a", "b", "c"])) + >>> pd.json_normalize(series) + id name fitness.height fitness.weight + a 1.0 Cole Volk 130 60 + b NaN Mark Reg 130 60 + c 2.0 Faye Raker 130 60 + >>> data = [ ... { ... "state": "Florida", From 12d71785f55a70c19cf278ebb26f29bae2a4e86b Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 14 Feb 2024 13:11:37 -0700 Subject: [PATCH 8/8] Update release notes --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f843d1648dc9b..ab23f0adcb958 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -31,7 +31,7 @@ Other enhancements - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) -- Support passing a ``Series`` input to :func:`normalize_json` (:issue:`51452`) +- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) - .. ---------------------------------------------------------------------------