From 1575f2fb8afe366f0e50f6500e73a1741de7ef6a Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Mon, 21 Jun 2021 15:33:48 -0400 Subject: [PATCH 1/8] BUG: json_normalize not consistently ignoring errors (#41876) --- pandas/io/json/_normalize.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 5927d6482d3b0..5b365c37b0c12 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -469,7 +469,16 @@ def _recursive_extract(data, path, seen_meta, level=0): for obj in data: for val, key in zip(_meta, meta_keys): if level + 1 == len(val): - seen_meta[key] = _pull_field(obj, val[-1]) + try: + seen_meta[key] = _pull_field(obj, val[-1]) + except KeyError as e: + if errors == "ignore": + seen_meta[key] = np.nan + else: + raise KeyError( + "Try running with errors='ignore' as key " + f"{e} is not always present" + ) from e _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: From cbea941c15194722e3364dce19692a9be95c1970 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Tue, 22 Jun 2021 13:16:10 -0400 Subject: [PATCH 2/8] adding unit tests (#41876) --- pandas/tests/io/json/test_normalize.py | 36 +++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a428d8c71a793..3a9cbb41e02d0 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -105,6 +105,7 @@ def missing_metadata(): "zip": 44646, } ], + "previous_residences": {"cities": [{"city_name": "Foo York City"}]}, }, { "addresses": [ @@ -115,7 +116,8 @@ def missing_metadata(): "state": "TN", "zip": 37643, } - ] + ], + "previous_residences": {"cities": [{"city_name": "Barmingham"}]}, }, ] @@ -623,6 +625,38 @@ def test_missing_meta(self, missing_metadata): expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) + def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata): + # GH41876 + # Ensure errors='raise' works as intended even when a record_path of length + # greater than one is passed in + msg = "Try running with errors='ignore' as key 'name' is not always present" + with pytest.raises(KeyError, match=msg): + json_normalize( + data=missing_metadata, + record_path=["previous_residences", "cities"], + meta="name", + errors="raise", + ) + + def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata): + # GH41876 + # Ensure errors='ignore' works as intended even when a record_path of length + # greater than one is passed in + result = json_normalize( + data=missing_metadata, + record_path=["previous_residences", "cities"], + meta="name", + errors="ignore", + ) + ex_data = [ + ["Foo York City", "Alice"], + ["Barmingham", np.nan], + ] + columns = ["city_name", "name"] + columns = ["city_name", "name"] + expected = DataFrame(ex_data, columns=columns) + tm.assert_frame_equal(result, expected) + def test_donot_drop_nonevalues(self): # GH21356 data = [ From 1c7c10c032fa4aee94fd2c5fc9974c82700bbf34 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Wed, 23 Jun 2021 19:13:30 -0400 Subject: [PATCH 3/8] fixing typo in unit tests (#41876) --- pandas/tests/io/json/test_normalize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3a9cbb41e02d0..9f893d13875e9 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -620,7 +620,6 @@ def test_missing_meta(self, missing_metadata): [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"], [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan], ] - columns = ["city", "number", "state", "street", "zip", "name"] columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) @@ -653,7 +652,6 @@ def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadat ["Barmingham", np.nan], ] columns = ["city_name", "name"] - columns = ["city_name", "name"] expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) From 4a2197bed72038451866766f38e7bdbdbdb0d3f1 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Tue, 29 Jun 2021 22:54:55 -0400 Subject: [PATCH 4/8] add whatsnew and other minor changes (#41876) --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/io/json/_normalize.py | 54 +++++++++++++------------- pandas/tests/io/json/test_normalize.py | 10 ++++- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9859f12a34621..df248f3bc879d 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -172,7 +172,7 @@ MultiIndex I/O ^^^ -- +- Bug in :func:`json_normalize` where ``errors=Ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Period diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 5b365c37b0c12..dcdf192d7a622 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -380,14 +380,32 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. """ - def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable: + def _pull_field( + js: dict[str, Any], spec: list | str, extract_record: bool = False + ) -> Scalar | Iterable: """Internal function to pull field""" result = js - if isinstance(spec, list): - for field in spec: - result = result[field] - else: - result = result[spec] + try: + if isinstance(spec, list): + for field in spec: + result = result[field] + else: + result = result[spec] + except KeyError as e: + if extract_record: + raise KeyError( + f"Key {e} not found. If specifying a record_path, all elements of " + f"data should have the path." + ) from e + else: + if errors == "ignore": + result = np.nan + else: + raise KeyError( + f"Key {e} not found. To replace missing values of {e} with " + f"np.nan, pass in errors='ignore'" + ) from e + return result def _pull_records(js: dict[str, Any], spec: list | str) -> list: @@ -396,7 +414,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list: _pull_field, but require to return list. And will raise error if has non iterable value. """ - result = _pull_field(js, spec) + result = _pull_field(js, spec, extract_record=True) # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not # null, otherwise return an empty list @@ -469,16 +487,7 @@ def _recursive_extract(data, path, seen_meta, level=0): for obj in data: for val, key in zip(_meta, meta_keys): if level + 1 == len(val): - try: - seen_meta[key] = _pull_field(obj, val[-1]) - except KeyError as e: - if errors == "ignore": - seen_meta[key] = np.nan - else: - raise KeyError( - "Try running with errors='ignore' as key " - f"{e} is not always present" - ) from e + seen_meta[key] = _pull_field(obj, val[-1]) _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: @@ -497,16 +506,7 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 > len(val): meta_val = seen_meta[key] else: - try: - meta_val = _pull_field(obj, val[level:]) - except KeyError as e: - if errors == "ignore": - meta_val = np.nan - else: - raise KeyError( - "Try running with errors='ignore' as key " - f"{e} is not always present" - ) from e + meta_val = _pull_field(obj, val[level:]) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 9f893d13875e9..faf9fc903d7b5 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -600,7 +600,10 @@ def test_json_normalize_errors(self, missing_metadata): # If meta keys are not always present a new option to set # errors='ignore' has been implemented - msg = "Try running with errors='ignore' as key 'name' is not always present" + msg = ( + "Key 'name' not found. To replace missing values of " + "'name' with np.nan, pass in errors='ignore'" + ) with pytest.raises(KeyError, match=msg): json_normalize( data=missing_metadata, @@ -628,7 +631,10 @@ def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata # GH41876 # Ensure errors='raise' works as intended even when a record_path of length # greater than one is passed in - msg = "Try running with errors='ignore' as key 'name' is not always present" + msg = ( + "Key 'name' not found. To replace missing values of " + "'name' with np.nan, pass in errors='ignore'" + ) with pytest.raises(KeyError, match=msg): json_normalize( data=missing_metadata, From 73ed8fb69cdbfe9ffccaa1cb5a0f0cf88316faa0 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Tue, 29 Jun 2021 22:59:17 -0400 Subject: [PATCH 5/8] fix typo in whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index df248f3bc879d..4e290c70448bb 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -172,7 +172,7 @@ MultiIndex I/O ^^^ -- Bug in :func:`json_normalize` where ``errors=Ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) +- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Period From 01796d3285baed5d1bb04510afc653d7575dd1c9 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Sun, 4 Jul 2021 18:18:20 -0400 Subject: [PATCH 6/8] minor fix in json_normalize --- pandas/io/json/_normalize.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index dcdf192d7a622..da86e5a209838 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -397,14 +397,13 @@ def _pull_field( f"Key {e} not found. If specifying a record_path, all elements of " f"data should have the path." ) from e + elif errors == "ignore": + result = np.nan else: - if errors == "ignore": - result = np.nan - else: - raise KeyError( - f"Key {e} not found. To replace missing values of {e} with " - f"np.nan, pass in errors='ignore'" - ) from e + raise KeyError( + f"Key {e} not found. To replace missing values of {e} with " + f"np.nan, pass in errors='ignore'" + ) from e return result From 39862cbff005accaea66d8f2921a2df83091aa67 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Sun, 4 Jul 2021 18:46:57 -0400 Subject: [PATCH 7/8] fix merge conflict in whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4e290c70448bb..fe95158af4908 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -172,6 +172,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - From 5477c0b5d5ed0c3866ad8e4f33155a79f270e1b9 Mon Sep 17 00:00:00 2001 From: Neel Raman Date: Wed, 14 Jul 2021 19:02:41 -0400 Subject: [PATCH 8/8] fix type hint issue --- pandas/io/json/_normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index da86e5a209838..729d60ca78944 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -398,7 +398,7 @@ def _pull_field( f"data should have the path." ) from e elif errors == "ignore": - result = np.nan + return np.nan else: raise KeyError( f"Key {e} not found. To replace missing values of {e} with "