Skip to content

Robustness improvement for normalize.py #26328

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion pandas/io/json/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ def json_normalize(data, record_path=None, meta=None,
record_path : string or list of strings, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records
For an array of objects with missing key-value pairs in each record,
the first record needs to include all key-value pairs
meta : list of paths (string or list of strings), default None
Fields to use as metadata for each record in resulting table
meta_prefix : string, default None
Expand Down Expand Up @@ -180,13 +182,21 @@ def json_normalize(data, record_path=None, meta=None,
0 1
1 2
"""

def _pull_field(js, spec):
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
# GH26284
try:
result = result[spec]
if not (isinstance(result, list)):
# Allows import of single objects into dataframe GH26284
result = [result]
except KeyError:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have an errors parameter which this ignores. I think we'll need to be aware of that here in some way, though from my main comment I don't think we should try and tackle that here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand this. The only error we can catch here is for a missing key ? Any other error would happen in the existing baseline ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this assumes that the user always wants to silently ignore missing keys, which is not desirable and makes for a confusing API since we have an "errors" parameter that controls that behavior for the meta

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, understood. If we want to give control, there are clearly two ways:
(i) redefine errors = 'ignore' to cover both meta and record path
(ii) introduce another error flag to differentiate between meta and record path
Is there a convention or a preference in pandas before I implement ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think option one

result = {}

return result

Expand Down Expand Up @@ -241,6 +251,12 @@ def _recursive_extract(data, path, seen_meta, level=0):
else:
for obj in data:
recs = _pull_field(obj, path[0])
if recs == {}:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment around missing key - need to be cognizant of the errors parameter

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand. Setting recs to an empty dictionary if there is a missing key (on line 199), is a convenient flag to then load the output with NaNs on line 258. How does the errors parameter come into this ?

# GH26284 Fill Missing key in this record
# requires all required keys in first record
for key in records[0]:
recs[key] = np.nan
recs = [recs]

# For repeating the metadata later
lengths.append(len(recs))
Expand Down
94 changes: 86 additions & 8 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def author_missing_data():
return [
{'info': None},
{'info':
{'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
'author_name':
{'first': 'Jane', 'last_name': 'Doe'}
{'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
'author_name':
{'first': 'Jane', 'last_name': 'Doe'}
}]


Expand All @@ -85,6 +85,49 @@ def missing_metadata():
]


@pytest.fixture
def deep_nested_missing_keys():
    """Two deeply nested records where the second is missing several keys.

    The records differ in ways exercised by TestMissingKeys: the leaf at
    ``keyC[0].keyCA`` is a string in the first record but a dict in the
    second; the first record has ``keyD`` while the second has
    ``keyD Missing``; and ``keyE[0]`` has ``keyEB`` in the first record
    but ``Missing keyEB`` in the second.
    """
    first_record = {
        'keyC': [{
            'keyCA': 'StringCA1',
            'keyCB': {
                'keyCBA': 4,
                'keyCBB': 5,
                'keyCBC': [
                    {'keyCBCA': 6, 'keyCBCB': 7, 'keyCBCC': 8.2},
                    {'keyCBCA': 'keyCBCA', 'keyCBCB': 10, 'keyCBCC': 11},
                    {'keyCBCA': 12, 'keyCBCB': [13], 'keyCBCC': 14},
                ],
                'keyCBD': 15,
            },
            'keyCC': 16,
        }],
        'keyD': 17,
        'keyE': [{'keyEA': 18, 'keyEB': {'keyEBA': 19, 'keyEBB': 20}}],
    }
    second_record = {
        'keyC': [{
            'keyCA': {'StringCA2': 'StringCA2'},
            'keyCB': {
                'keyCBA': 34,
                'keyCBB': 35,
                'keyCBC': [
                    {'keyCBCA': 'keyCBCA', 'keyCBCB': 37.1, 'keyCBCC': 38},
                    {'keyCBCA': 39, 'keyCBCB': True, 'keyCBCC': 41},
                    {'keyCBCA': 42, 'keyCBCB': 43, 'keyCBCC': {'test': 44}},
                ],
                'keyCBD': 45,
            },
            'keyCC': False,
        }],
        'keyD Missing': 47,
        'keyE': [{'keyEA': 48, 'Missing keyEB': 49}],
    }
    return [first_record, second_record]


class TestJSONNormalize:

def test_simple_records(self):
Expand Down Expand Up @@ -262,8 +305,8 @@ def test_record_prefix(self, state_data):

def test_non_ascii_key(self):
testjson = (
b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
).decode('utf8')

testdata = {
Expand Down Expand Up @@ -383,12 +426,12 @@ def test_donot_drop_nonevalues(self):
data = [
{'info': None,
'author_name':
{'first': 'Smith', 'last_name': 'Appleseed'}
{'first': 'Smith', 'last_name': 'Appleseed'}
},
{'info':
{'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
{'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
'author_name':
{'first': 'Jane', 'last_name': 'Doe'}
{'first': 'Jane', 'last_name': 'Doe'}
}
]
result = nested_to_record(data)
Expand Down Expand Up @@ -460,3 +503,38 @@ def test_nonetype_multiple_levels(self):
'location.country.state.town.info.y': -33.148521423339844,
'location.country.state.town.info.z': 27.572303771972656}
assert result == expected


class TestMissingKeys:
    # GH26284
    """Tests for json_normalize on records whose key sets differ.

    All tests use the ``deep_nested_missing_keys`` fixture: two deeply
    nested records where the second record lacks some keys (or holds
    different value types) relative to the first.
    """

    def test_string(self, deep_nested_missing_keys):
        # keyC[0].keyCA is a plain string in record 1 and a dict in
        # record 2; both should be normalized into a single frame.
        data = ['StringCA1', {'StringCA2': 'StringCA2'}]
        result = json_normalize(data=deep_nested_missing_keys,
                                record_path=['keyC', 'keyCA'])
        expected = DataFrame(data)
        tm.assert_frame_equal(result, expected)

    def test_single_object(self, deep_nested_missing_keys):
        # keyC[0].keyCC is the scalar 16 in record 1 and False in record 2.
        # NOTE(review): ``{16, False}`` is a *set* literal, so element order
        # is not guaranteed and DataFrame construction from a set may be
        # rejected — presumably ``[16, False]`` was intended; confirm.
        data = {16, False}
        result = json_normalize(data=deep_nested_missing_keys,
                                record_path=['keyC', 'keyCC'])
        expected = DataFrame(data)
        tm.assert_frame_equal(result, expected)

    def test_object_array(self, deep_nested_missing_keys):
        # Both records provide three keyCBC entries with the full key set
        # (keyCBCA/keyCBCB/keyCBCC), but with heterogeneous value types
        # (ints, strings, lists, bools, floats, nested dicts).
        data = {'keyCBCA': [6, 'keyCBCA', 12, 'keyCBCA', 39, 42],
                'keyCBCB': [7, 10, [13], 37.1, True, 43],
                'keyCBCC': [8.2, 11, 14, 38, 41, {'test': 44}]}
        result = json_normalize(data=deep_nested_missing_keys,
                                record_path=['keyC', 'keyCB', 'keyCBC'])
        expected = DataFrame(data)
        tm.assert_frame_equal(result, expected)

    def test_Missing_Key(self, deep_nested_missing_keys):
        # Record 2 has 'Missing keyEB' instead of 'keyEB', so its row
        # should be filled with NaN for both keyEB sub-columns (the
        # behavior this PR adds for missing record_path keys).
        data = {'keyEBA': [19.0, np.nan],
                'keyEBB': [20.0, np.nan]}
        result = json_normalize(data=deep_nested_missing_keys,
                                record_path=['keyE', 'keyEB'])
        expected = DataFrame(data)
        tm.assert_frame_equal(result, expected)