diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index fa4e35b08bf6e..46dc7fc3c37c8 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -111,6 +111,8 @@ def json_normalize(data, record_path=None, meta=None, record_path : string or list of strings, default None Path in each object to list of records. If not passed, data will be assumed to be an array of records + For an array of objects with missing key-value pairs in each record, + the first record needs to include all key-value pairs. meta : list of paths (string or list of strings), default None Fields to use as metadata for each record in resulting table meta_prefix : string, default None @@ -180,13 +182,23 @@ def json_normalize(data, record_path=None, meta=None, 0 1 1 2 """ + def _pull_field(js, spec): result = js if isinstance(spec, list): for field in spec: result = result[field] else: - result = result[spec] + # GH26284 + try: + result = result[spec] + except KeyError: + # Missing key: empty dict marks the record for NaN filling + result = {} + else: + if not isinstance(result, list): + # Allows import of single objects into dataframe GH26284 + result = [result] return result @@ -241,6 +253,12 @@ def _recursive_extract(data, path, seen_meta, level=0): else: for obj in data: recs = _pull_field(obj, path[0]) + if recs == {}: + # GH26284 Fill missing key in this record with NaN; + # requires all required keys in first record + if not records or not isinstance(records[0], dict): + raise KeyError(path[0]) + recs = [dict.fromkeys(records[0], np.nan)] # For repeating the metadata later lengths.append(len(recs)) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a7407d843c6c9..ca8cc6b0fc9cc 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -60,9 +60,9 @@ def author_missing_data(): return [ {'info': None}, {'info': - {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}, - 'author_name': - {'first': 'Jane', 'last_name': 'Doe'} + {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}, + 'author_name': + 
{'first': 'Jane', 'last_name': 'Doe'} }] @@ -85,6 +85,49 @@ def missing_metadata(): ] +@pytest.fixture +def deep_nested_missing_keys():  # GH26284: 2nd record lacks 'keyD'/'keyEB' + return [{ + 'keyC': [{ + 'keyCA': 'StringCA1', + 'keyCB': { + 'keyCBA': 4, + 'keyCBB': 5, + 'keyCBC': [{'keyCBCA': 6, 'keyCBCB': 7, 'keyCBCC': 8.2}, + {'keyCBCA': 'keyCBCA', 'keyCBCB': 10, + 'keyCBCC': 11}, + {'keyCBCA': 12, 'keyCBCB': [13], 'keyCBCC': 14}], + 'keyCBD': 15 + }, + 'keyCC': 16 + }], + 'keyD': 17, + 'keyE': [{ + 'keyEA': 18, + 'keyEB': {'keyEBA': 19, 'keyEBB': 20} + }] + }, { + 'keyC': [{ + 'keyCA': {'StringCA2': 'StringCA2'}, + 'keyCB': { + 'keyCBA': 34, + 'keyCBB': 35, + 'keyCBC': [ + {'keyCBCA': 'keyCBCA', 'keyCBCB': 37.1, 'keyCBCC': 38}, + {'keyCBCA': 39, 'keyCBCB': True, 'keyCBCC': 41}, + {'keyCBCA': 42, 'keyCBCB': 43, 'keyCBCC': {'test': 44}}], + 'keyCBD': 45 + }, + 'keyCC': False + }], + 'keyD Missing': 47,  # stands in for the absent 'keyD' + 'keyE': [{ + 'keyEA': 48, + 'Missing keyEB': 49  # stands in for the absent 'keyEB' + }] + }] + + class TestJSONNormalize: def test_simple_records(self): @@ -262,8 +305,8 @@ def test_record_prefix(self, state_data): def test_non_ascii_key(self): testjson = ( - b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + - b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]' + b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + + b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]' ).decode('utf8') testdata = { @@ -383,12 +426,12 @@ def test_donot_drop_nonevalues(self): data = [ {'info': None, 'author_name': - {'first': 'Smith', 'last_name': 'Appleseed'} + {'first': 'Smith', 'last_name': 'Appleseed'} }, {'info': - {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}, + {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}, 'author_name': - {'first': 'Jane', 'last_name': 'Doe'} + {'first': 'Jane', 'last_name': 'Doe'} } ] result = nested_to_record(data) @@ -460,3 +503,38 @@ def test_nonetype_multiple_levels(self): 'location.country.state.town.info.y': -33.148521423339844, 'location.country.state.town.info.z': 27.572303771972656} 
assert result == expected + + +class TestMissingKeys: + # GH26284: record_path may hit scalars, single objects or missing keys + + def test_string(self, deep_nested_missing_keys): + data = ['StringCA1', {'StringCA2': 'StringCA2'}] + result = json_normalize(data=deep_nested_missing_keys, + record_path=['keyC', 'keyCA']) + expected = DataFrame(data) + tm.assert_frame_equal(result, expected) + + def test_single_object(self, deep_nested_missing_keys): + data = [16, False]  # list keeps row order (a set does not) + result = json_normalize(data=deep_nested_missing_keys, + record_path=['keyC', 'keyCC']) + expected = DataFrame(data) + tm.assert_frame_equal(result, expected) + + def test_object_array(self, deep_nested_missing_keys): + data = {'keyCBCA': [6, 'keyCBCA', 12, 'keyCBCA', 39, 42], + 'keyCBCB': [7, 10, [13], 37.1, True, 43], + 'keyCBCC': [8.2, 11, 14, 38, 41, {'test': 44}]} + result = json_normalize(data=deep_nested_missing_keys, + record_path=['keyC', 'keyCB', 'keyCBC']) + expected = DataFrame(data) + tm.assert_frame_equal(result, expected) + + def test_missing_key(self, deep_nested_missing_keys): + data = {'keyEBA': [19.0, np.nan], + 'keyEBB': [20.0, np.nan]}  # second record has no 'keyEB' + result = json_normalize(data=deep_nested_missing_keys, + record_path=['keyE', 'keyEB']) + expected = DataFrame(data) + tm.assert_frame_equal(result, expected)