-
-
Notifications
You must be signed in to change notification settings - Fork 19k
Enhanced json normalize #23861
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enhanced json normalize #23861
Changes from 4 commits
cb53be7
0972746
5a5c708
be7ec0e
a79e126
cd12a23
d3b3503
4ec60bc
e001264
5c88339
55f7b1c
1af2bfc
882a2ca
caba6db
4e22c69
c2eff85
247124f
ab15869
26bf967
fca2a27
7a58456
f3d25e3
7a1297d
177c750
cb82bca
2a7b966
4635591
22fd84e
2e407e3
cf27cae
124fbd9
7b65999
03d3d23
8e61a04
b808d5a
0eaea30
837ba18
217d4ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,8 +23,11 @@ def _convert_to_line_delimits(s): | |
return convert_json_to_lines(s) | ||
|
||
|
||
def nested_to_record(ds, prefix="", sep=".", level=0): | ||
"""a simplified json_normalize | ||
def nested_to_record(ds, prefix="", sep=".", level=0, | ||
max_level=None, ignore_keys=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can u type these parameters |
||
""" | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you revert the change to this line? |
||
A simplified json_normalize | ||
|
||
converts a nested dict into a flat dict ("record"), unlike json_normalize, | ||
it does not attempt to extract a subset of the data. | ||
|
@@ -41,6 +44,11 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
|
||
level: the number of levels in the JSON string, optional, default: 0 | ||
|
||
max_level: maximum number of levels to normalize; None means no limit, optional, default: None | ||
bhavaniravi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
ignore_keys: keys whose values are left as-is (not normalized), optional, default: None | ||
|
||
|
||
.. versionadded:: 0.24.0 | ||
|
||
Returns | ||
------- | ||
d - dict or list of dicts, matching `ds` | ||
|
@@ -65,7 +73,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
|
||
new_ds = [] | ||
for d in ds: | ||
|
||
new_d = copy.deepcopy(d) | ||
for k, v in d.items(): | ||
# each key gets renamed with prefix | ||
|
@@ -76,16 +83,21 @@ def nested_to_record(ds, prefix="", sep=".", level=0): | |
else: | ||
newkey = prefix + sep + k | ||
|
||
# only dicts gets recurse-flattend | ||
# only dicts that are below max_level (when it is set) | ||
# and whose key is not in ignore_keys | ||
# get recursively flattened | ||
# only at level>1 do we rename the rest of the keys | ||
if not isinstance(v, dict): | ||
if not isinstance(v, dict) or \ | ||
(max_level is not None and level >= max_level) or \ | ||
(ignore_keys is not None and k in ignore_keys): | ||
bhavaniravi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
if level != 0: # so we skip copying for top level, common case | ||
v = new_d.pop(k) | ||
new_d[newkey] = v | ||
continue | ||
else: | ||
v = new_d.pop(k) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1)) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1, | ||
max_level, ignore_keys)) | ||
new_ds.append(new_d) | ||
|
||
if singleton: | ||
|
@@ -97,7 +109,9 @@ def json_normalize(data, record_path=None, meta=None, | |
meta_prefix=None, | ||
record_prefix=None, | ||
errors='raise', | ||
sep='.'): | ||
sep='.', | ||
max_level=None, | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ignore_keys=None): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
"Normalize" semi-structured JSON data into a flat table | ||
|
||
|
@@ -115,7 +129,6 @@ def json_normalize(data, record_path=None, meta=None, | |
If True, prefix records with dotted (?) path, e.g. foo.bar.field if | ||
path to records is ['foo', 'bar'] | ||
errors : {'raise', 'ignore'}, default 'raise' | ||
|
||
* 'ignore' : will ignore KeyError if keys listed in meta are not | ||
always present | ||
* 'raise' : will raise KeyError if keys listed in meta are not | ||
|
@@ -129,6 +142,11 @@ def json_normalize(data, record_path=None, meta=None, | |
|
||
.. versionadded:: 0.20.0 | ||
|
||
max_level: integer, maximum depth to normalize, default None | ||
ignore_keys: list, keys to ignore, default None | ||
|
||
.. versionadded:: 0.24.0 | ||
bhavaniravi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
|
||
Returns | ||
------- | ||
|
@@ -205,7 +223,9 @@ def _pull_field(js, spec): | |
# | ||
# TODO: handle record value which are lists, at least error | ||
# reasonably | ||
data = nested_to_record(data, sep=sep) | ||
data = nested_to_record(data, sep=sep, | ||
max_level=max_level, | ||
ignore_keys=ignore_keys) | ||
return DataFrame(data) | ||
elif not isinstance(record_path, list): | ||
record_path = [record_path] | ||
|
@@ -238,10 +258,8 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
else: | ||
for obj in data: | ||
recs = _pull_field(obj, path[0]) | ||
|
||
# For repeating the metadata later | ||
lengths.append(len(recs)) | ||
|
||
for val, key in zip(meta, meta_keys): | ||
if level + 1 > len(val): | ||
meta_val = seen_meta[key] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -261,6 +261,23 @@ def test_missing_field(self, author_missing_data): | |
expected = DataFrame(ex_data) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_with_max_level_one(self): | ||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
expected_output = [{ | ||
'CreatedBy.Name': 'User001', | ||
'Lookup.TextField': 'Some text', | ||
'Lookup.UserField': {'Id': 'ID001', 'Name': 'Name001'}, | ||
'Image': {'a': 'b'} | ||
}] | ||
expected = DataFrame(expected_output) | ||
result = json_normalize(data, max_level=1, ignore_keys=["Image"]) | ||
tm.assert_frame_equal(result, expected) | ||
bhavaniravi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
|
||
class TestNestedToRecord(object): | ||
|
||
|
@@ -440,3 +457,56 @@ def test_nonetype_multiple_levels(self): | |
'location.country.state.town.info.y': -33.148521423339844, | ||
'location.country.state.town.info.z': 27.572303771972656} | ||
assert result == expected | ||
|
||
def test_with_max_level_none(self): | ||
data = [{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need the issue number as a comment |
||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
expected_output = [{ | ||
'CreatedBy.Name': 'User001', | ||
'Lookup.TextField': 'Some text', | ||
'Lookup.UserField.Id': 'ID001', | ||
'Lookup.UserField.Name': 'Name001', | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, ignore_keys=["Image"]) | ||
assert output == expected_output | ||
|
||
def test_with_max_level_zero(self): | ||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, max_level=0, ignore_keys=["Image"]) | ||
assert output == data | ||
|
||
def test_with_max_level_one(self): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
expected_output = [{ | ||
'CreatedBy.Name': 'User001', | ||
'Lookup.TextField': 'Some text', | ||
'Lookup.UserField': {'Id': 'ID001', 'Name': 'Name001'}, | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, max_level=1, ignore_keys=["Image"]) | ||
assert output == expected_output | ||
|
||
def test_with_all_keys_to_ignore(self): | ||
data = [{ | ||
'CreatedBy': {'Name': 'User001'}, | ||
'Lookup': {'TextField': 'Some text', | ||
'UserField': {'Id': 'ID001', 'Name': 'Name001'}}, | ||
'Image': {'a': 'b'} | ||
}] | ||
output = nested_to_record(data, ignore_keys=list(data[0].keys())) | ||
assert output == data |
Uh oh!
There was an error while loading. Please reload this page.