Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
cb53be7
ENH add max_level and ignore_keys configuration to nested_to_records
bhavaniravi Nov 22, 2018
0972746
ENH extend max_level and ignore keys to
bhavaniravi Nov 22, 2018
5a5c708
fix pep8 issues
bhavaniravi Nov 22, 2018
be7ec0e
add whatsnew to doc string
bhavaniravi Nov 22, 2018
a79e126
add testcase with large max_level
bhavaniravi Nov 23, 2018
cd12a23
add explation for flatten if condition
bhavaniravi Nov 23, 2018
d3b3503
update doc_string and built documentation
bhavaniravi Nov 23, 2018
4ec60bc
fix json normalize records path issue
bhavaniravi Nov 27, 2018
e001264
Merge branch 'master' into enhanced_json_normalize
bhavaniravi Nov 27, 2018
5c88339
Merge branch 'master' of git://github.com/pandas-dev/pandas into json…
bhavaniravi Dec 30, 2018
55f7b1c
fix merge conflict
bhavaniravi Jan 3, 2019
1af2bfc
fix testcase error
bhavaniravi Jan 3, 2019
882a2ca
add nested flattening to json_normalize
bhavaniravi Jan 3, 2019
caba6db
fixed pep8 issues
bhavaniravi Jan 3, 2019
4e22c69
fix merge conflict
bhavaniravi Jan 3, 2019
c2eff85
fix issues with doc string
bhavaniravi Jan 4, 2019
247124f
modify test case to paramaetized
bhavaniravi Jan 4, 2019
ab15869
fix issues with pep8
bhavaniravi Jan 10, 2019
26bf967
fix pep8 build fail
bhavaniravi Jan 16, 2019
fca2a27
fix testcase failure, inconsistent column order
bhavaniravi Feb 5, 2019
7a58456
fix documentation issues
bhavaniravi Mar 19, 2019
f3d25e3
fix merge conflicts with upstream
bhavaniravi Mar 19, 2019
7a1297d
Merge branch 'master' of git://github.com/pandas-dev/pandas into enha…
bhavaniravi Apr 20, 2019
177c750
fix testcase failure np.nan converted into str on line 328
bhavaniravi Apr 20, 2019
cb82bca
remove get_pip file
bhavaniravi Apr 20, 2019
2a7b966
rename test func test_max_level_with_record_prefix
bhavaniravi Apr 20, 2019
4635591
fix pep8 over-intended line
bhavaniravi Apr 21, 2019
22fd84e
fix docstring formatting issues
bhavaniravi Apr 21, 2019
2e407e3
convert to a fixture
bhavaniravi Apr 21, 2019
cf27cae
convert to inline data
bhavaniravi Apr 21, 2019
124fbd9
fix docstring formatting issues
bhavaniravi Apr 21, 2019
7b65999
fix docstring formatting issues
bhavaniravi Apr 21, 2019
03d3d23
add github issue id to test case
bhavaniravi Apr 22, 2019
8e61a04
fix pep8 flake issues
bhavaniravi Apr 22, 2019
b808d5a
Merge branch 'master' of git://github.com/pandas-dev/pandas into enha…
bhavaniravi Apr 22, 2019
0eaea30
Merge branch 'master' of git://github.com/pandas-dev/pandas into enha…
bhavaniravi Apr 23, 2019
837ba18
Merge branch 'master' of git://github.com/pandas-dev/pandas into enha…
bhavaniravi Apr 26, 2019
217d4ae
Merge branch 'master' of git://github.com/pandas-dev/pandas into enha…
bhavaniravi Apr 30, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 29 additions & 11 deletions pandas/io/json/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ def _convert_to_line_delimits(s):
return convert_json_to_lines(s)


def nested_to_record(ds, prefix="", sep=".", level=0):
"""a simplified json_normalize
def nested_to_record(ds, prefix="", sep=".", level=0,
max_level=None, ignore_keys=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can u type these parameters

"""

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you revert the change to this line?

A simplified json_normalize

converts a nested dict into a flat dict ("record"), unlike json_normalize,
it does not attempt to extract a subset of the data.
Expand All @@ -41,6 +44,11 @@ def nested_to_record(ds, prefix="", sep=".", level=0):

level: the number of levels in the JSON string, optional, default: 0

max_level: maximum number of nesting levels to normalize; None means no limit, optional, default: None
ignore_keys: keys whose values are left as-is instead of being normalized, optional, default: None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the point of this parameter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid specific keys from getting normalized.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the difference between this and record_path in json_normalize then?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

record_path defines the "path to the data to be normalized", whereas max_level treats the record_path level as 0 and normalizes down to max_level. Any key listed in ignore_keys is left un-normalized.


.. versionadded:: 0.24.0

Returns
-------
d - dict or list of dicts, matching `ds`
Expand All @@ -65,7 +73,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0):

new_ds = []
for d in ds:

new_d = copy.deepcopy(d)
for k, v in d.items():
# each key gets renamed with prefix
Expand All @@ -76,16 +83,21 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
else:
newkey = prefix + sep + k

# only dicts gets recurse-flattend
# a value is recurse-flattened only if it is a dict,
# the current level is below max_level (when max_level is given),
# and its key is not in ignore_keys
# only at level>1 do we rename the rest of the keys
if not isinstance(v, dict):
if not isinstance(v, dict) or \
(max_level is not None and level >= max_level) or \
(ignore_keys is not None and k in ignore_keys):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
else:
v = new_d.pop(k)
new_d.update(nested_to_record(v, newkey, sep, level + 1))
new_d.update(nested_to_record(v, newkey, sep, level + 1,
max_level, ignore_keys))
new_ds.append(new_d)

if singleton:
Expand All @@ -97,7 +109,9 @@ def json_normalize(data, record_path=None, meta=None,
meta_prefix=None,
record_prefix=None,
errors='raise',
sep='.'):
sep='.',
max_level=None,
ignore_keys=None):
"""
"Normalize" semi-structured JSON data into a flat table

Expand All @@ -115,7 +129,6 @@ def json_normalize(data, record_path=None, meta=None,
If True, prefix records with dotted (?) path, e.g. foo.bar.field if
path to records is ['foo', 'bar']
errors : {'raise', 'ignore'}, default 'raise'

* 'ignore' : will ignore KeyError if keys listed in meta are not
always present
* 'raise' : will raise KeyError if keys listed in meta are not
Expand All @@ -129,6 +142,11 @@ def json_normalize(data, record_path=None, meta=None,

.. versionadded:: 0.20.0

max_level: integer, max depth to normalize, default None
ignore_keys: list, keys to ignore, default None

.. versionadded:: 0.24.0


Returns
-------
Expand Down Expand Up @@ -205,7 +223,9 @@ def _pull_field(js, spec):
#
# TODO: handle record value which are lists, at least error
# reasonably
data = nested_to_record(data, sep=sep)
data = nested_to_record(data, sep=sep,
max_level=max_level,
ignore_keys=ignore_keys)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]
Expand Down Expand Up @@ -238,10 +258,8 @@ def _recursive_extract(data, path, seen_meta, level=0):
else:
for obj in data:
recs = _pull_field(obj, path[0])

# For repeating the metadata later
lengths.append(len(recs))

for val, key in zip(meta, meta_keys):
if level + 1 > len(val):
meta_val = seen_meta[key]
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,23 @@ def test_missing_field(self, author_missing_data):
expected = DataFrame(ex_data)
tm.assert_frame_equal(result, expected)

def test_with_max_level_one(self):
    """json_normalize with max_level=1 flattens a single nesting level.

    The deeper ``Lookup.UserField`` dict and the ignored ``Image`` key
    are expected to stay as dict values in the resulting frame.
    """
    records = [{
        'CreatedBy': {'Name': 'User001'},
        'Lookup': {
            'TextField': 'Some text',
            'UserField': {'Id': 'ID001', 'Name': 'Name001'},
        },
        'Image': {'a': 'b'},
    }]
    flattened_rows = [{
        'CreatedBy.Name': 'User001',
        'Lookup.TextField': 'Some text',
        'Lookup.UserField': {'Id': 'ID001', 'Name': 'Name001'},
        'Image': {'a': 'b'},
    }]
    expected = DataFrame(flattened_rows)
    result = json_normalize(records, max_level=1, ignore_keys=["Image"])
    tm.assert_frame_equal(result, expected)


class TestNestedToRecord(object):

Expand Down Expand Up @@ -440,3 +457,56 @@ def test_nonetype_multiple_levels(self):
'location.country.state.town.info.y': -33.148521423339844,
'location.country.state.town.info.z': 27.572303771972656}
assert result == expected

def test_with_max_level_none(self):
data = [{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need the issue number as a comment

'CreatedBy': {'Name': 'User001'},
'Lookup': {'TextField': 'Some text',
'UserField': {'Id': 'ID001', 'Name': 'Name001'}},
'Image': {'a': 'b'}
}]
expected_output = [{
'CreatedBy.Name': 'User001',
'Lookup.TextField': 'Some text',
'Lookup.UserField.Id': 'ID001',
'Lookup.UserField.Name': 'Name001',
'Image': {'a': 'b'}
}]
output = nested_to_record(data, ignore_keys=["Image"])
assert output == expected_output

def test_with_max_level_zero(self):
    """With max_level=0 no key is flattened; output equals the input."""
    lookup = {
        'TextField': 'Some text',
        'UserField': {'Id': 'ID001', 'Name': 'Name001'},
    }
    records = [{
        'CreatedBy': {'Name': 'User001'},
        'Lookup': lookup,
        'Image': {'a': 'b'},
    }]
    flattened = nested_to_record(
        records, max_level=0, ignore_keys=["Image"])
    assert flattened == records

def test_with_max_level_one(self):
    """With max_level=1 only the first nesting level is flattened.

    ``Lookup.UserField`` stays a dict, and so does ``Image`` because it
    is listed in ``ignore_keys``.
    """
    records = [{
        'CreatedBy': {'Name': 'User001'},
        'Lookup': {
            'TextField': 'Some text',
            'UserField': {'Id': 'ID001', 'Name': 'Name001'},
        },
        'Image': {'a': 'b'},
    }]
    flattened_rows = [{
        'CreatedBy.Name': 'User001',
        'Lookup.TextField': 'Some text',
        'Lookup.UserField': {'Id': 'ID001', 'Name': 'Name001'},
        'Image': {'a': 'b'},
    }]
    actual = nested_to_record(
        records, max_level=1, ignore_keys=["Image"])
    assert actual == flattened_rows

def test_with_all_keys_to_ignore(self):
    """Ignoring every top-level key leaves the records unflattened."""
    records = [{
        'CreatedBy': {'Name': 'User001'},
        'Lookup': {
            'TextField': 'Some text',
            'UserField': {'Id': 'ID001', 'Name': 'Name001'},
        },
        'Image': {'a': 'b'},
    }]
    # Iterating a dict yields its keys, so this lists all top-level keys.
    every_key = list(records[0])
    flattened = nested_to_record(records, ignore_keys=every_key)
    assert flattened == records