-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
Enhancement include or exclude keys in json normalize #27262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4c7da78
502210e
c3268fd
1dc0c0c
0fdad82
cbdaca0
3458717
4d0082b
80c1011
8215729
49d8334
7f14091
e601693
b348038
e13aece
ddea67a
7e8da34
b78797c
73c1d8d
7bb81d7
d02d70d
a28de10
e075305
47f9f24
0f4a4bc
b64411b
59b139b
fe245e5
1efeb50
ab125c3
9ce4212
2733bfd
a871818
1ff357a
ef72c0a
7ae7065
57959d3
fcacc1b
30d80a5
cb5fa8b
303460b
f4bc66d
8060574
6fb13c3
4ba74ff
d35aeaa
d0a077f
7c94e51
bc7ac76
a15c463
b0bfa75
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,14 @@ | |
|
||
from collections import defaultdict | ||
import copy | ||
from typing import DefaultDict, Dict, List, Optional, Union | ||
from typing import Callable, DefaultDict, Dict, Generator, List, Optional, Union | ||
|
||
import numpy as np | ||
|
||
from pandas._libs.writers import convert_json_to_lines | ||
|
||
from pandas import DataFrame | ||
from pandas.api.types import is_list_like | ||
|
||
|
||
def convert_to_line_delimits(s): | ||
|
@@ -26,12 +27,44 @@ def convert_to_line_delimits(s): | |
return convert_json_to_lines(s) | ||
|
||
|
||
def _parse_use_keys(use_keys: Optional[Union[str, List, Callable]]) -> Callable: | ||
""" | ||
Converts different types of use_keys into a callable. | ||
|
||
Parameters | ||
---------- | ||
use_keys : str, list or callable | ||
Returns true or false depending on whether to include or exclude a key. | ||
|
||
.. versionadded:: 1.0.0 | ||
|
||
Returns | ||
------- | ||
callable | ||
It Decides on whether to include a key in processing. | ||
""" | ||
if callable(use_keys): | ||
return use_keys | ||
|
||
if is_list_like(use_keys): | ||
bhavaniravi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return lambda x: x in use_keys # type: ignore | ||
|
||
if isinstance(use_keys, str): | ||
return lambda x: x == use_keys | ||
|
||
if use_keys is None: | ||
return lambda x: True | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the else here? None? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup |
||
raise TypeError("`use_keys` must be a str, list or a callable") | ||
|
||
|
||
def nested_to_record( | ||
ds, | ||
prefix: str = "", | ||
sep: str = ".", | ||
level: int = 0, | ||
max_level: Optional[int] = None, | ||
use_keys: Optional[Union[str, List[str], Callable]] = None, | ||
): | ||
""" | ||
A simplified json_normalize | ||
|
@@ -49,14 +82,19 @@ def nested_to_record( | |
|
||
.. versionadded:: 0.20.0 | ||
|
||
level: int, optional, default: 0 | ||
level : int, optional, default: 0 | ||
The number of levels in the json string. | ||
|
||
max_level: int, optional, default: None | ||
max_level : int, optional, default: None | ||
The max depth to normalize. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
use_keys : str, list or callable, optional | ||
Criteria for inclusion of a particular JSON object (matches on key). | ||
|
||
.. versionadded:: 1.0.0 | ||
|
||
Returns | ||
------- | ||
d - dict or list of dicts, matching `ds` | ||
|
@@ -74,10 +112,39 @@ def nested_to_record( | |
'nested.e.c': 1, | ||
'nested.e.d': 2} | ||
""" | ||
|
||
def is_key_match(key, use_keys):
    """
    Decide whether ``key`` satisfies the ``use_keys`` criterion.

    Parameters
    ----------
    key : object
        The key (or key path) being tested.
    use_keys : str, list-like, callable or None
        * callable : called with ``key``; its result is returned.
        * None : every key matches.
        * str : matches when ``key`` equals the string.
        * list-like : matches when ``key`` is contained in it.

    Returns
    -------
    bool
        True when the key matches. For a callable ``use_keys`` the
        callable's own return value is passed through unchanged.

    Raises
    ------
    TypeError
        If ``use_keys`` is none of the supported types.
    """
    if callable(use_keys):
        return use_keys(key)

    if use_keys is None:
        # No criterion: every key matches. Return the boolean itself, not a
        # function object, so the result type agrees with the other branches.
        return True

    if isinstance(use_keys, str):
        # Treat a single string as a one-element collection so the
        # membership test below compares for equality rather than substring.
        use_keys = [use_keys]

    if is_list_like(use_keys):
        return key in use_keys

    raise TypeError("`use_keys` must be a str, list or a callable")
|
||
def flatten_deeper(
    _dict: dict, level: int, prev_keys: Optional[List] = None
) -> Generator:
    """
    Yield ``(key_path, value)`` pairs for the leaves of a nested dict.

    Parameters
    ----------
    _dict : dict
        Mapping to walk.
    level : int
        Depth of ``_dict``; compared against ``max_level``, which is not a
        parameter here and is read from the enclosing scope.
    prev_keys : list, optional
        Keys already traversed to reach ``_dict``; a falsy value means an
        empty path.

    Yields
    ------
    tuple of (list, object)
        The list of keys leading to a value, and the value itself. A dict
        value is yielded as-is when the depth limit stops the descent.
    """
    path_so_far = prev_keys or []
    for name, value in _dict.items():
        key_path = path_so_far + [name]
        # Only dicts are descended into, and only while below the depth limit.
        descend = isinstance(value, dict) and (
            max_level is None or level < max_level
        )
        if not descend:
            yield key_path, value
            continue
        yield from flatten_deeper(_dict=value, prev_keys=key_path, level=level + 1)
|
||
singleton = False | ||
if isinstance(ds, dict): | ||
ds = [ds] | ||
singleton = True | ||
|
||
new_ds = [] | ||
for d in ds: | ||
new_d = copy.deepcopy(d) | ||
|
@@ -90,20 +157,29 @@ def nested_to_record( | |
else: | ||
newkey = prefix + sep + k | ||
|
||
# flatten if type is dict and | ||
# current dict level < maximum level provided and | ||
# only dicts gets recurse-flattened | ||
# only at level>1 do we rename the rest of the keys | ||
if not isinstance(v, dict) or ( | ||
max_level is not None and level >= max_level | ||
# flatten if | ||
# current dict level <= maximum level provided and | ||
# current keypath matches the config in use_keys | ||
# only dicts gets recurse-flatten | ||
if ( | ||
is_key_match(newkey, use_keys) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this validation is necessary for the change; can you remove it? |
||
and (max_level is None or level < max_level) | ||
and isinstance(v, dict) | ||
): | ||
if level != 0: # so we skip copying for top level, common case | ||
v = new_d.pop(k) | ||
new_d[newkey] = v | ||
continue | ||
# pop the current key | ||
new_d.pop(k) | ||
# Flatten the value and update it at the current level | ||
for inner_keys, val in flatten_deeper(v, level=level + 1): | ||
new_d[sep.join([k, *inner_keys])] = val | ||
|
||
else: | ||
v = new_d.pop(k) | ||
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level)) | ||
if isinstance(v, dict) and (max_level is None or level < max_level): | ||
new_d[k] = nested_to_record( | ||
v, newkey, sep, level + 1, max_level, use_keys | ||
) | ||
else: | ||
new_d[k] = v | ||
|
||
new_ds.append(new_d) | ||
|
||
if singleton: | ||
|
@@ -120,6 +196,7 @@ def json_normalize( | |
errors: Optional[str] = "raise", | ||
sep: str = ".", | ||
max_level: Optional[int] = None, | ||
use_keys: Optional[Union[Callable, str, List[str]]] = None, | ||
): | ||
""" | ||
Normalize semi-structured JSON data into a flat table. | ||
|
@@ -161,6 +238,11 @@ def json_normalize( | |
|
||
.. versionadded:: 0.25.0 | ||
|
||
use_keys : str or a list of str, callable, optional, default None | ||
Includes or excludes a given key based on a given condition. | ||
|
||
.. versionadded:: 1.0.0 | ||
|
||
Returns | ||
------- | ||
frame : DataFrame | ||
|
@@ -179,6 +261,8 @@ def json_normalize( | |
1 NaN NaN Regner NaN Mose NaN | ||
2 2.0 Faye Raker NaN NaN NaN NaN | ||
|
||
Normalizes list of dict into a flattened data frame. | ||
|
||
>>> data = [{'id': 1, | ||
... 'name': "Cole Volk", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
|
@@ -187,12 +271,12 @@ def json_normalize( | |
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, max_level=0) | ||
fitness id name | ||
0 {'height': 130, 'weight': 60} 1.0 Cole Volk | ||
1 {'height': 130, 'weight': 60} NaN Mose Reg | ||
2 {'height': 130, 'weight': 60} 2.0 Faye Raker | ||
id name fitness | ||
0 1.0 Cole Volk {'height': 130, 'weight': 60} | ||
1 NaN Mose Reg {'height': 130, 'weight': 60} | ||
2 2.0 Faye Raker {'height': 130, 'weight': 60} | ||
|
||
Normalizes nested data upto level 1. | ||
Normalizes nested data up to level 0. | ||
|
||
>>> data = [{'id': 1, | ||
... 'name': "Cole Volk", | ||
|
@@ -202,10 +286,42 @@ def json_normalize( | |
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, max_level=1) | ||
fitness.height fitness.weight id name | ||
0 130 60 1.0 Cole Volk | ||
1 130 60 NaN Mose Reg | ||
2 130 60 2.0 Faye Raker | ||
id name fitness.height fitness.weight | ||
0 1.0 Cole Volk 130 60 | ||
1 NaN Mose Reg 130 60 | ||
2 2.0 Faye Raker 130 60 | ||
|
||
Normalizes nested data up to level 1. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can you add an example with a str / list-of-str? are these useful? better to just make this a callable only? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The initial proposal was to provide a list of keys to ignore. After a discussion with @WillAyd he suggested that str and list of str be consistent with the inclusion in other modules, which made sense. With callable only I'm not sure if we can achieve the multi-level support we are talking about in [link missing]. Let me know your thoughts |
||
|
||
>>> data = [{'id': 1, | ||
... 'name': "Cole Volk", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'name': "Mose Reg", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, use_keys=lambda key: key not in ["fitness"]) | ||
id name fitness | ||
0 1.0 Cole Volk {'height': 130, 'weight': 60} | ||
1 NaN Mose Reg {'height': 130, 'weight': 60} | ||
2 2.0 Faye Raker {'height': 130, 'weight': 60} | ||
|
||
Ignores specific keys from being flattened | ||
|
||
>>> data = [{'id': 1, | ||
... 'name': "Cole Volk", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'name': "Mose Reg", | ||
... 'fitness': {'height': 130, 'weight': 60}}, | ||
... {'id': 2, 'name': 'Faye Raker', | ||
... 'fitness': {'height': 130, 'weight': 60}}] | ||
>>> json_normalize(data, use_keys="fitness") | ||
id name fitness.height fitness.weight | ||
0 1.0 Cole Volk 130 60 | ||
1 NaN Mose Reg 130 60 | ||
2 2.0 Faye Raker 130 60 | ||
|
||
Flattens specific set of selected keys | ||
|
||
>>> data = [{'state': 'Florida', | ||
... 'shortname': 'FL', | ||
|
@@ -254,6 +370,8 @@ def _pull_field(js, spec): | |
if isinstance(data, dict): | ||
data = [data] | ||
|
||
use_key = _parse_use_keys(use_keys) | ||
|
||
if record_path is None: | ||
if any([isinstance(x, dict) for x in y.values()] for y in data): | ||
# naive normalization, this is idempotent for flat records | ||
|
@@ -263,7 +381,9 @@ def _pull_field(js, spec): | |
# | ||
# TODO: handle record value which are lists, at least error | ||
# reasonably | ||
data = nested_to_record(data, sep=sep, max_level=max_level) | ||
data = nested_to_record( | ||
ds=data, sep=sep, max_level=max_level, use_keys=use_key | ||
) | ||
return DataFrame(data) | ||
elif not isinstance(record_path, list): | ||
record_path = [record_path] | ||
|
@@ -296,7 +416,9 @@ def _recursive_extract(data, path, seen_meta, level=0): | |
for obj in data: | ||
recs = _pull_field(obj, path[0]) | ||
recs = [ | ||
nested_to_record(r, sep=sep, max_level=max_level) | ||
nested_to_record( | ||
ds=r, sep=sep, max_level=max_level, use_keys=use_key | ||
) | ||
if isinstance(r, dict) | ||
else r | ||
for r in recs | ||
|
Uh oh!
There was an error while loading. Please reload this page.