From b25faf7044364fe095b179d03e84d3c39ae37e9d Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 27 Jun 2019 11:31:36 +0800 Subject: [PATCH 1/3] add optional fill_value for nan in json_normalize --- pandas/_libs/lib.pyx | 11 ++++++++--- pandas/core/frame.py | 5 +++-- pandas/core/internals/construction.py | 11 +++++++---- pandas/io/json/normalize.py | 17 ++++++++++++++--- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c09fb96eb9182..5be9da983f449 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -308,17 +308,22 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): @cython.wraparound(False) @cython.boundscheck(False) -def dicts_to_array(dicts: list, columns: list): +def dicts_to_array(dicts: list, columns: list, fill_value=None): cdef: Py_ssize_t i, j, k, n ndarray[object, ndim=2] result dict row - object col, onan = np.nan + object col + list onan k = len(columns) n = len(dicts) result = np.empty((n, k), dtype='O') + if fill_value: + onan = [fill_value[col] if col in fill_value else np.nan for col in columns] + else: + onan = list(np.full(k, np.nan)) for i in range(n): row = dicts[i] @@ -327,7 +332,7 @@ def dicts_to_array(dicts: list, columns: list): if col in row: result[i, j] = row[col] else: - result[i, j] = onan + result[i, j] = onan[j] return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fd2e1e3e41ced..db0f23dff994f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -375,7 +375,7 @@ def _constructor_expanddim(self): # Constructors def __init__(self, data=None, index=None, columns=None, dtype=None, - copy=False): + copy=False, fill_value=None): if data is None: data = {} if dtype is not None: @@ -431,7 +431,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields - arrays, columns = to_arrays(data, columns, dtype=dtype) + arrays, columns = to_arrays(data, columns, dtype=dtype, + fill_value=fill_value) columns = ensure_index(columns) # set the index diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d766d7f06d34a..fb22302c25014 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -371,7 +371,7 @@ def _get_axes(N, K, index, columns): # --------------------------------------------------------------------- # Conversion of Inputs to Arrays -def to_arrays(data, columns, coerce_float=False, dtype=None): +def to_arrays(data, columns, coerce_float=False, dtype=None, fill_value=None): """ Return list of arrays, columns. """ @@ -396,7 +396,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): dtype=dtype) elif isinstance(data[0], abc.Mapping): return _list_of_dict_to_arrays(data, columns, - coerce_float=coerce_float, dtype=dtype) + coerce_float=coerce_float, dtype=dtype, + fill_value=fill_value) elif isinstance(data[0], ABCSeries): return _list_of_series_to_arrays(data, columns, coerce_float=coerce_float, @@ -463,7 +464,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): return values.T, columns -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None, + fill_value=None): if columns is None: gen = (list(x.keys()) for x in data) sort = not any(isinstance(d, OrderedDict) for d in data) @@ -473,7 +475,8 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): # classes data = [(type(d) is dict) and d or dict(d) for d in data] - content = list(lib.dicts_to_array(data, list(columns)).T) + content = list(lib.dicts_to_array(data, list(columns), + fill_value=fill_value).T) return _convert_object_array(content, columns, dtype=dtype, coerce_float=coerce_float) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 2d8bc20b1195e..b3c8933d450a7 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -96,7 +96,7 @@ def nested_to_record(ds, prefix="", sep=".", level=0): return new_ds -def json_normalize(data, record_path=None, meta=None, +def json_normalize(data, fill_value=None, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', @@ -108,6 +108,8 @@ def json_normalize(data, record_path=None, meta=None, ---------- data : dict or list of dicts Unserialized JSON objects + fill_value: dict, default None + default na values for specified columns record_path : string or list of strings, default None Path in each object to list of records. If not passed, data will be assumed to be an array of records @@ -149,6 +151,12 @@ def json_normalize(data, record_path=None, meta=None, 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN + >>> json_normalize(data, fill_value={'id' : -1}) + id name name.family name.first name.given name.last + 0 1 NaN NaN Coleen NaN Volk + 1 -1 NaN Regner NaN Mose NaN + 2 2 Faye Raker NaN NaN NaN NaN + >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { @@ -197,6 +205,9 @@ def _pull_field(js, spec): if isinstance(data, dict): data = [data] + if fill_value and not isinstance(fill_value, dict): + raise ValueError('Invalid fill_value, fill_value only accepts a dict') + if record_path is None: if any([isinstance(x, dict) for x in y.values()] for y in data): # naive normalization, this is idempotent for flat records @@ -207,7 +218,7 @@ def _pull_field(js, spec): # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep) - return DataFrame(data) + return DataFrame(data, fill_value=fill_value) elif not isinstance(record_path, list): record_path = [record_path] @@ -265,7 +276,7 @@ def _recursive_extract(data, path, seen_meta, level=0): _recursive_extract(data, record_path, {}, level=0) - result = DataFrame(records) + result = DataFrame(records, fill_value=fill_value) if record_prefix is not None: result = result.rename( From e53b620dfca0b4b9f1a7b2cdfd45c175136c8121 Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 27 Jun 2019 13:22:39 +0800 Subject: [PATCH 2/3] add test cases --- pandas/io/json/normalize.py | 7 +-- pandas/tests/io/json/test_normalize.py | 63 ++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index b3c8933d450a7..ec8a125fe00e1 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -96,11 +96,12 @@ def nested_to_record(ds, prefix="", sep=".", level=0): return new_ds -def json_normalize(data, fill_value=None, record_path=None, meta=None, +def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', - sep='.'): + sep='.', + fill_value=None): """ Normalize semi-structured JSON data into a flat table. @@ -151,7 +152,7 @@ def json_normalize(data, fill_value=None, record_path=None, meta=None, 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN - >>> json_normalize(data, fill_value={'id' : -1}) + >>> json_normalize(data, fill_value={'id': -1}) id name name.family name.first name.given name.last 0 1 NaN NaN Coleen NaN Volk 1 -1 NaN Regner NaN Mose NaN diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a7407d843c6c9..a7f9b6f37e2f2 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -38,6 +38,32 @@ def deep_nested(): } ] +@pytest.fixture +def deep_nested_missing(): + # deeply nested data with some missing values + return [{'country': 'USA', + 'states': [{'name': 'California', + 'cities': [{'name': 'San Francisco', + 'pop': 12345}, + {'name': 'Los Angeles', + 'pop': 12346}] + }, + {'name': 'Ohio', + 'cities': [{'name': 'Columbus', + 'pop': 1234}, + {'pop': 1236}]} + ] + }, + {'country': 'Germany', + 'states': [{'name': 'Bayern', + 'cities': [{'name': 'Munich'}] + }, + {'name': 'Nordrhein-Westfalen', + 'cities': [{'name': 'Duesseldorf', 'pop': 1238}, + {'name': 'Koeln'}]} + ] + } + ] @pytest.fixture def state_data(): @@ -294,6 +320,43 @@ def test_missing_field(self, author_missing_data): expected = DataFrame(ex_data) tm.assert_frame_equal(result, expected) + def test_fill_value(self, author_missing_data, deep_nested_missing): + # GH16918 + result = json_normalize( + author_missing_data, + fill_value={'info.last_updated': '27/06/2019'}) + ex_data = [ + {'info': np.nan, + 'author_name.first': np.nan, + 'author_name.last_name': np.nan, + 'info.created_at': np.nan, + 'info.last_updated': '27/06/2019'}, + {'info': None, + 'author_name.first': 'Jane', + 'author_name.last_name': 'Doe', + 'info.created_at': '11/08/1993', + 'info.last_updated': '26/05/2012'} + ] + expected = DataFrame(ex_data) + print(result['info'], expected['info']) + tm.assert_frame_equal(result, expected) + + result = json_normalize(deep_nested_missing, ['states', 'cities'], + meta=['country', ['states', 'name']], + fill_value={'pop': 0, 'name': 'N/A'}) + # meta_prefix={'states': 'state_'}) + + ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3, + 'states.name': ['California', 'California', 'Ohio', 'Ohio', + 'Bayern', 'Nordrhein-Westfalen', + 'Nordrhein-Westfalen'], + 'name': ['San Francisco', 'Los Angeles', 'Columbus', + 'N/A', 'Munich', 'Duesseldorf', 'Koeln'], + 'pop': [12345, 12346, 1234, 1236, 0, 1238, 0]} + + expected = DataFrame(ex_data, columns=result.columns) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord: From 41baaa3aa47af844f3471cb6a81dd4299050be5c Mon Sep 17 00:00:00 2001 From: Jiang Yue Date: Thu, 27 Jun 2019 13:30:38 +0800 Subject: [PATCH 3/3] add whatsnew entry --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/lib.pyx | 3 ++- pandas/io/json/normalize.py | 7 +++++-- pandas/tests/io/json/test_normalize.py | 2 ++ 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a58cdc8c93ab7..933deb7a97451 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -133,6 +133,7 @@ Other Enhancements - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '')`` where ``