From b25faf7044364fe095b179d03e84d3c39ae37e9d Mon Sep 17 00:00:00 2001
From: Jiang Yue <jyue@wup-sgpubu02.us.drwholdings.com>
Date: Thu, 27 Jun 2019 11:31:36 +0800
Subject: [PATCH 1/3] add optional fill_value for nan in json_normalize

---
 pandas/_libs/lib.pyx                  | 11 ++++++++---
 pandas/core/frame.py                  |  5 +++--
 pandas/core/internals/construction.py | 11 +++++++----
 pandas/io/json/normalize.py           | 17 ++++++++++++++---
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index c09fb96eb9182..5be9da983f449 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -308,17 +308,22 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def dicts_to_array(dicts: list, columns: list):
+def dicts_to_array(dicts: list, columns: list, fill_value=None):
     cdef:
         Py_ssize_t i, j, k, n
         ndarray[object, ndim=2] result
         dict row
-        object col, onan = np.nan
+        object col
+        list onan
 
     k = len(columns)
     n = len(dicts)
 
     result = np.empty((n, k), dtype='O')
+    if fill_value:
+        onan = [fill_value[col] if col in fill_value else np.nan for col in columns]
+    else:
+        onan = list(np.full(k, np.nan))
 
     for i in range(n):
         row = dicts[i]
@@ -327,7 +332,7 @@ def dicts_to_array(dicts: list, columns: list):
             if col in row:
                 result[i, j] = row[col]
             else:
-                result[i, j] = onan
+                result[i, j] = onan[j]
 
     return result
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index fd2e1e3e41ced..db0f23dff994f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -375,7 +375,7 @@ def _constructor_expanddim(self):
     # Constructors
 
     def __init__(self, data=None, index=None, columns=None, dtype=None,
-                 copy=False):
+                 copy=False, fill_value=None):
         if data is None:
             data = {}
         if dtype is not None:
@@ -431,7 +431,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                     if is_named_tuple(data[0]) and columns is None:
                         columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
+                    arrays, columns = to_arrays(data, columns, dtype=dtype,
+                                                fill_value=fill_value)
                     columns = ensure_index(columns)
 
                     # set the index
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index d766d7f06d34a..fb22302c25014 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -371,7 +371,7 @@ def _get_axes(N, K, index, columns):
 # ---------------------------------------------------------------------
 # Conversion of Inputs to Arrays
 
-def to_arrays(data, columns, coerce_float=False, dtype=None):
+def to_arrays(data, columns, coerce_float=False, dtype=None, fill_value=None):
     """
     Return list of arrays, columns.
     """
@@ -396,7 +396,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
                                dtype=dtype)
     elif isinstance(data[0], abc.Mapping):
         return _list_of_dict_to_arrays(data, columns,
-                                       coerce_float=coerce_float, dtype=dtype)
+                                       coerce_float=coerce_float, dtype=dtype,
+                                       fill_value=fill_value)
     elif isinstance(data[0], ABCSeries):
         return _list_of_series_to_arrays(data, columns,
                                          coerce_float=coerce_float,
@@ -463,7 +464,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
         return values.T, columns
 
 
-def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None,
+                            fill_value=None):
     if columns is None:
         gen = (list(x.keys()) for x in data)
         sort = not any(isinstance(d, OrderedDict) for d in data)
@@ -473,7 +475,8 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
     # classes
     data = [(type(d) is dict) and d or dict(d) for d in data]
 
-    content = list(lib.dicts_to_array(data, list(columns)).T)
+    content = list(lib.dicts_to_array(data, list(columns),
+                                      fill_value=fill_value).T)
     return _convert_object_array(content, columns, dtype=dtype,
                                  coerce_float=coerce_float)
 
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index 2d8bc20b1195e..b3c8933d450a7 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -96,7 +96,7 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     return new_ds
 
 
-def json_normalize(data, record_path=None, meta=None,
+def json_normalize(data, fill_value=None, record_path=None, meta=None,
                    meta_prefix=None,
                    record_prefix=None,
                    errors='raise',
@@ -108,6 +108,8 @@ def json_normalize(data, record_path=None, meta=None,
     ----------
     data : dict or list of dicts
         Unserialized JSON objects
+    fill_value: dict, default None
+        default na values for specified columns
     record_path : string or list of strings, default None
         Path in each object to list of records. If not passed, data will be
         assumed to be an array of records
@@ -149,6 +151,12 @@ def json_normalize(data, record_path=None, meta=None,
     1  NaN         NaN      Regner        NaN       Mose       NaN
     2  2.0  Faye Raker         NaN        NaN        NaN       NaN
 
+    >>> json_normalize(data, fill_value={'id' : -1})
+       id        name name.family name.first name.given name.last
+    0   1         NaN         NaN     Coleen        NaN      Volk
+    1  -1         NaN      Regner        NaN       Mose       NaN
+    2   2  Faye Raker         NaN        NaN        NaN       NaN
+
     >>> data = [{'state': 'Florida',
     ...          'shortname': 'FL',
     ...          'info': {
@@ -197,6 +205,9 @@ def _pull_field(js, spec):
     if isinstance(data, dict):
         data = [data]
 
+    if fill_value and not isinstance(fill_value, dict):
+        raise ValueError('Invalid fill_value, fill_value only accepts a dict')
+
     if record_path is None:
         if any([isinstance(x, dict) for x in y.values()] for y in data):
             # naive normalization, this is idempotent for flat records
@@ -207,7 +218,7 @@ def _pull_field(js, spec):
             # TODO: handle record value which are lists, at least error
             #       reasonably
             data = nested_to_record(data, sep=sep)
-        return DataFrame(data)
+        return DataFrame(data, fill_value=fill_value)
     elif not isinstance(record_path, list):
         record_path = [record_path]
 
@@ -265,7 +276,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
 
     _recursive_extract(data, record_path, {}, level=0)
 
-    result = DataFrame(records)
+    result = DataFrame(records, fill_value=fill_value)
 
     if record_prefix is not None:
         result = result.rename(

From e53b620dfca0b4b9f1a7b2cdfd45c175136c8121 Mon Sep 17 00:00:00 2001
From: Jiang Yue <jyue@wup-sgpubu02.us.drwholdings.com>
Date: Thu, 27 Jun 2019 13:22:39 +0800
Subject: [PATCH 2/3] add test cases

---
 pandas/io/json/normalize.py            |  7 +--
 pandas/tests/io/json/test_normalize.py | 63 ++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index b3c8933d450a7..ec8a125fe00e1 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -96,11 +96,12 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     return new_ds
 
 
-def json_normalize(data, fill_value=None, record_path=None, meta=None,
+def json_normalize(data, record_path=None, meta=None,
                    meta_prefix=None,
                    record_prefix=None,
                    errors='raise',
-                   sep='.'):
+                   sep='.',
+                   fill_value=None):
     """
     Normalize semi-structured JSON data into a flat table.
 
@@ -151,7 +152,7 @@ def json_normalize(data, fill_value=None, record_path=None, meta=None,
     1  NaN         NaN      Regner        NaN       Mose       NaN
     2  2.0  Faye Raker         NaN        NaN        NaN       NaN
 
-    >>> json_normalize(data, fill_value={'id' : -1})
+    >>> json_normalize(data, fill_value={'id': -1})
        id        name name.family name.first name.given name.last
     0   1         NaN         NaN     Coleen        NaN      Volk
     1  -1         NaN      Regner        NaN       Mose       NaN
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index a7407d843c6c9..a7f9b6f37e2f2 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -38,6 +38,32 @@ def deep_nested():
              }
             ]
 
+@pytest.fixture
+def deep_nested_missing():
+    # deeply nested data with some missing values
+    return [{'country': 'USA',
+             'states': [{'name': 'California',
+                         'cities': [{'name': 'San Francisco',
+                                     'pop': 12345},
+                                    {'name': 'Los Angeles',
+                                     'pop': 12346}]
+                         },
+                        {'name': 'Ohio',
+                         'cities': [{'name': 'Columbus',
+                                     'pop': 1234},
+                                    {'pop': 1236}]}  # 'name' missing
+                        ]
+             },
+            {'country': 'Germany',
+             'states': [{'name': 'Bayern',
+                         'cities': [{'name': 'Munich'}]  # 'pop' missing
+                         },
+                        {'name': 'Nordrhein-Westfalen',
+                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
+                                    {'name': 'Koeln'}]}  # 'pop' missing
+                        ]
+             }
+            ]
 
 @pytest.fixture
 def state_data():
@@ -294,6 +320,43 @@ def test_missing_field(self, author_missing_data):
         expected = DataFrame(ex_data)
         tm.assert_frame_equal(result, expected)
 
+    def test_fill_value(self, author_missing_data, deep_nested_missing):
+        # GH16918
+        # flat records: only the column named in fill_value is filled
+        result = json_normalize(
+            author_missing_data,
+            fill_value={'info.last_updated': '27/06/2019'})
+        ex_data = [
+            {'info': np.nan,
+             'author_name.first': np.nan,
+             'author_name.last_name': np.nan,
+             'info.created_at': np.nan,
+             'info.last_updated': '27/06/2019'},
+            {'info': None,
+             'author_name.first': 'Jane',
+             'author_name.last_name': 'Doe',
+             'info.created_at': '11/08/1993',
+             'info.last_updated': '26/05/2012'}
+        ]
+        expected = DataFrame(ex_data)
+        tm.assert_frame_equal(result, expected)
+
+        # nested records: fill_value is keyed by the record column names
+        result = json_normalize(deep_nested_missing, ['states', 'cities'],
+                                meta=['country', ['states', 'name']],
+                                fill_value={'pop': 0, 'name': 'N/A'})
+
+        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
+                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
+                                   'Bayern', 'Nordrhein-Westfalen',
+                                   'Nordrhein-Westfalen'],
+                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
+                            'N/A', 'Munich', 'Duesseldorf', 'Koeln'],
+                   'pop': [12345, 12346, 1234, 1236, 0, 1238, 0]}
+
+        expected = DataFrame(ex_data, columns=result.columns)
+        tm.assert_frame_equal(result, expected)
+
 
 class TestNestedToRecord:
 

From 41baaa3aa47af844f3471cb6a81dd4299050be5c Mon Sep 17 00:00:00 2001
From: Jiang Yue <jyue@wup-sgpubu02.us.drwholdings.com>
Date: Thu, 27 Jun 2019 13:30:38 +0800
Subject: [PATCH 3/3] add whatsnew entry

---
 doc/source/whatsnew/v0.25.0.rst        | 1 +
 pandas/_libs/lib.pyx                   | 3 ++-
 pandas/io/json/normalize.py            | 7 +++++--
 pandas/tests/io/json/test_normalize.py | 2 ++
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index a58cdc8c93ab7..933deb7a97451 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -133,6 +133,7 @@ Other Enhancements
 - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
 - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
 - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module`` is a library implementing the pandas plotting API (:issue:`14130`)
+- :meth:`io.json.json_normalize` now accepts a ``fill_value`` argument to replace missing (NaN) values in the given columns (:issue:`16918`)
 
 .. _whatsnew_0250.api_breaking:
 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 5be9da983f449..4be63309920d7 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -321,7 +321,8 @@ def dicts_to_array(dicts: list, columns: list, fill_value=None):
 
     result = np.empty((n, k), dtype='O')
     if fill_value:
-        onan = [fill_value[col] if col in fill_value else np.nan for col in columns]
+        onan = ([fill_value[col] if col in fill_value
+                else np.nan for col in columns])
     else:
         onan = list(np.full(k, np.nan))
 
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index ec8a125fe00e1..71e977d527f92 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -109,8 +109,6 @@ def json_normalize(data, record_path=None, meta=None,
     ----------
     data : dict or list of dicts
         Unserialized JSON objects
-    fill_value: dict, default None
-        default na values for specified columns
     record_path : string or list of strings, default None
         Path in each object to list of records. If not passed, data will be
         assumed to be an array of records
@@ -135,6 +133,11 @@ def json_normalize(data, record_path=None, meta=None,
 
         .. versionadded:: 0.20.0
 
+    fill_value : dict, default None
+        Values used instead of NaN for missing entries, keyed by column.
+
+        .. versionadded:: 0.25.0
+
     Returns
     -------
     frame : DataFrame
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index a7f9b6f37e2f2..c851d61df0e82 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -38,6 +38,7 @@ def deep_nested():
              }
             ]
 
+
 @pytest.fixture
 def deep_nested_missing():
     # deeply nested data with some missing values
@@ -65,6 +66,7 @@ def deep_nested_missing():
              }
             ]
 
+
 @pytest.fixture
 def state_data():
     return [