
Commit e720f41

chelsea-lin and tswast authored
feat!: reading JSON data as a custom arrow extension type (#1458)
* feat: use JSONArrowType for JSON data
* fix related system tests
* fixes lint and doctest
* switch db_dtypes into 1.4.2
* fix tests
* fix test_df_drop_duplicates_w_json
* commit suggestion

Co-authored-by: Tim Sweña (Swast) <[email protected]>
Release-As: 1.40.0
1 parent 01dfe83 commit e720f41

File tree

12 files changed: +119 -113 lines changed


bigframes/bigquery/_operations/json.py

Lines changed: 2 additions & 2 deletions
@@ -53,7 +53,7 @@ def json_set(
  >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
  >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
  0 {"a":100,"b":"hi"}
- Name: data, dtype: dbjson
+ Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]

  Args:
      input (bigframes.series.Series):

@@ -253,7 +253,7 @@ def parse_json(
  dtype: string
  >>> bbq.parse_json(s)
  0 {"class":{"students":[{"id":5},{"id":12}]}}
- dtype: dbjson
+ dtype: extension<dbjson<JSONArrowType>>[pyarrow]

  Args:
      input (bigframes.series.Series):
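
The doctest change above is the user-visible effect of this commit: JSON Series now report the Arrow extension dtype instead of "dbjson". A minimal sketch mirroring the docstring example (requires a configured BigQuery session; not part of the diff):

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

s = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS data")["data"]
result = bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
print(result.dtype)  # expected: extension<dbjson<JSONArrowType>>[pyarrow]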

bigframes/core/array_value.py

Lines changed: 2 additions & 2 deletions
@@ -108,8 +108,8 @@ def from_table(
  raise ValueError("must set at most one of 'offests', 'primary_key'")
  if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
      msg = bfe.format_message(
-         "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is "
-         "in preview; this behavior may change in future versions."
+         "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
+         "is a preview feature and subject to change."
      )
      warnings.warn(msg, bfe.PreviewWarning)
  # define data source only for needed columns, this makes row-hashing cheaper
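
Since reading JSON columns remains a preview feature, callers who want a quiet log can filter the warning. A hedged sketch, assuming the `bfe` alias above refers to `bigframes.exceptions` and using a hypothetical table name:

import warnings

import bigframes.exceptions as bfe  # assumed location of PreviewWarning
import bigframes.pandas as bpd

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=bfe.PreviewWarning)
    df = bpd.read_gbq("my-project.my_dataset.table_with_json")  # hypothetical table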

bigframes/core/compile/ibis_types.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@
      IBIS_GEO_TYPE,
      gpd.array.GeometryDtype(),
  ),
- (ibis_dtypes.json, db_dtypes.JSONDtype()),
+ (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
  )

  BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
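
The effect of this mapping change shows up directly in the dtype objects; an illustrative comparison (not taken from the repo):

import db_dtypes
import pandas as pd

old_dtype = db_dtypes.JSONDtype()                     # pandas extension dtype
new_dtype = pd.ArrowDtype(db_dtypes.JSONArrowType())  # Arrow-backed extension dtype

print(old_dtype.name)  # dbjson
print(new_dtype.name)  # extension<dbjson<JSONArrowType>>[pyarrow]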

bigframes/dtypes.py

Lines changed: 4 additions & 3 deletions
@@ -62,7 +62,9 @@
  # No arrow equivalent
  GEO_DTYPE = gpd.array.GeometryDtype()
  # JSON
- JSON_DTYPE = db_dtypes.JSONDtype()
+ # TODO: switch to pyarrow.json_(pyarrow.string()) when available.
+ JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+ JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
  OBJ_REF_DTYPE = pd.ArrowDtype(
      pa.struct(
          (

@@ -80,7 +82,7 @@
          ),
          pa.field(
              "details",
-             db_dtypes.JSONArrowType(),
+             JSON_ARROW_TYPE,
          ),
      )
  )

@@ -301,7 +303,6 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
  return type_ in ("object", "O") or (
      getattr(type_, "kind", None) == "O"
      and getattr(type_, "storage", None) != "pyarrow"
-     and getattr(type_, "name", None) != "dbjson"
  )
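
A rough sketch of what the new JSON_DTYPE holds: an Arrow extension array whose storage is plain JSON strings. The construction via pa.ExtensionArray.from_storage is illustrative only, not how bigframes builds data internally:

import db_dtypes
import pandas as pd
import pyarrow as pa

JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)

# JSON values are stored as their string representation.
storage = pa.array(['{"a": 1}', '{"b": [2, 3]}'], type=pa.string())
json_arr = pa.ExtensionArray.from_storage(JSON_ARROW_TYPE, storage)

s = pd.Series(pd.arrays.ArrowExtensionArray(json_arr))
print(s.dtype)  # expected to match JSON_DTYPE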

bigframes/session/_io/pandas.py

Lines changed: 0 additions & 3 deletions
@@ -18,7 +18,6 @@
  from typing import Collection, Union

  import bigframes_vendored.constants as constants
- import db_dtypes  # type: ignore
  import geopandas  # type: ignore
  import numpy as np
  import pandas

@@ -125,8 +124,6 @@ def arrow_to_pandas(
      )
  elif isinstance(dtype, pandas.ArrowDtype):
      series = _arrow_to_pandas_arrowdtype(column, dtype)
- elif isinstance(dtype, db_dtypes.JSONDtype):
-     series = db_dtypes.JSONArray(column)
  else:
      series = column.to_pandas(types_mapper=lambda _: dtype)
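
The dedicated db_dtypes.JSONDtype branch can be dropped because the new JSON dtype is itself a pandas.ArrowDtype, so it flows through the existing _arrow_to_pandas_arrowdtype path. A trivial illustrative check:

import db_dtypes
import pandas

json_dtype = pandas.ArrowDtype(db_dtypes.JSONArrowType())
assert isinstance(json_dtype, pandas.ArrowDtype)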

setup.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
  "ipywidgets >=7.7.1",
  "humanize >=4.6.0",
  "matplotlib >=3.7.1",
- "db-dtypes >=1.4.0",
+ "db-dtypes >=1.4.2",
  # For vendored ibis-framework.
  "atpublic>=2.3,<6",
  "parsy>=2,<3",

testing/constraints-3.9.txt

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ tabulate==0.9
  ipywidgets==7.7.1
  humanize==4.6.0
  matplotlib==3.7.1
- db-dtypes==1.4.0
+ db-dtypes==1.4.2
  # For vendored ibis-framework.
  atpublic==2.3
  parsy==2.0

tests/system/small/bigquery/test_json.py

Lines changed: 31 additions & 28 deletions
@@ -12,30 +12,29 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import db_dtypes  # type: ignore
  import geopandas as gpd  # type: ignore
  import pandas as pd
  import pyarrow as pa
  import pytest

  import bigframes.bigquery as bbq
- import bigframes.dtypes
+ import bigframes.dtypes as dtypes
  import bigframes.pandas as bpd


  @pytest.mark.parametrize(
      ("json_path", "expected_json"),
      [
-         pytest.param("$.a", [{"a": 10}], id="simple"),
-         pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"),
+         pytest.param("$.a", ['{"a": 10}'], id="simple"),
+         pytest.param("$.a.b.c", ['{"a": {"b": {"c": 10, "d": []}}}'], id="nested"),
      ],
  )
  def test_json_set_at_json_path(json_path, expected_json):
-     original_json = [{"a": {"b": {"c": "tester", "d": []}}}]
-     s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
+     original_json = ['{"a": {"b": {"c": "tester", "d": []}}}']
+     s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
      actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)])

-     expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
+     expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
      pd.testing.assert_series_equal(
          actual.to_pandas(),
          expected.to_pandas(),

@@ -45,47 +44,49 @@ def test_json_set_at_json_path(json_path, expected_json):
  @pytest.mark.parametrize(
      ("json_value", "expected_json"),
      [
-         pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"),
-         pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"),
-         pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"),
-         pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"),
+         pytest.param(10, ['{"a": {"b": 10}}', '{"a": {"b": 10}}'], id="int"),
+         pytest.param(0.333, ['{"a": {"b": 0.333}}', '{"a": {"b": 0.333}}'], id="float"),
+         pytest.param(
+             "eng", ['{"a": {"b": "eng"}}', '{"a": {"b": "eng"}}'], id="string"
+         ),
+         pytest.param([1, 2], ['{"a": {"b": 1}}', '{"a": {"b": 2}}'], id="series"),
      ],
  )
  def test_json_set_at_json_value_type(json_value, expected_json):
-     original_json = [{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}]
-     s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
+     original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}']
+     s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
      actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)])

-     expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
+     expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
      pd.testing.assert_series_equal(
          actual.to_pandas(),
          expected.to_pandas(),
      )


  def test_json_set_w_more_pairs():
-     original_json = [{"a": 2}, {"b": 5}, {"c": 1}]
-     s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
+     original_json = ['{"a": 2}', '{"b": 5}', '{"c": 1}']
+     s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
      actual = bbq.json_set(
          s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])]
      )

-     expected_json = [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}]
-     expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
+     expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}']
+     expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
      pd.testing.assert_series_equal(
          actual.to_pandas(),
          expected.to_pandas(),
      )


  def test_json_set_w_invalid_json_path_value_pairs():
-     s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
+     s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
      with pytest.raises(ValueError):
          bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)])  # type: ignore


  def test_json_set_w_invalid_value_type():
-     s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
+     s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
      with pytest.raises(TypeError):
          bbq.json_set(
              s,

@@ -101,17 +102,18 @@ def test_json_set_w_invalid_value_type():


  def test_json_set_w_invalid_series_type():
+     s = bpd.Series([1, 2])
      with pytest.raises(TypeError):
-         bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)])
+         bbq.json_set(s, json_path_value_pairs=[("$.a", 1)])


  def test_json_extract_from_json():
      s = bpd.Series(
-         [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}],
-         dtype=db_dtypes.JSONDtype(),
+         ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],
+         dtype=dtypes.JSON_DTYPE,
      )
      actual = bbq.json_extract(s, "$.a.b").to_pandas()
-     expected = bpd.Series([[1, 2], None, 0], dtype=db_dtypes.JSONDtype()).to_pandas()
+     expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas()
      pd.testing.assert_series_equal(
          actual,
          expected,

@@ -132,14 +134,15 @@ def test_json_extract_from_string():


  def test_json_extract_w_invalid_series_type():
+     s = bpd.Series([1, 2])
      with pytest.raises(TypeError):
-         bbq.json_extract(bpd.Series([1, 2]), "$.a")
+         bbq.json_extract(s, "$.a")


  def test_json_extract_array_from_json():
      s = bpd.Series(
-         [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}],
-         dtype=db_dtypes.JSONDtype(),
+         ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
+         dtype=dtypes.JSON_DTYPE,
      )
      actual = bbq.json_extract_array(s, "$.a")

@@ -225,7 +228,7 @@ def test_json_extract_string_array_from_array_strings():

  def test_json_extract_string_array_as_float_array_from_array_strings():
      s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"])
-     actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE)
+     actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE)
      expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])
      pd.testing.assert_series_equal(
          actual.to_pandas(),
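
The test changes above follow one pattern: JSON fixtures are now written as JSON-encoded strings rather than Python dicts, matching the string storage of JSONArrowType. A minimal before/after sketch (executing it requires a BigQuery session):

import bigframes.dtypes as dtypes
import bigframes.pandas as bpd

# Before this commit: bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
# After this commit:
s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)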

tests/system/small/test_dataframe.py

Lines changed: 12 additions & 1 deletion
@@ -30,6 +30,7 @@
  import bigframes._config.display_options as display_options
  import bigframes.core.indexes as bf_indexes
  import bigframes.dataframe as dataframe
+ import bigframes.dtypes as dtypes
  import bigframes.pandas as bpd
  import bigframes.series as series
  from tests.system.utils import (

@@ -4584,7 +4585,17 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub
  )
  def test_df_drop_duplicates_w_json(json_df, keep):
      bf_df = json_df.drop_duplicates(keep=keep).to_pandas()
-     pd_df = json_df.to_pandas().drop_duplicates(keep=keep)
+
+     # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible
+     # with Arrow string extension types. Temporary conversion to standard Pandas
+     # strings is required.
+     json_pandas_df = json_df.to_pandas()
+     json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
+         pd.StringDtype(storage="pyarrow")
+     )
+
+     pd_df = json_pandas_df.drop_duplicates(keep=keep)
+     pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
      pd.testing.assert_frame_equal(
          pd_df,
          bf_df,
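
The workaround in this test can be reproduced with plain pandas: cast the JSON column to a pyarrow-backed string dtype, deduplicate, then cast back. A sketch with hypothetical sample data (assumes db-dtypes >= 1.4.2 and pandas 2.x; this is not the json_df fixture):

import db_dtypes
import pandas as pd
import pyarrow as pa

json_type = db_dtypes.JSONArrowType()
json_dtype = pd.ArrowDtype(json_type)

# Build a small JSON column locally from string storage.
storage = pa.array(['{"a": 1}', '{"a": 1}', '{"b": 2}'], type=pa.string())
json_col = pd.arrays.ArrowExtensionArray(pa.ExtensionArray.from_storage(json_type, storage))
df = pd.DataFrame({"json_col": json_col})

# dictionary_encode does not handle the JSON extension type, so round-trip via strings.
df["json_col"] = df["json_col"].astype(pd.StringDtype(storage="pyarrow"))
deduped = df.drop_duplicates().copy()
deduped["json_col"] = deduped["json_col"].astype(json_dtype)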
