Skip to content

feat!: reading JSON data as a custom arrow extension type #1458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bigframes/bigquery/_operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def json_set(
>>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
>>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
0 {"a":100,"b":"hi"}
Name: data, dtype: dbjson
Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]

Args:
input (bigframes.series.Series):
Expand Down Expand Up @@ -253,7 +253,7 @@ def parse_json(
dtype: string
>>> bbq.parse_json(s)
0 {"class":{"students":[{"id":5},{"id":12}]}}
dtype: dbjson
dtype: extension<dbjson<JSONArrowType>>[pyarrow]

Args:
input (bigframes.series.Series):
Expand Down
4 changes: 2 additions & 2 deletions bigframes/core/array_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ def from_table(
raise ValueError("must set at most one of 'offsets', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
msg = bfe.format_message(
"Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is "
"in preview; this behavior may change in future versions."
"JSON column interpretation as a custom PyArrow extension in `db_dtypes` "
"is a preview feature and subject to change."
)
warnings.warn(msg, bfe.PreviewWarning)
# define data source only for needed columns, this makes row-hashing cheaper
Expand Down
2 changes: 1 addition & 1 deletion bigframes/core/compile/ibis_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
IBIS_GEO_TYPE,
gpd.array.GeometryDtype(),
),
(ibis_dtypes.json, db_dtypes.JSONDtype()),
(ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
)

BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
Expand Down
7 changes: 4 additions & 3 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
# JSON
JSON_DTYPE = db_dtypes.JSONDtype()
# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
OBJ_REF_DTYPE = pd.ArrowDtype(
pa.struct(
(
Expand All @@ -80,7 +82,7 @@
),
pa.field(
"details",
db_dtypes.JSONArrowType(),
JSON_ARROW_TYPE,
),
)
)
Expand Down Expand Up @@ -301,7 +303,6 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
return type_ in ("object", "O") or (
getattr(type_, "kind", None) == "O"
and getattr(type_, "storage", None) != "pyarrow"
and getattr(type_, "name", None) != "dbjson"
)


Expand Down
3 changes: 0 additions & 3 deletions bigframes/session/_io/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from typing import Collection, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas # type: ignore
import numpy as np
import pandas
Expand Down Expand Up @@ -125,8 +124,6 @@ def arrow_to_pandas(
)
elif isinstance(dtype, pandas.ArrowDtype):
series = _arrow_to_pandas_arrowdtype(column, dtype)
elif isinstance(dtype, db_dtypes.JSONDtype):
series = db_dtypes.JSONArray(column)
else:
series = column.to_pandas(types_mapper=lambda _: dtype)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
"ipywidgets >=7.7.1",
"humanize >=4.6.0",
"matplotlib >=3.7.1",
"db-dtypes >=1.4.0",
"db-dtypes >=1.4.2",
# For vendored ibis-framework.
"atpublic>=2.3,<6",
"parsy>=2,<3",
Expand Down
2 changes: 1 addition & 1 deletion testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
db-dtypes==1.4.0
db-dtypes==1.4.2
# For vendored ibis-framework.
atpublic==2.3
parsy==2.0
Expand Down
59 changes: 31 additions & 28 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import pandas as pd
import pyarrow as pa
import pytest

import bigframes.bigquery as bbq
import bigframes.dtypes
import bigframes.dtypes as dtypes
import bigframes.pandas as bpd


@pytest.mark.parametrize(
("json_path", "expected_json"),
[
pytest.param("$.a", [{"a": 10}], id="simple"),
pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"),
pytest.param("$.a", ['{"a": 10}'], id="simple"),
pytest.param("$.a.b.c", ['{"a": {"b": {"c": 10, "d": []}}}'], id="nested"),
],
)
def test_json_set_at_json_path(json_path, expected_json):
original_json = [{"a": {"b": {"c": "tester", "d": []}}}]
s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
original_json = ['{"a": {"b": {"c": "tester", "d": []}}}']
s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)])

expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
Expand All @@ -45,47 +44,49 @@ def test_json_set_at_json_path(json_path, expected_json):
@pytest.mark.parametrize(
("json_value", "expected_json"),
[
pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"),
pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"),
pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"),
pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"),
pytest.param(10, ['{"a": {"b": 10}}', '{"a": {"b": 10}}'], id="int"),
pytest.param(0.333, ['{"a": {"b": 0.333}}', '{"a": {"b": 0.333}}'], id="float"),
pytest.param(
"eng", ['{"a": {"b": "eng"}}', '{"a": {"b": "eng"}}'], id="string"
),
pytest.param([1, 2], ['{"a": {"b": 1}}', '{"a": {"b": 2}}'], id="series"),
],
)
def test_json_set_at_json_value_type(json_value, expected_json):
original_json = [{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}]
s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}']
s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)])

expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_set_w_more_pairs():
original_json = [{"a": 2}, {"b": 5}, {"c": 1}]
s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
original_json = ['{"a": 2}', '{"b": 5}', '{"c": 1}']
s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
actual = bbq.json_set(
s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])]
)

expected_json = [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}]
expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}']
expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_set_w_invalid_json_path_value_pairs():
s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
with pytest.raises(ValueError):
bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore


def test_json_set_w_invalid_value_type():
s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
with pytest.raises(TypeError):
bbq.json_set(
s,
Expand All @@ -101,17 +102,18 @@ def test_json_set_w_invalid_value_type():


def test_json_set_w_invalid_series_type():
s = bpd.Series([1, 2])
with pytest.raises(TypeError):
bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)])
bbq.json_set(s, json_path_value_pairs=[("$.a", 1)])


def test_json_extract_from_json():
s = bpd.Series(
[{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}],
dtype=db_dtypes.JSONDtype(),
['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],
dtype=dtypes.JSON_DTYPE,
)
actual = bbq.json_extract(s, "$.a.b").to_pandas()
expected = bpd.Series([[1, 2], None, 0], dtype=db_dtypes.JSONDtype()).to_pandas()
expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas()
pd.testing.assert_series_equal(
actual,
expected,
Expand All @@ -132,14 +134,15 @@ def test_json_extract_from_string():


def test_json_extract_w_invalid_series_type():
s = bpd.Series([1, 2])
with pytest.raises(TypeError):
bbq.json_extract(bpd.Series([1, 2]), "$.a")
bbq.json_extract(s, "$.a")


def test_json_extract_array_from_json():
s = bpd.Series(
[{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}],
dtype=db_dtypes.JSONDtype(),
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
dtype=dtypes.JSON_DTYPE,
)
actual = bbq.json_extract_array(s, "$.a")

Expand Down Expand Up @@ -225,7 +228,7 @@ def test_json_extract_string_array_from_array_strings():

def test_json_extract_string_array_as_float_array_from_array_strings():
s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"])
actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE)
actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE)
expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])
pd.testing.assert_series_equal(
actual.to_pandas(),
Expand Down
13 changes: 12 additions & 1 deletion tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import bigframes._config.display_options as display_options
import bigframes.core.indexes as bf_indexes
import bigframes.dataframe as dataframe
import bigframes.dtypes as dtypes
import bigframes.pandas as bpd
import bigframes.series as series
from tests.system.utils import (
Expand Down Expand Up @@ -4584,7 +4585,17 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub
)
def test_df_drop_duplicates_w_json(json_df, keep):
bf_df = json_df.drop_duplicates(keep=keep).to_pandas()
pd_df = json_df.to_pandas().drop_duplicates(keep=keep)

# drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible
# with Arrow string extension types. Temporary conversion to standard Pandas
# strings is required.
json_pandas_df = json_df.to_pandas()
json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
pd.StringDtype(storage="pyarrow")
)

pd_df = json_pandas_df.drop_duplicates(keep=keep)
pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
pd.testing.assert_frame_equal(
pd_df,
bf_df,
Expand Down
Loading