
Commit e720f41

chelsea-lin and tswast authored
feat!: reading JSON data as a custom arrow extension type (#1458)
* feat: use JSONArrowType for JSON data
* fix related system tests
* fixes lint and doctest
* switch db_dtypes into 1.4.2
* fix tests
* fix test_df_drop_duplicates_w_json
* commit suggestion

Co-authored-by: Tim Sweña (Swast) <[email protected]>
Release-As: 1.40.0
1 parent 01dfe83 commit e720f41

File tree

12 files changed: +119 -113 lines changed


bigframes/bigquery/_operations/json.py

Lines changed: 2 additions & 2 deletions
@@ -53,7 +53,7 @@ def json_set(
  >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
  >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
  0 {"a":100,"b":"hi"}
- Name: data, dtype: dbjson
+ Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]

  Args:
      input (bigframes.series.Series):

@@ -253,7 +253,7 @@ def parse_json(
  dtype: string
  >>> bbq.parse_json(s)
  0 {"class":{"students":[{"id":5},{"id":12}]}}
- dtype: dbjson
+ dtype: extension<dbjson<JSONArrowType>>[pyarrow]

  Args:
      input (bigframes.series.Series):
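
The doctest change above is the user-visible effect of this commit: JSON Series now report the Arrow extension dtype instead of "dbjson". A minimal sketch mirroring the docstring example (requires a configured BigQuery session; not part of the diff):

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

s = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS data")["data"]
result = bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
print(result.dtype)  # expected: extension<dbjson<JSONArrowType>>[pyarrow]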

bigframes/core/array_value.py

Lines changed: 2 additions & 2 deletions
@@ -108,8 +108,8 @@ def from_table(
  raise ValueError("must set at most one of 'offests', 'primary_key'")
  if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
      msg = bfe.format_message(
-         "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is "
-         "in preview; this behavior may change in future versions."
+         "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
+         "is a preview feature and subject to change."
      )
      warnings.warn(msg, bfe.PreviewWarning)
  # define data source only for needed columns, this makes row-hashing cheaper
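
Since reading JSON columns remains a preview feature, callers who want a quiet log can filter the warning. A hedged sketch, assuming the `bfe` alias above refers to `bigframes.exceptions` and using a hypothetical table name:

import warnings

import bigframes.exceptions as bfe  # assumed location of PreviewWarning
import bigframes.pandas as bpd

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=bfe.PreviewWarning)
    df = bpd.read_gbq("my-project.my_dataset.table_with_json")  # hypothetical table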

bigframes/core/compile/ibis_types.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@
      IBIS_GEO_TYPE,
      gpd.array.GeometryDtype(),
  ),
- (ibis_dtypes.json, db_dtypes.JSONDtype()),
+ (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
  )

  BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
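
The effect of this mapping change shows up directly in the dtype objects; an illustrative comparison (not taken from the repo):

import db_dtypes
import pandas as pd

old_dtype = db_dtypes.JSONDtype()                     # pandas extension dtype
new_dtype = pd.ArrowDtype(db_dtypes.JSONArrowType())  # Arrow-backed extension dtype

print(old_dtype.name)  # dbjson
print(new_dtype.name)  # extension<dbjson<JSONArrowType>>[pyarrow]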

bigframes/dtypes.py

Lines changed: 4 additions & 3 deletions
@@ -62,7 +62,9 @@
  # No arrow equivalent
  GEO_DTYPE = gpd.array.GeometryDtype()
  # JSON
- JSON_DTYPE = db_dtypes.JSONDtype()
+ # TODO: switch to pyarrow.json_(pyarrow.string()) when available.
+ JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+ JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
  OBJ_REF_DTYPE = pd.ArrowDtype(
      pa.struct(
          (

@@ -80,7 +82,7 @@
          ),
          pa.field(
              "details",
-             db_dtypes.JSONArrowType(),
+             JSON_ARROW_TYPE,
          ),
      )
  )

@@ -301,7 +303,6 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
  return type_ in ("object", "O") or (
      getattr(type_, "kind", None) == "O"
      and getattr(type_, "storage", None) != "pyarrow"
-     and getattr(type_, "name", None) != "dbjson"
  )
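
A rough sketch of what the new JSON_DTYPE holds: an Arrow extension array whose storage is plain JSON strings. The construction via pa.ExtensionArray.from_storage is illustrative only, not how bigframes builds data internally:

import db_dtypes
import pandas as pd
import pyarrow as pa

JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)

# JSON values are stored as their string representation.
storage = pa.array(['{"a": 1}', '{"b": [2, 3]}'], type=pa.string())
json_arr = pa.ExtensionArray.from_storage(JSON_ARROW_TYPE, storage)

s = pd.Series(pd.arrays.ArrowExtensionArray(json_arr))
print(s.dtype)  # expected to match JSON_DTYPE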

bigframes/session/_io/pandas.py

Lines changed: 0 additions & 3 deletions
@@ -18,7 +18,6 @@
  from typing import Collection, Union

  import bigframes_vendored.constants as constants
- import db_dtypes  # type: ignore
  import geopandas  # type: ignore
  import numpy as np
  import pandas

@@ -125,8 +124,6 @@ def arrow_to_pandas(
      )
  elif isinstance(dtype, pandas.ArrowDtype):
      series = _arrow_to_pandas_arrowdtype(column, dtype)
- elif isinstance(dtype, db_dtypes.JSONDtype):
-     series = db_dtypes.JSONArray(column)
  else:
      series = column.to_pandas(types_mapper=lambda _: dtype)
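
The dedicated db_dtypes.JSONDtype branch can be dropped because the new JSON dtype is itself a pandas.ArrowDtype, so it flows through the existing _arrow_to_pandas_arrowdtype path. A trivial illustrative check:

import db_dtypes
import pandas

json_dtype = pandas.ArrowDtype(db_dtypes.JSONArrowType())
assert isinstance(json_dtype, pandas.ArrowDtype)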

setup.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
  "ipywidgets >=7.7.1",
  "humanize >=4.6.0",
  "matplotlib >=3.7.1",
- "db-dtypes >=1.4.0",
+ "db-dtypes >=1.4.2",
  # For vendored ibis-framework.
  "atpublic>=2.3,<6",
  "parsy>=2,<3",

testing/constraints-3.9.txt

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ tabulate==0.9
  ipywidgets==7.7.1
  humanize==4.6.0
  matplotlib==3.7.1
- db-dtypes==1.4.0
+ db-dtypes==1.4.2
  # For vendored ibis-framework.
  atpublic==2.3
  parsy==2.0

tests/system/small/bigquery/test_json.py

Lines changed: 31 additions & 28 deletions
@@ -12,30 +12,29 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import db_dtypes  # type: ignore
  import geopandas as gpd  # type: ignore
  import pandas as pd
  import pyarrow as pa
  import pytest

  import bigframes.bigquery as bbq
- import bigframes.dtypes
+ import bigframes.dtypes as dtypes
  import bigframes.pandas as bpd


  @pytest.mark.parametrize(
      ("json_path", "expected_json"),
      [
-         pytest.param("$.a", [{"a": 10}], id="simple"),
-         pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"),
+         pytest.param("$.a", ['{"a": 10}'], id="simple"),
+         pytest.param("$.a.b.c", ['{"a": {"b": {"c": 10, "d": []}}}'], id="nested"),
      ],
  )
  def test_json_set_at_json_path(json_path, expected_json):
-     original_json = [{"a": {"b": {"c": "tester", "d": []}}}]
-     s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
+     original_json = ['{"a": {"b": {"c": "tester", "d": []}}}']
+     s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
      actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)])

-     expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
+     expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
      pd.testing.assert_series_equal(
          actual.to_pandas(),
          expected.to_pandas(),

@@ -45,47 +44,49 @@ def test_json_set_at_json_path(json_path, expected_json):
  @pytest.mark.parametrize(
      ("json_value", "expected_json"),
      [
-         pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"),
-         pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"),
-         pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"),
-         pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"),
+         pytest.param(10, ['{"a": {"b": 10}}', '{"a": {"b": 10}}'], id="int"),
+         pytest.param(0.333, ['{"a": {"b": 0.333}}', '{"a": {"b": 0.333}}'], id="float"),
+         pytest.param(
+             "eng", ['{"a": {"b": "eng"}}', '{"a": {"b": "eng"}}'], id="string"
+         ),
+         pytest.param([1, 2], ['{"a": {"b": 1}}', '{"a": {"b": 2}}'], id="series"),
      ],
  )
  def test_json_set_at_json_value_type(json_value, expected_json):
-     original_json = [{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}]
-     s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
+     original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}']
+     s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
      actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)])

-     expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
+     expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
      pd.testing.assert_series_equal(
          actual.to_pandas(),
          expected.to_pandas(),
      )


  def test_json_set_w_more_pairs():
-     original_json = [{"a": 2}, {"b": 5}, {"c": 1}]
-     s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype())
+     original_json = ['{"a": 2}', '{"b": 5}', '{"c": 1}']
+     s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
      actual = bbq.json_set(
          s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])]
      )

-     expected_json = [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}]
-     expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype())
+     expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}']
+     expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
      pd.testing.assert_series_equal(
          actual.to_pandas(),
          expected.to_pandas(),
      )


  def test_json_set_w_invalid_json_path_value_pairs():
-     s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
+     s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
      with pytest.raises(ValueError):
          bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)])  # type: ignore


  def test_json_set_w_invalid_value_type():
-     s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
+     s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
      with pytest.raises(TypeError):
          bbq.json_set(
              s,

@@ -101,17 +102,18 @@ def test_json_set_w_invalid_value_type():


  def test_json_set_w_invalid_series_type():
+     s = bpd.Series([1, 2])
      with pytest.raises(TypeError):
-         bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)])
+         bbq.json_set(s, json_path_value_pairs=[("$.a", 1)])


  def test_json_extract_from_json():
      s = bpd.Series(
-         [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}],
-         dtype=db_dtypes.JSONDtype(),
+         ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],
+         dtype=dtypes.JSON_DTYPE,
      )
      actual = bbq.json_extract(s, "$.a.b").to_pandas()
-     expected = bpd.Series([[1, 2], None, 0], dtype=db_dtypes.JSONDtype()).to_pandas()
+     expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas()
      pd.testing.assert_series_equal(
          actual,
          expected,

@@ -132,14 +134,15 @@ def test_json_extract_from_string():


  def test_json_extract_w_invalid_series_type():
+     s = bpd.Series([1, 2])
      with pytest.raises(TypeError):
-         bbq.json_extract(bpd.Series([1, 2]), "$.a")
+         bbq.json_extract(s, "$.a")


  def test_json_extract_array_from_json():
      s = bpd.Series(
-         [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}],
-         dtype=db_dtypes.JSONDtype(),
+         ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
+         dtype=dtypes.JSON_DTYPE,
      )
      actual = bbq.json_extract_array(s, "$.a")

@@ -225,7 +228,7 @@ def test_json_extract_string_array_from_array_strings():

  def test_json_extract_string_array_as_float_array_from_array_strings():
      s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"])
-     actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE)
+     actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE)
      expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])
      pd.testing.assert_series_equal(
          actual.to_pandas(),
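
The test changes above follow one pattern: JSON fixtures are now written as JSON-encoded strings rather than Python dicts, matching the string storage of JSONArrowType. A minimal before/after sketch (executing it requires a BigQuery session):

import bigframes.dtypes as dtypes
import bigframes.pandas as bpd

# Before this commit: bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype())
# After this commit:
s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)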

tests/system/small/test_dataframe.py

Lines changed: 12 additions & 1 deletion
@@ -30,6 +30,7 @@
  import bigframes._config.display_options as display_options
  import bigframes.core.indexes as bf_indexes
  import bigframes.dataframe as dataframe
+ import bigframes.dtypes as dtypes
  import bigframes.pandas as bpd
  import bigframes.series as series
  from tests.system.utils import (

@@ -4584,7 +4585,17 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub
  )
  def test_df_drop_duplicates_w_json(json_df, keep):
      bf_df = json_df.drop_duplicates(keep=keep).to_pandas()
-     pd_df = json_df.to_pandas().drop_duplicates(keep=keep)
+
+     # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible
+     # with Arrow string extension types. Temporary conversion to standard Pandas
+     # strings is required.
+     json_pandas_df = json_df.to_pandas()
+     json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
+         pd.StringDtype(storage="pyarrow")
+     )
+
+     pd_df = json_pandas_df.drop_duplicates(keep=keep)
+     pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
      pd.testing.assert_frame_equal(
          pd_df,
          bf_df,
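
The workaround in this test can be reproduced with plain pandas: cast the JSON column to a pyarrow-backed string dtype, deduplicate, then cast back. A sketch with hypothetical sample data (assumes db-dtypes >= 1.4.2 and pandas 2.x; this is not the json_df fixture):

import db_dtypes
import pandas as pd
import pyarrow as pa

json_type = db_dtypes.JSONArrowType()
json_dtype = pd.ArrowDtype(json_type)

# Build a small JSON column locally from string storage.
storage = pa.array(['{"a": 1}', '{"a": 1}', '{"b": 2}'], type=pa.string())
json_col = pd.arrays.ArrowExtensionArray(pa.ExtensionArray.from_storage(json_type, storage))
df = pd.DataFrame({"json_col": json_col})

# dictionary_encode does not handle the JSON extension type, so round-trip via strings.
df["json_col"] = df["json_col"].astype(pd.StringDtype(storage="pyarrow"))
deduped = df.drop_duplicates().copy()
deduped["json_col"] = deduped["json_col"].astype(json_dtype)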
