diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index fb9503dc72..bec7b5ff0e 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -239,6 +239,38 @@ def json_extract(
     return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
 
 
+def json_extract_array(
+    series: series.Series,
+    json_path: str = "$",
+) -> series.Series:
+    """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
+    values. This function uses single quotes and brackets to escape invalid JSONPath
+    characters in JSON keys.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
+        >>> bbq.json_extract_array(s)
+        0    ['1' '2' '3']
+        1        ['4' '5']
+        dtype: list<item: string>[pyarrow]
+
+    Args:
+        series (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path (str):
+            The JSON path identifying the data that you want to obtain from the input.
+
+    Returns:
+        bigframes.series.Series: A new Series holding, for each row, the extracted array of JSON or JSON-formatted STRING values.
+    """
+    return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
+
+
 # Search functions defined from
 # https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
 
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 4818d3ca76..86501214ad 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -947,6 +947,11 @@ def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
     return json_extract(json_obj=x, json_path=op.json_path)
 
 
+@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
+def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
+    return json_extract_array(json_obj=x, json_path=op.json_path)
+
+
 ### Binary Ops
 def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
     """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
@@ -1581,6 +1586,13 @@ def json_extract(
     """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
 
 
+@ibis.udf.scalar.builtin(name="json_extract_array")
+def json_extract_array(
+    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
+) -> ibis_dtypes.Array[ibis_dtypes.String]:
+    """Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""
+
+
 @ibis.udf.scalar.builtin(name="ML.DISTANCE")
 def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
     """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index fb333d7a53..cd9e70819e 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -652,6 +652,23 @@ def output_type(self, *input_types):
         return input_type
 
 
+@dataclasses.dataclass(frozen=True)
+class JSONExtractArray(UnaryOp):
+    name: typing.ClassVar[str] = "json_extract_array"
+    json_path: str
+
+    def output_type(self, *input_types):
+        input_type = input_types[0]
+        if not dtypes.is_json_like(input_type):
+            raise TypeError(
+                "Input type must be a valid JSON object or JSON-formatted string type."
+                + f" Received type: {input_type}"
+            )
+        return pd.ArrowDtype(
+            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
+        )
+
+
 # Binary Ops
 fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
 maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py
index 18ccadd9f5..68356f4a15 100644
--- a/tests/system/small/bigquery/test_json.py
+++ b/tests/system/small/bigquery/test_json.py
@@ -139,3 +139,28 @@ def test_json_extract_from_string():
 def test_json_extract_w_invalid_series_type():
     with pytest.raises(TypeError):
         bbq.json_extract(bpd.Series([1, 2]), "$.a")
+
+
+def test_json_extract_array_from_json_strings():
+    s = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])
+    actual = bbq.json_extract_array(s, "$.a")
+    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_extract_array_from_array_strings():
+    s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
+    actual = bbq.json_extract_array(s)
+    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_extract_array_w_invalid_series_type():
+    with pytest.raises(TypeError):
+        bbq.json_extract_array(bpd.Series([1, 2]))