Skip to content

feat: Support audio_transcribe with partial ordering #1908

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
5 changes: 5 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2488,6 +2488,11 @@ def join(
)
if result is not None:
return result

# For block identity joins with null indices, perform a cross join
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem desirable. If df1 is n rows and df2 is m rows, won't this end up with n x m rows?

if block_identity_join and how == "left":
return join_with_single_row(self, other)

raise bigframes.exceptions.NullIndexError(
"Cannot implicitly align objects. Set an explicit index using set_index."
)
Expand Down
7 changes: 6 additions & 1 deletion bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2288,8 +2288,13 @@ def _assign_scalar(self, label: str, value: Union[int, float, str]) -> DataFrame
def _assign_series_join_on_index(
self, label: str, series: bigframes.series.Series
) -> DataFrame:
# Only use block_identity_join for null indices
use_block_identity_join = (
self._block.index.nlevels == 0 and series._block.index.nlevels == 0
)

block, (get_column_left, get_column_right) = self._block.join(
series._block, how="left"
series._block, how="left", block_identity_join=use_block_identity_join
)

column_ids = [
Expand Down
30 changes: 30 additions & 0 deletions tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,27 @@ def scalars_df_2_index(
return session.read_gbq(scalars_table_id_2, index_col="rowindex")


@pytest.fixture(scope="session")
def scalars_df_null_index_partial_ordering(
    scalars_table_id: str, unordered_session: bigframes.Session
) -> bigframes.dataframe.DataFrame:
    """Return the scalars test table as a null-index DataFrame sorted by "rowindex".

    The table is read through ``unordered_session`` with
    ``DefaultIndexKind.NULL`` as the index, then sorted on the "rowindex"
    column. Presumably ``unordered_session`` is the partial-ordering session;
    confirm against that fixture's definition.
    """
    null_index_df = unordered_session.read_gbq(
        scalars_table_id,
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    return null_index_df.sort_values("rowindex")


@pytest.fixture(scope="session")
def scalars_series_null_index_partial_ordering(
    scalars_table_id: str, unordered_session: bigframes.Session
) -> bigframes.series.Series:
    """Return the "int64_col" column of the scalars test table as a null-index Series.

    The table is read through ``unordered_session`` with
    ``DefaultIndexKind.NULL`` as the index and sorted on "rowindex" before the
    column is selected.
    """
    return unordered_session.read_gbq(
        scalars_table_id,
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    ).sort_values("rowindex")["int64_col"]


@pytest.fixture(scope="session")
def scalars_pandas_df_default_index() -> pd.DataFrame:
"""pd.DataFrame pointing at test data."""
Expand Down Expand Up @@ -1529,3 +1550,12 @@ def audio_mm_df(
return session.from_glob_path(
audio_gcs_path, name="audio", connection=bq_connection
)


@pytest.fixture(scope="session")
def audio_mm_df_partial_ordering(
    audio_gcs_path, unordered_session: bigframes.Session, bq_connection: str
) -> bpd.DataFrame:
    """Return a DataFrame with an "audio" column built from ``audio_gcs_path``.

    The frame is created by ``unordered_session.from_glob_path`` using
    ``bq_connection`` as the connection.
    """
    audio_df = unordered_session.from_glob_path(
        audio_gcs_path,
        name="audio",
        connection=bq_connection,
    )
    return audio_df
23 changes: 23 additions & 0 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,3 +454,26 @@ def test_blob_transcribe(
assert (
keyword.lower() in actual_text.lower()
), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in transcribed text. "


@pytest.mark.parametrize(
    "model_name",
    [
        "gemini-2.0-flash-001",
        "gemini-2.0-flash-lite-001",
    ],
)
def test_audio_transcribe_partial_ordering_integration(
    audio_mm_df_partial_ordering: bpd.DataFrame,
    model_name: str,
):
    """Integration test for audio_transcribe with partial ordering mode.

    Assigns the transcription of the "audio" column as a new
    "transcribed_text" column and checks that the materialized result has the
    new column, exactly as many rows as the input frame, and a non-None first
    transcription.
    """
    # Count the input rows BEFORE the column assignment: if the assignment
    # multiplied rows (e.g. through a cross join), counting afterwards would
    # hide the problem.
    expected_row_count = audio_mm_df_partial_ordering.shape[0]

    df = audio_mm_df_partial_ordering.copy()
    # NOTE(review): this sets a process-wide option and never restores it, so
    # it can leak into other tests; confirm it is needed given that the
    # fixture already builds the frame from an explicit session.
    bpd.options.bigquery.ordering_mode = "partial"

    df["transcribed_text"] = df["audio"].blob.audio_transcribe(model_name=model_name)
    result = df.to_pandas(ordered=False)

    assert "transcribed_text" in result.columns
    assert len(result) > 0
    # Review feedback: the number of rows in result should be exactly equal to
    # the number of rows in audio_mm_df_partial_ordering.
    assert len(result) == expected_row_count
    assert result["transcribed_text"].iloc[0] is not None
36 changes: 36 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2949,6 +2949,42 @@ def test_df_join_series(scalars_dfs, how):
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)


def test_assign_series_with_null_index_should_add_column_correctly(
    scalars_df_null_index_partial_ordering: bigframes.dataframe.DataFrame,
    scalars_series_null_index_partial_ordering: bigframes.series.Series,
):
    """Test that DataFrame column assignment works with null indices in partial ordering mode.

    Takes the first three rows of the frame and of the series, assigns the
    series as "new_col", and compares the sorted "new_col" values with an
    expectation in which each of three values appears three times (nine rows).

    NOTE(review): nine rows from two three-row inputs looks like a row
    multiplication; confirm this is the intended result of the assignment.
    """
    frame = scalars_df_null_index_partial_ordering[["int64_col", "string_col"]].head(3)
    new_values = scalars_series_null_index_partial_ordering.head(3)

    # Assign the Series as a new column in the DataFrame.
    frame["new_col"] = new_values

    # Materialize to pandas and normalize the row order before comparing.
    materialized = frame[["int64_col", "new_col"]].to_pandas()
    actual = materialized["new_col"].sort_values().reset_index(drop=True)

    # Each distinct value is expected three times, in ascending order.
    expected = pd.Series(
        [value for value in (-987654321, 314159, 123456789) for _ in range(3)],
        dtype="Int64",
    )

    pd.testing.assert_series_equal(actual, expected, check_names=False)


@pytest.mark.parametrize(
("by", "ascending", "na_position"),
[
Expand Down
41 changes: 39 additions & 2 deletions tests/system/small/test_null_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@


import pandas as pd
import pandas.testing
import pytest

import bigframes.core
import bigframes.core.blocks as blocks
import bigframes.exceptions
import bigframes.pandas as bpd

Expand Down Expand Up @@ -398,5 +401,39 @@ def test_null_index_transpose(scalars_df_null_index):
_ = scalars_df_null_index.T


def test_null_index_contains(scalars_df_null_index):
    """Check that the value 3 is not reported as contained in the null-index frame."""
    is_member = 3 in scalars_df_null_index
    assert not is_member
@pytest.mark.parametrize(
    ("session_fixture",),
    [
        pytest.param("session"),
        pytest.param("unordered_session"),
    ],
)
def test_identity_join_with_null_index_should_return_cartesian_product(
    request, session_fixture
):
    """Test the Block.join method with block_identity_join=True and null indices.

    Builds two three-row local blocks, left-joins them with
    ``block_identity_join=True``, and compares the sorted result against a
    three-row frame pairing 1/10, 2/20 and 3/30.

    NOTE(review): the test name says "cartesian product", but the expected
    frame has three row-aligned rows rather than nine; confirm which is meant.
    """
    session = request.getfixturevalue(session_fixture)

    left_block = blocks.Block.from_local(
        pd.DataFrame({"a": [1, 2, 3]}), session=session
    )
    right_block = blocks.Block.from_local(
        pd.DataFrame({"b": [10, 20, 30]}), session=session
    )

    # Perform the identity join on the two blocks; the column mappings are
    # unpacked but not inspected by this test.
    joined_block, (_left_ids, _right_ids) = left_block.join(
        right_block, how="left", block_identity_join=True
    )

    actual, _ = joined_block.to_pandas()
    actual = actual.sort_values(by=["a", "b"]).reset_index(drop=True)

    expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    pandas.testing.assert_frame_equal(actual, expected, check_dtype=False)