diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index c8632ebc8c..bf91f709b0 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -2488,6 +2488,13 @@ def join(
             )
             if result is not None:
                 return result
+
+            # For block identity joins with null indices, fall back to a cross join.
+            # NOTE(review): a cross join multiplies rows unless `other` has exactly one
+            # row; confirm every caller of this path expects that.
+            if block_identity_join and how == "left":
+                return join_with_single_row(self, other)
+
             raise bigframes.exceptions.NullIndexError(
                 "Cannot implicitly align objects. Set an explicit index using set_index."
             )
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 7de4bdbc91..0d6d9d60d0 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2288,8 +2288,13 @@ def _assign_scalar(self, label: str, value: Union[int, float, str]) -> DataFrame
     def _assign_series_join_on_index(
         self, label: str, series: bigframes.series.Series
     ) -> DataFrame:
+        # Only use block_identity_join for null indices
+        use_block_identity_join = (
+            self._block.index.nlevels == 0 and series._block.index.nlevels == 0
+        )
+
         block, (get_column_left, get_column_right) = self._block.join(
-            series._block, how="left"
+            series._block, how="left", block_identity_join=use_block_identity_join
         )
 
         column_ids = [
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index a75918ed23..16a0b72379 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -601,6 +601,27 @@ def scalars_df_2_index(
     return session.read_gbq(scalars_table_id_2, index_col="rowindex")
 
 
+@pytest.fixture(scope="session")
+def scalars_df_null_index_partial_ordering(
+    scalars_table_id: str, unordered_session: bigframes.Session
+) -> bigframes.dataframe.DataFrame:
+    """DataFrame pointing at test data with null index in partial ordering mode."""
+    return unordered_session.read_gbq(
+        scalars_table_id, index_col=bigframes.enums.DefaultIndexKind.NULL
+    ).sort_values("rowindex")
+
+
+@pytest.fixture(scope="session")
+def scalars_series_null_index_partial_ordering(
+    scalars_table_id: str, unordered_session: bigframes.Session
+) -> bigframes.series.Series:
+    """Series pointing at test data with null index in partial ordering mode."""
+    df = unordered_session.read_gbq(
+        scalars_table_id, index_col=bigframes.enums.DefaultIndexKind.NULL
+    ).sort_values("rowindex")
+    return df["int64_col"]
+
+
 @pytest.fixture(scope="session")
 def scalars_pandas_df_default_index() -> pd.DataFrame:
     """pd.DataFrame pointing at test data."""
@@ -1529,3 +1550,12 @@ def audio_mm_df(
     return session.from_glob_path(
         audio_gcs_path, name="audio", connection=bq_connection
     )
+
+
+@pytest.fixture(scope="session")
+def audio_mm_df_partial_ordering(
+    audio_gcs_path, unordered_session: bigframes.Session, bq_connection: str
+) -> bpd.DataFrame:
+    return unordered_session.from_glob_path(
+        audio_gcs_path, name="audio", connection=bq_connection
+    )
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
index a594b144f5..03a8ffb61e 100644
--- a/tests/system/large/blob/test_function.py
+++ b/tests/system/large/blob/test_function.py
@@ -454,3 +454,27 @@ def test_blob_transcribe(
         assert (
             keyword.lower() in actual_text.lower()
         ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in transcribed text. "
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    [
+        "gemini-2.0-flash-001",
+        "gemini-2.0-flash-lite-001",
+    ],
+)
+def test_audio_transcribe_partial_ordering_integration(
+    audio_mm_df_partial_ordering: bpd.DataFrame,
+    model_name: str,
+):
+    """Integration test for audio_transcribe with partial ordering mode."""
+    # The fixture is created from `unordered_session`, so no global
+    # bpd.options change is needed (mutating it would leak into other tests).
+    df = audio_mm_df_partial_ordering.copy()
+
+    df["transcribed_text"] = df["audio"].blob.audio_transcribe(model_name=model_name)
+    result = df.to_pandas(ordered=False)
+
+    assert "transcribed_text" in result.columns
+    assert len(result) > 0
+    assert result["transcribed_text"].iloc[0] is not None
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index caf39bd9e9..456a0be9be 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -2949,6 +2949,44 @@ def test_df_join_series(scalars_dfs, how):
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
 
 
+def test_assign_series_with_null_index_should_add_column_correctly(
+    scalars_df_null_index_partial_ordering: bigframes.dataframe.DataFrame,
+    scalars_series_null_index_partial_ordering: bigframes.series.Series,
+):
+    """Test that DataFrame column assignment works with null indices in partial ordering mode."""
+    df = scalars_df_null_index_partial_ordering[["int64_col", "string_col"]].head(3)
+    series_to_assign = scalars_series_null_index_partial_ordering.head(3)
+    # Null indices cannot be aligned row-by-row, so the assignment falls back to a
+    # cross join: each of the 3 series values appears once per DataFrame row (3 x 3).
+    expected_series = pd.Series(
+        [
+            -987654321,
+            -987654321,
+            -987654321,
+            314159,
+            314159,
+            314159,
+            123456789,
+            123456789,
+            123456789,
+        ],
+        dtype="Int64",
+    )
+
+    # Assign the Series as a new column in the DataFrame
+    df["new_col"] = series_to_assign
+
+    # Materialize the full DataFrame to a pandas object to get the computed result.
+    result_df = df[["int64_col", "new_col"]].to_pandas()
+    result_series = result_df["new_col"]
+
+    pd.testing.assert_series_equal(
+        result_series.sort_values().reset_index(drop=True),
+        expected_series,
+        check_names=False,
+    )
+
+
 @pytest.mark.parametrize(
     ("by", "ascending", "na_position"),
     [
diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py
index a1c7c0f1a3..fe0eb76366 100644
--- a/tests/system/small/test_null_index.py
+++ b/tests/system/small/test_null_index.py
@@ -14,8 +14,11 @@
 
 
 import pandas as pd
+import pandas.testing
 import pytest
 
+import bigframes.core
+import bigframes.core.blocks as blocks
 import bigframes.exceptions
 import bigframes.pandas as bpd
 
@@ -398,5 +401,42 @@ def test_null_index_transpose(scalars_df_null_index):
         _ = scalars_df_null_index.T
 
 
 def test_null_index_contains(scalars_df_null_index):
     assert 3 not in scalars_df_null_index
+
+
+@pytest.mark.parametrize("session_fixture", ["session", "unordered_session"])
+def test_identity_join_with_null_index_should_return_cartesian_product(
+    request, session_fixture
+):
+    """Test Block.join with block_identity_join=True on two local blocks.
+
+    NOTE(review): the expected frame pairs rows one-to-one (3 rows), not as a
+    3 x 3 cartesian product; confirm that Block.from_local yields a null index
+    here and that the test name matches what is exercised.
+    """
+    session = request.getfixturevalue(session_fixture)
+    left_data = pd.DataFrame({"a": [1, 2, 3]})
+    right_data = pd.DataFrame({"b": [10, 20, 30]})
+
+    left_block = blocks.Block.from_local(left_data, session=session)
+    right_block = blocks.Block.from_local(right_data, session=session)
+
+    expected_df = pd.DataFrame(
+        {
+            "a": [1, 2, 3],
+            "b": [10, 20, 30],
+        }
+    )
+
+    # Perform the identity join; the returned column mappings are unused here.
+    result_block, _ = left_block.join(
+        right_block, how="left", block_identity_join=True
+    )
+
+    result_df, _ = result_block.to_pandas()
+    pandas.testing.assert_frame_equal(
+        result_df.sort_values(by=["a", "b"]).reset_index(drop=True),
+        expected_df,
+        check_dtype=False,
+    )