Skip to content

fix: Use bytes limit on frame inlining rather than element count #576

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@
"UTF-32LE",
}

# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
# TODO(tbergeron): Convert to bytes-based limit
MAX_INLINE_DF_SIZE = 5000
# BigQuery has a 1 MB query size limit; an inlined table should take up no more than a few percent of it.
# We must also assume that data encoded as text literals is much less compact than its in-memory representation.
MAX_INLINE_DF_BYTES = 5000

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -1051,7 +1051,7 @@ def _read_pandas_inline(
) -> Optional[dataframe.DataFrame]:
import bigframes.dataframe as dataframe

if pandas_dataframe.size > MAX_INLINE_DF_SIZE:
if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
return None

try:
Expand Down
7 changes: 7 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ def test_df_construct_pandas_default(scalars_dfs):
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_df_construct_large_strings():
    """Build a frame containing one very long string and check the round trip.

    The frame has a single row: a short string next to a string of more than
    50,000 characters. The result of ``to_pandas()`` must equal a pandas
    frame built from the same rows with a pyarrow-backed string dtype
    (index dtype differences are ignored).
    """
    rows = [["hello", "w" + "o" * 50000 + "rld"]]
    expected = pd.DataFrame(rows, dtype=pd.StringDtype(storage="pyarrow"))
    actual = dataframe.DataFrame(rows).to_pandas()
    pandas.testing.assert_frame_equal(actual, expected, check_index_type=False)


def test_df_construct_pandas_load_job(scalars_dfs):
# This should trigger the inlined codepath
columns = [
Expand Down
4 changes: 2 additions & 2 deletions tests/system/small/test_progress_bar.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import bigframes as bf
import bigframes.formatting_helpers as formatting_helpers
from bigframes.session import MAX_INLINE_DF_SIZE
from bigframes.session import MAX_INLINE_DF_BYTES

job_load_message_regex = r"\w+ job [\w-]+ is \w+\."

Expand Down Expand Up @@ -70,7 +70,7 @@ def test_progress_bar_load_jobs(
):
# repeat the DF to be big enough to trigger the load job.
df = penguins_pandas_df_default_index
while len(df) < MAX_INLINE_DF_SIZE:
while len(df) < MAX_INLINE_DF_BYTES:
df = pd.DataFrame(np.repeat(df.values, 2, axis=0))

bf.options.display.progress_bar = "terminal"
Expand Down