
Commit 0c9eb40

Revert "feat: Support max_columns in repr and make repr more efficient (#515)"
This reverts commit 54e49cf.
1 parent 347f2dd commit 0c9eb40

5 files changed: +65 -70


bigframes/core/blocks.py

Lines changed: 13 additions & 29 deletions

@@ -467,23 +467,6 @@ def to_pandas_batches(self):
             self._copy_index_to_pandas(df)
             yield df

-    def download_pandas_preview(
-        self, max_rows: int
-    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
-        """Download one page of results and return the query job."""
-        dtypes = dict(zip(self.index_columns, self.index.dtypes))
-        dtypes.update(zip(self.value_columns, self.dtypes))
-        results_iterator, query_job = self.session._execute(
-            self.expr, sorted=True, max_results=max_rows
-        )
-        arrow_results_iterator = results_iterator.to_arrow_iterable()
-        arrow_table = next(arrow_results_iterator)
-        downloaded_df = bigframes.session._io.pandas.arrow_to_pandas(
-            arrow_table, dtypes
-        )
-        self._copy_index_to_pandas(downloaded_df)
-        return downloaded_df, query_job
-
     def _copy_index_to_pandas(self, df: pd.DataFrame):
         """Set the index on pandas DataFrame to match this block.

@@ -1314,25 +1297,26 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
     # queries.
     @functools.cache
     def retrieve_repr_request_results(
-        self, max_results: int, max_columns: int
-    ) -> Tuple[pd.DataFrame, Tuple[int, int], bigquery.QueryJob]:
+        self, max_results: int
+    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
         """
         Retrieves a pandas dataframe containing only max_results many rows for use
         with printing methods.

-        Returns a tuple of the dataframe preview for printing and the overall number
-        of rows and columns of the table, as well as the query job used.
+        Returns a tuple of the dataframe and the overall number of rows of the query.
         """
-        pandas_df, query_job = self.download_pandas_preview(max_results)
-        row_count = self.session._get_table_row_count(query_job.destination)
-        column_count = len(self.value_columns)
-
-        formatted_df = pandas_df.set_axis(self.column_labels, axis=1)
+        # TODO(swast): Select a subset of columns if max_columns is less than the
+        # number of columns in the schema.
+        count = self.shape[0]
+        if count > max_results:
+            head_block = self.slice(0, max_results)
+        else:
+            head_block = self
+        computed_df, query_job = head_block.to_pandas()
+        formatted_df = computed_df.set_axis(self.column_labels, axis=1)
         # we reset the axis and substitute the bf index name for the default
         formatted_df.index.name = self.index.name
-        # limit column count
-        formatted_df = formatted_df.iloc[:, 0:max_columns]
-        return formatted_df, (row_count, column_count), query_job
+        return formatted_df, count, query_job

     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
         result_id = guid.generate_guid()

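A quick illustration of what this revert restores in retrieve_repr_request_results: the block now counts rows first and slices only when the total exceeds max_results, instead of downloading a single result page. The sketch below mirrors that flow with plain pandas (no BigQuery); the helper name retrieve_repr_preview and the sample frame are made up for illustration.

import pandas as pd

def retrieve_repr_preview(df: pd.DataFrame, max_results: int) -> tuple[pd.DataFrame, int]:
    # Count first, then slice only if the frame is larger than the requested preview.
    count = df.shape[0]
    head = df.iloc[:max_results] if count > max_results else df
    return head, count

preview, total_rows = retrieve_repr_preview(pd.DataFrame({"a": range(100)}), max_results=25)
print(len(preview), total_rows)  # 25 100
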
bigframes/core/indexes/index.py

Lines changed: 5 additions & 5 deletions

@@ -205,17 +205,17 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
         return self._query_job

     def __repr__(self) -> str:
+        # TODO(swast): Add a timeout here? If the query is taking a long time,
+        # maybe we just print the job metadata that we have so far?
+        # TODO(swast): Avoid downloading the whole series by using job
+        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)

-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
         self._query_job = query_job
-
         return repr(pandas_df.index)

     def copy(self, name: Optional[Hashable] = None):

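For context, Index.__repr__ above now asks the block for at most max_rows rows and delegates to the pandas Index repr. A minimal plain-pandas sketch of that path (the name index_preview_repr and the sample data are illustrative only):

import pandas as pd

def index_preview_repr(df: pd.DataFrame, max_rows: int) -> str:
    # Preview only the first max_rows rows, then let pandas format the index.
    preview = df.iloc[:max_rows]
    return repr(preview.index)

print(index_preview_repr(pd.DataFrame({"a": range(50)}), max_rows=10))
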
bigframes/dataframe.py

Lines changed: 40 additions & 26 deletions

@@ -579,57 +579,71 @@ def __setattr__(self, key: str, value):
         object.__setattr__(self, key, value)

     def __repr__(self) -> str:
-        """Converts a DataFrame to a string using pandas dataframe __repr__.
+        """Converts a DataFrame to a string. Calls to_pandas.

-        Only represents the first `bigframes.options.display.max_rows`
-        and `bigframes.options.display.max_columns`.
+        Only represents the first `bigframes.options.display.max_rows`.
         """
-        if bigframes.options.display.repr_mode == "deferred":
+        opts = bigframes.options.display
+        max_results = opts.max_rows
+        if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)

-        pandas_df, shape = self._perform_repr_request()
-        with display_options.pandas_repr(bigframes.options.display):
+        self._cached()
+        # TODO(swast): pass max_columns and get the true column count back. Maybe
+        # get 1 more column than we have requested so that pandas can add the
+        # ... for us?
+        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
+            max_results
+        )
+
+        self._set_internal_query_job(query_job)
+
+        column_count = len(pandas_df.columns)
+
+        with display_options.pandas_repr(opts):
             repr_string = repr(pandas_df)

         # Modify the end of the string to reflect count.
         lines = repr_string.split("\n")
         pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
         if pattern.match(lines[-1]):
             lines = lines[:-2]
-        if shape[0] > len(lines) - 1:
+
+        if row_count > len(lines) - 1:
             lines.append("...")
+
         lines.append("")
-        lines.append(f"[{shape[0]} rows x {shape[1]} columns]")
+        lines.append(f"[{row_count} rows x {column_count} columns]")
         return "\n".join(lines)

-    def _perform_repr_request(self) -> Tuple[pandas.DataFrame, Tuple[int, int]]:
-        max_results = bigframes.options.display.max_rows
-        max_columns = bigframes.options.display.max_columns
-        self._cached()
-        pandas_df, shape, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
-        self._set_internal_query_job(query_job)
-        return pandas_df, shape
-
     def _repr_html_(self) -> str:
         """
         Returns an html string primarily for use by notebooks for displaying
-        a representation of the DataFrame. Displays at most the number of rows
-        and columns given by `bigframes.options.display.max_rows` and
-        `bigframes.options.display.max_columns`.
+        a representation of the DataFrame. Displays 20 rows by default since
+        many notebooks are not configured for large tables.
         """
-
-        if bigframes.options.display.repr_mode == "deferred":
+        opts = bigframes.options.display
+        max_results = bigframes.options.display.max_rows
+        if opts.repr_mode == "deferred":
             return formatter.repr_query_job_html(self.query_job)

-        pandas_df, shape = self._perform_repr_request()
+        self._cached()
+        # TODO(swast): pass max_columns and get the true column count back. Maybe
+        # get 1 more column than we have requested so that pandas can add the
+        # ... for us?
+        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
+            max_results
+        )
+
+        self._set_internal_query_job(query_job)
+
+        column_count = len(pandas_df.columns)

-        with display_options.pandas_repr(bigframes.options.display):
+        with display_options.pandas_repr(opts):
             # _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy.
             html_string = pandas_df._repr_html_()  # type:ignore

-        html_string += f"[{shape[0]} rows x {shape[1]} columns in total]"
+        html_string += f"[{row_count} rows x {column_count} columns in total]"
         return html_string

     def __setitem__(self, key: str, value: SingleItemValue):

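The footer rewrite in DataFrame.__repr__ above can be exercised on its own: render the preview, strip any "[N rows x M columns]" footer pandas added, append "..." when the preview holds fewer rows than the full result, then append the true totals. A self-contained sketch with plain pandas (the function name repr_with_total_counts and the sample inputs are made up for illustration):

import re
import pandas as pd

def repr_with_total_counts(preview: pd.DataFrame, row_count: int, column_count: int) -> str:
    repr_string = repr(preview)
    lines = repr_string.split("\n")
    # Drop the footer (and the blank line before it) that pandas adds for truncated frames.
    pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
    if pattern.match(lines[-1]):
        lines = lines[:-2]
    # Hint that the preview is truncated relative to the full result.
    if row_count > len(lines) - 1:
        lines.append("...")
    lines.append("")
    lines.append(f"[{row_count} rows x {column_count} columns]")
    return "\n".join(lines)

print(repr_with_total_counts(pd.DataFrame({"a": range(5)}), row_count=1000, column_count=1))
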
bigframes/series.py

Lines changed: 5 additions & 4 deletions

@@ -272,16 +272,17 @@ def reset_index(
         return bigframes.dataframe.DataFrame(block)

     def __repr__(self) -> str:
+        # TODO(swast): Add a timeout here? If the query is taking a long time,
+        # maybe we just print the job metadata that we have so far?
+        # TODO(swast): Avoid downloading the whole series by using job
+        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)

         self._cached()
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
-            max_results, max_columns
-        )
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
         self._set_internal_query_job(query_job)

         return repr(pandas_df.iloc[:, 0])

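Series.__repr__ above receives the block preview as a one-column pandas DataFrame and delegates to pandas on its first column. A tiny illustrative sketch (the name series_preview_repr and the sample data are hypothetical):

import pandas as pd

def series_preview_repr(preview: pd.DataFrame) -> str:
    # The preview arrives as a one-column DataFrame; repr its first column as a Series.
    return repr(preview.iloc[:, 0])

print(series_preview_repr(pd.DataFrame({"values": [1, 2, 3]})))
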
bigframes/session/__init__.py

Lines changed: 2 additions & 6 deletions

@@ -1832,7 +1832,6 @@ def _execute(
         sorted: bool = True,
         dry_run=False,
         col_id_overrides: Mapping[str, str] = {},
-        max_results: Optional[int] = None,
     ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         sql = self._to_sql(
             array_value, sorted=sorted, col_id_overrides=col_id_overrides
@@ -1842,7 +1841,8 @@
         else:
             job_config.dry_run = dry_run
         return self._start_query(
-            sql=sql, job_config=job_config, max_results=max_results
+            sql=sql,
+            job_config=job_config,
         )

     def _peek(
@@ -1887,10 +1887,6 @@ def _get_table_size(self, destination_table):
         table = self.bqclient.get_table(destination_table)
         return table.num_bytes

-    def _get_table_row_count(self, destination_table) -> int:
-        table = self.bqclient.get_table(destination_table)
-        return table.num_rows
-
     def _rows_to_dataframe(
         self, row_iterator: bigquery.table.RowIterator, dtypes: Dict
     ) -> pandas.DataFrame:

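The removed _get_table_row_count helper read the row count from destination-table metadata rather than running a query. A minimal sketch of that idea with the google-cloud-bigquery client (assumes the library is installed and credentials are configured; the function name table_row_count is illustrative):

from google.cloud import bigquery

def table_row_count(client: bigquery.Client, destination_table) -> int:
    # A single metadata call; no query is executed and no bytes are scanned.
    table = client.get_table(destination_table)
    return table.num_rows

# Example usage (requires credentials and a real table reference):
# client = bigquery.Client()
# print(table_row_count(client, "my-project.my_dataset.my_table"))
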