Commit 2b9a01d

fix: renable to_csv and to_json related tests (#468)
* fix: renable to_csv and to_json related tests
* fix gcs file path
* add global FIRST_GCS_FILE_SUFFIX
* trying to avoid import functions
1 parent 21b2188 commit 2b9a01d
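
Background on the change: exporting a BigFrames DataFrame to GCS (to_csv, to_json, to_parquet) goes through a wildcard URI, and BigQuery writes sharded output objects, replacing the * with a zero-padded twelve-digit shard number starting at 000000000000. Plain pandas readers need a concrete object path (only BigQuery-engine reads accept a wildcard), so the re-enabled tests read back just the first shard. A minimal sketch of the convention, with a hypothetical bucket name:

# Hypothetical paths for illustration only; the shard suffix is the
# convention the tests below rely on.
write_path = "gs://my-test-bucket/test_to_csv*.csv"    # wildcard export target
first_shard = write_path.replace("*", "000000000000")  # first exported shard
# first_shard == "gs://my-test-bucket/test_to_csv000000000000.csv"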

File tree

5 files changed: +39 −36 lines changed

tests/system/small/test_dataframe_io.py
tests/system/small/test_encryption.py
tests/system/small/test_series.py
tests/system/small/test_session.py
tests/system/utils.py


tests/system/small/test_dataframe_io.py

+18 −17

@@ -19,7 +19,7 @@
 import pyarrow as pa
 import pytest
 
-from tests.system.utils import assert_pandas_df_equal, convert_pandas_dtypes
+from tests.system import utils
 
 try:
     import pandas_gbq  # type: ignore
@@ -115,7 +115,6 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
     pd.testing.assert_series_equal(actual, expected)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
@@ -150,12 +149,12 @@ def test_to_csv_index(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path,
+        utils.get_first_file_from_wildcard(path),
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
     )
-    convert_pandas_dtypes(gcs_df, bytes_col=True)
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
 
     scalars_pandas_df = scalars_pandas_df.copy()
@@ -164,7 +163,6 @@ def test_to_csv_index(
     pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_csv_tabs(
     scalars_dfs: Tuple[bigframes.dataframe.DataFrame, pd.DataFrame],
     gcs_folder: str,
@@ -189,13 +187,13 @@ def test_to_csv_tabs(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path,
+        utils.get_first_file_from_wildcard(path),
         sep="\t",
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
     )
-    convert_pandas_dtypes(gcs_df, bytes_col=True)
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
 
     scalars_pandas_df = scalars_pandas_df.copy()
@@ -229,7 +227,7 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index):
     else:
         df_out = df_out.sort_values("rowindex_2").reset_index(drop=True)
 
-    convert_pandas_dtypes(df_out, bytes_col=False)
+    utils.convert_pandas_dtypes(df_out, bytes_col=False)
     # pd.read_gbq interpets bytes_col as object, reconvert to pyarrow binary
     df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary()))
     expected = scalars_pandas_df.copy()
@@ -415,7 +413,6 @@ def test_to_json_index_invalid_lines(
     scalars_df.to_json(path, index=index)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
@@ -435,8 +432,12 @@ def test_to_json_index_records_orient(
     """ Test the `to_json` API with `orient` is `records` and `lines` is True"""
     scalars_df.to_json(path, index=index, orient="records", lines=True)
 
-    gcs_df = pd.read_json(path, lines=True, convert_dates=["datetime_col"])
-    convert_pandas_dtypes(gcs_df, bytes_col=True)
+    gcs_df = pd.read_json(
+        utils.get_first_file_from_wildcard(path),
+        lines=True,
+        convert_dates=["datetime_col"],
+    )
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
 
@@ -474,8 +475,8 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
     # table.
     scalars_df.to_parquet(path, index=index)
 
-    gcs_df = pd.read_parquet(path.replace("*", "000000000000"))
-    convert_pandas_dtypes(gcs_df, bytes_col=False)
+    gcs_df = pd.read_parquet(utils.get_first_file_from_wildcard(path))
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=False)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
 
@@ -507,7 +508,7 @@ def test_to_sql_query_unnamed_index_included(
     pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
     roundtrip = session.read_gbq(sql, index_col=idx_ids)
     roundtrip.index.names = [None]
-    assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
+    utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
 
 
 def test_to_sql_query_named_index_included(
@@ -524,7 +525,7 @@ def test_to_sql_query_named_index_included(
 
     pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True)
     roundtrip = session.read_gbq(sql, index_col=idx_ids)
-    assert_pandas_df_equal(roundtrip.to_pandas(), pd_df)
+    utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df)
 
 
 def test_to_sql_query_unnamed_index_excluded(
@@ -539,7 +540,7 @@ def test_to_sql_query_unnamed_index_excluded(
 
     pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
     roundtrip = session.read_gbq(sql)
-    assert_pandas_df_equal(
+    utils.assert_pandas_df_equal(
         roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
     )
@@ -558,6 +559,6 @@ def test_to_sql_query_named_index_excluded(
         "rowindex_2", drop=True
     ).reset_index(drop=True)
     roundtrip = session.read_gbq(sql)
-    assert_pandas_df_equal(
+    utils.assert_pandas_df_equal(
         roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
     )
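
For reference, the round-trip pattern these tests exercise, sketched in isolation (the bucket path is a placeholder and bf_df stands in for a BigFrames DataFrame fixture; neither is part of the commit):

import pandas as pd

# Write through the wildcard URI, then read back only the first shard,
# since pd.read_csv cannot expand a GCS wildcard itself.
path = "gs://my-test-bucket/test_to_csv_index*.csv"
bf_df.to_csv(path, index=True)
gcs_df = pd.read_csv(path.replace("*", "000000000000"))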

tests/system/small/test_encryption.py

+2 −1

@@ -19,6 +19,7 @@
 
 import bigframes
 import bigframes.ml.linear_model
+from tests.system import utils
 
 
 @pytest.fixture(scope="module")
@@ -160,7 +161,7 @@ def test_read_csv_gcs(
     # Create a csv in gcs
     write_path = gcs_folder + "test_read_csv_gcs_bigquery_engine*.csv"
     read_path = (
-        write_path.replace("*", "000000000000") if engine is None else write_path
+        utils.get_first_file_from_wildcard(write_path) if engine is None else write_path
     )
     scalars_df_index.to_csv(write_path)

tests/system/small/test_series.py

+3 −4

@@ -27,6 +27,7 @@
 from tests.system.utils import (
     assert_pandas_df_equal,
     assert_series_equal,
+    get_first_file_from_wildcard,
     skip_legacy_pandas,
 )
@@ -2390,11 +2391,10 @@ def test_to_frame(scalars_dfs):
     assert_pandas_df_equal(bf_result, pd_result)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_json*.jsonl"
     scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
-    gcs_df = pd.read_json(path, lines=True)
+    gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True)
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -2404,11 +2404,10 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     )
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_csv*.csv"
     scalars_df_index["int64_col"].to_csv(path)
-    gcs_df = pd.read_csv(path)
+    gcs_df = pd.read_csv(get_first_file_from_wildcard(path))
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),

tests/system/small/test_session.py

+12 −14

@@ -30,9 +30,7 @@
 import bigframes.dataframe
 import bigframes.dtypes
 import bigframes.ml.linear_model
-from tests.system.utils import skip_legacy_pandas
-
-FIRST_FILE = "000000000000"
+from tests.system import utils
 
 
 def test_read_gbq_tokyo(
@@ -435,14 +433,14 @@ def test_read_pandas_tokyo(
     pd.testing.assert_frame_equal(result, expected)
 
 
-@skip_legacy_pandas
+@utils.skip_legacy_pandas
 def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     if scalars_df.index.name is not None:
         path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv"
     else:
         path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df.to_csv(path, index=False)
     dtype = scalars_df.dtypes.to_dict()
     dtype.pop("geography_col")
@@ -492,7 +490,7 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder):
         pytest.param("\t", id="custom_sep"),
     ],
 )
-@skip_legacy_pandas
+@utils.skip_legacy_pandas
 def test_read_csv_local_default_engine(session, scalars_dfs, sep):
     scalars_df, scalars_pandas_df = scalars_dfs
     with tempfile.TemporaryDirectory() as dir:
@@ -641,15 +639,15 @@ def test_read_csv_default_engine_throws_not_implemented_error(
         gcs_folder
         + "test_read_csv_gcs_default_engine_throws_not_implemented_error*.csv"
     )
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_index.to_csv(path)
     with pytest.raises(NotImplementedError, match=match):
         session.read_csv(read_path, **kwargs)
 
 
 def test_read_csv_gcs_default_engine_w_header(session, scalars_df_index, gcs_folder):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_header*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_index.to_csv(path)
 
     # Skips header=N rows, normally considers the N+1th row as the header, but overridden by
@@ -716,7 +714,7 @@ def test_read_csv_gcs_default_engine_w_index_col_name(
     session, scalars_df_default_index, gcs_folder
 ):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_name*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_default_index.to_csv(path)
 
     df = session.read_csv(read_path, index_col="rowindex")
@@ -731,7 +729,7 @@ def test_read_csv_gcs_default_engine_w_index_col_index(
     session, scalars_df_default_index, gcs_folder
 ):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_index*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_default_index.to_csv(path)
 
     index_col = scalars_df_default_index.columns.to_list().index("rowindex")
@@ -790,7 +788,7 @@ def test_read_csv_local_default_engine_w_index_col_index(
 def test_read_csv_gcs_w_usecols(session, scalars_df_index, gcs_folder, engine):
     path = gcs_folder + "test_read_csv_gcs_w_usecols"
     path = path + "_default_engine*.csv" if engine is None else path + "_bq_engine*.csv"
-    read_path = path.replace("*", FIRST_FILE) if engine is None else path
+    read_path = utils.get_first_file_from_wildcard(path) if engine is None else path
     scalars_df_index.to_csv(path)
 
     # df should only have 1 column which is bool_col.
@@ -902,7 +900,7 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder, e
 
     # Only bigquery engine for reads supports wildcards in path name.
     if engine != "bigquery":
-        path = path.replace("*", "000000000000")
+        path = utils.get_first_file_from_wildcard(path)
 
     df_out = (
         session.read_parquet(path, engine=engine)
@@ -1012,7 +1010,7 @@ def test_read_parquet_gcs_compression_not_supported(
 def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     path = gcs_folder + "test_read_json_gcs_bq_engine_w_index*.json"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df.to_json(path, index=False, lines=True, orient="records")
     df = session.read_json(read_path, lines=True, orient="records", engine="bigquery")
@@ -1036,7 +1034,7 @@ def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
 def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     path = gcs_folder + "test_read_json_gcs_default_engine_w_index*.json"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df.to_json(
         path,
         index=False,

tests/system/utils.py

+4 −0

@@ -304,3 +304,7 @@ def delete_cloud_function(
     request = functions_v2.DeleteFunctionRequest(name=full_name)
     operation = functions_client.delete_function(request=request)
     return operation
+
+
+def get_first_file_from_wildcard(path):
+    return path.replace("*", "000000000000")
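
The helper centralizes the first-shard suffix that was previously spread across a module-local FIRST_FILE constant in test_session.py and inline "000000000000" literals elsewhere. Usage mirrors the call sites above (the path is hypothetical):

from tests.system import utils

path = "gs://my-test-bucket/test_series_to_csv*.csv"
first_file = utils.get_first_file_from_wildcard(path)
# first_file == "gs://my-test-bucket/test_series_to_csv000000000000.csv"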
