Skip to content

Commit 60f13e7

Browse files
chore: make benchmark read subfolders and add three groupby benchmarks. (#754)
* chore: make benchmark read subfolders. * group_by test for 1E9 rows * update q1 * update noxfile for lint path. * update performance report logic * format update * 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent dff4d6e commit 60f13e7

File tree

4 files changed

+64
-14
lines changed

4 files changed

+64
-14
lines changed

noxfile.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,15 @@
3535
# https://github.com/str0zzapreti/pytest-retry/issues/32
3636
PYTEST_VERSION = "pytest<8.0.0dev"
3737
SPHINX_VERSION = "sphinx==4.5.0"
38-
LINT_PATHS = ["docs", "bigframes", "tests", "third_party", "noxfile.py", "setup.py"]
38+
LINT_PATHS = [
39+
"docs",
40+
"bigframes",
41+
"tests",
42+
"third_party",
43+
"noxfile.py",
44+
"setup.py",
45+
os.path.join("scripts", "benchmark"),
46+
]
3947

4048
DEFAULT_PYTHON_VERSION = "3.10"
4149

@@ -813,22 +821,17 @@ def notebook(session: nox.Session):
813821
@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
814822
def benchmark(session: nox.Session):
815823
session.install("-e", ".[all]")
824+
base_path = os.path.join("scripts", "benchmark")
816825

817-
benchmark_script_list = list(Path("scripts/benchmark/").glob("*.py"))
818-
826+
benchmark_script_list = list(Path(base_path).rglob("*.py"))
819827
# Run benchmarks in parallel session.run's, since each benchmark
820828
# takes an environment variable for performance logging
821829
processes = []
822830
for benchmark in benchmark_script_list:
823831
process = Process(
824832
target=session.run,
825833
args=("python", benchmark),
826-
kwargs={
827-
"env": {
828-
LOGGING_NAME_ENV_VAR: "scripts/benchmark/"
829-
+ os.path.basename(benchmark)
830-
}
831-
},
834+
kwargs={"env": {LOGGING_NAME_ENV_VAR: benchmark.as_posix()}},
832835
)
833836
process.start()
834837
processes.append(process)
@@ -839,7 +842,7 @@ def benchmark(session: nox.Session):
839842
# when the environment variable is set as it is above,
840843
# notebooks output a .bytesprocessed and .slotmillis report
841844
# collect those reports and print a summary
842-
_print_performance_report("scripts/")
845+
_print_performance_report(base_path)
843846

844847

845848
def _print_performance_report(path: str):
@@ -852,19 +855,24 @@ def _print_performance_report(path: str):
852855
"""
853856
print("---BIGQUERY USAGE REPORT---")
854857
results_dict = {}
855-
for bytes_report in Path(path).glob("*/*.bytesprocessed"):
858+
bytes_reports = sorted(Path(path).rglob("*.bytesprocessed"), key=lambda x: x.name)
859+
for bytes_report in bytes_reports:
856860
with open(bytes_report, "r") as bytes_file:
857-
filename = bytes_report.stem
861+
filename = bytes_report.relative_to(path).with_suffix("")
858862
lines = bytes_file.read().splitlines()
859863
query_count = len(lines)
860864
total_bytes = sum([int(line) for line in lines])
861865
results_dict[filename] = [query_count, total_bytes]
862-
for millis_report in Path(path).glob("*/*.slotmillis"):
866+
os.remove(bytes_report)
867+
868+
millis_reports = sorted(Path(path).rglob("*.slotmillis"), key=lambda x: x.name)
869+
for millis_report in millis_reports:
863870
with open(millis_report, "r") as millis_file:
864-
filename = millis_report.stem
871+
filename = millis_report.relative_to(path).with_suffix("")
865872
lines = millis_file.read().splitlines()
866873
total_slot_millis = sum([int(line) for line in lines])
867874
results_dict[filename] += [total_slot_millis]
875+
os.remove(millis_report)
868876

869877
cumulative_queries = 0
870878
cumulative_bytes = 0
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py
2+
3+
import bigframes.pandas as bpd
4+
5+
print("Groupby benchmark 1: sum v1 by id1")
6+
7+
x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0")
8+
9+
ans = x.groupby("id1", as_index=False, dropna=False).agg({"v1": "sum"})
10+
print(ans.shape)
11+
chk = [ans["v1"].sum()]
12+
print(chk)
13+
14+
bpd.reset_session()
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py
2+
3+
import bigframes.pandas as bpd
4+
5+
print("Groupby benchmark 2: sum v1 by id1:id2")
6+
7+
x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0")
8+
9+
ans = x.groupby(["id1", "id2"], as_index=False, dropna=False).agg({"v1": "sum"})
10+
print(ans.shape)
11+
chk = [ans["v1"].sum()]
12+
print(chk)
13+
14+
bpd.reset_session()
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py
2+
3+
import bigframes.pandas as bpd
4+
5+
print("Groupby benchmark 3: sum v1 mean v3 by id3")
6+
7+
x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0")
8+
9+
ans = x.groupby("id3", as_index=False, dropna=False).agg({"v1": "sum", "v3": "mean"})
10+
print(ans.shape)
11+
chk = [ans["v1"].sum(), ans["v3"].sum()]
12+
print(chk)
13+
14+
bpd.reset_session()

0 commit comments

Comments
 (0)