From 37b423a18f536fc45b6679ecdc23da3fa5d5a26b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 14:31:57 -0500 Subject: [PATCH 01/43] DBGymConfig -> DBGymWorkspace --- benchmark/cli.py | 6 +-- benchmark/job/cli.py | 44 +++++++-------- benchmark/job/load_info.py | 8 +-- benchmark/tpch/cli.py | 62 ++++++++++----------- benchmark/tpch/load_info.py | 8 +-- dbms/cli.py | 6 +-- dbms/postgres/cli.py | 90 +++++++++++++++---------------- env/integtest_pg_conn.py | 2 +- env/integtest_replay.py | 4 +- env/integtest_tuning_artifacts.py | 6 +-- env/integtest_util.py | 30 +++++------ env/integtest_workload.py | 4 +- env/pg_conn.py | 16 +++--- env/replay.py | 8 +-- env/tuning_artifacts.py | 8 +-- env/workload.py | 10 ++-- manage/cli.py | 46 ++++++++-------- manage/tests/unittest_clean.py | 50 ++++++++--------- task.py | 8 +-- util/pg.py | 10 ++-- util/workspace.py | 78 +++++++++++++-------------- 21 files changed, 252 insertions(+), 252 deletions(-) diff --git a/benchmark/cli.py b/benchmark/cli.py index 17ca442b..1cf63418 100644 --- a/benchmark/cli.py +++ b/benchmark/cli.py @@ -2,13 +2,13 @@ from benchmark.job.cli import job_group from benchmark.tpch.cli import tpch_group -from util.workspace import DBGymConfig +from util.workspace import DBGymWorkspace @click.group(name="benchmark") @click.pass_obj -def benchmark_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("benchmark") +def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("benchmark") benchmark_group.add_command(tpch_group) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index d13b5ddf..3cf2e60d 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -7,7 +7,7 @@ from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import ( - DBGymConfig, + DBGymWorkspace, get_default_tables_dname, get_workload_name, is_fully_resolved, @@ -136,8 +136,8 @@ @click.group(name="job") @click.pass_obj -def job_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("job") +def job_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("job") @job_group.command(name="data") @@ -146,9 +146,9 @@ def job_group(dbgym_cfg: DBGymConfig) -> None: @click.pass_obj # The reason generate data is separate from create dbdata is because generate-data is generic # to all DBMSs while create dbdata is specific to a single DBMS. -def job_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: +def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR - _download_job_data(dbgym_cfg) + _download_job_data(dbgym_workspace) @job_group.command(name="workload") @@ -160,25 +160,25 @@ def job_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) @click.pass_obj def job_workload( - dbgym_cfg: DBGymConfig, query_subset: str, scale_factor: float + dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float ) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR - _download_job_queries(dbgym_cfg) - _generate_job_workload(dbgym_cfg, query_subset) + _download_job_queries(dbgym_workspace) + _generate_job_workload(dbgym_workspace, query_subset) -def _download_job_data(dbgym_cfg: DBGymConfig) -> None: +def _download_job_data(dbgym_workspace: DBGymWorkspace) -> None: _download_and_untar_dir( - dbgym_cfg, + dbgym_workspace, JOB_TABLES_URL, "imdb.tgz", get_default_tables_dname(DEFAULT_SCALE_FACTOR), ) -def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: +def _download_job_queries(dbgym_workspace: DBGymWorkspace) -> None: _download_and_untar_dir( - dbgym_cfg, + dbgym_workspace, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME, @@ -187,7 +187,7 @@ def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: def _download_and_untar_dir( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, download_url: str, download_tarred_fname: str, untarred_dname: str, @@ -200,7 +200,7 @@ def _download_and_untar_dir( `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`. """ expected_symlink_dpath = ( - dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" + dbgym_workspace.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" ) if expected_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -209,9 +209,9 @@ def _download_and_untar_dir( return logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") - real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) + real_data_path = dbgym_workspace.cur_task_runs_data_path(mkdir=True) subprocess_run(f"curl -O {download_url}", cwd=real_data_path) - untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname) + untarred_data_dpath = dbgym_workspace.cur_task_runs_data_path(untarred_dname) if untarred_original_dname is not None: assert not untarred_data_dpath.exists() @@ -226,24 +226,24 @@ def _download_and_untar_dir( assert untarred_data_dpath.exists() subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path) - symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath) + symlink_dpath = link_result(dbgym_workspace, untarred_data_dpath) assert expected_symlink_dpath.samefile(symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}") def _generate_job_workload( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, query_subset: str, ) -> None: workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset) - expected_workload_symlink_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True) / ( + expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True) / ( workload_name + ".link" ) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_dpath}" ) - real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True) + real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) query_names = None if query_subset == "all": @@ -258,7 +258,7 @@ def _generate_job_workload( with open(real_dpath / "order.txt", "w") as f: for qname in query_names: sql_fpath = ( - dbgym_cfg.cur_symlinks_data_path(mkdir=True) + dbgym_workspace.cur_symlinks_data_path(mkdir=True) / (f"{JOB_QUERIES_DNAME}.link") ).resolve() / f"{qname}.sql" assert is_fully_resolved( @@ -266,7 +266,7 @@ def _generate_job_workload( ), "We should only write existent real absolute paths to a file" f.write(f"Q{qname},{sql_fpath}\n") - workload_symlink_dpath = link_result(dbgym_cfg, real_dpath) + workload_symlink_dpath = link_result(dbgym_workspace, real_dpath) assert workload_symlink_dpath == expected_workload_symlink_dpath logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_workload_symlink_dpath}" diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index 4df370b6..58482b4d 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -3,7 +3,7 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymConfig, get_default_tables_dname, is_fully_resolved +from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved JOB_SCHEMA_FNAME = "job_schema.sql" @@ -35,9 +35,9 @@ class JobLoadInfo(LoadInfoBaseClass): "title", ] - def __init__(self, dbgym_cfg: DBGymConfig): + def __init__(self, dbgym_workspace: DBGymWorkspace): # schema and constraints - schema_root_dpath = dbgym_cfg.dbgym_repo_path + schema_root_dpath = dbgym_workspace.dbgym_repo_path for component in JobLoadInfo.CODEBASE_PATH_COMPONENTS[ 1: ]: # [1:] to skip "dbgym" @@ -49,7 +49,7 @@ def __init__(self, dbgym_cfg: DBGymConfig): # Tables data_root_dpath = ( - dbgym_cfg.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data" + dbgym_workspace.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data" ) tables_symlink_dpath = ( data_root_dpath / f"{get_default_tables_dname(DEFAULT_SCALE_FACTOR)}.link" diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 1a3314bb..5d6ce80d 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -8,7 +8,7 @@ from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import ( - DBGymConfig, + DBGymWorkspace, get_default_tables_dname, get_scale_factor_string, get_workload_name, @@ -19,8 +19,8 @@ @click.group(name="tpch") @click.pass_obj -def tpch_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("tpch") +def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("tpch") @tpch_group.command(name="data") @@ -28,9 +28,9 @@ def tpch_group(dbgym_cfg: DBGymConfig) -> None: @click.pass_obj # The reason generate data is separate from create dbdata is because generate-data is generic # to all DBMSs while create dbdata is specific to a single DBMS. -def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: - _clone_tpch_kit(dbgym_cfg) - _generate_data(dbgym_cfg, scale_factor) +def tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + _clone_tpch_kit(dbgym_workspace) + _generate_data(dbgym_workspace, scale_factor) @tpch_group.command(name="workload") @@ -54,7 +54,7 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) @click.pass_obj def tpch_workload( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, query_subset: str, @@ -63,18 +63,18 @@ def tpch_workload( assert ( seed_start <= seed_end ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" - _clone_tpch_kit(dbgym_cfg) - _generate_tpch_queries(dbgym_cfg, seed_start, seed_end, scale_factor) - _generate_tpch_workload(dbgym_cfg, seed_start, seed_end, query_subset, scale_factor) + _clone_tpch_kit(dbgym_workspace) + _generate_tpch_queries(dbgym_workspace, seed_start, seed_end, scale_factor) + _generate_tpch_workload(dbgym_workspace, seed_start, seed_end, query_subset, scale_factor) def _get_queries_dname(seed: int, scale_factor: float) -> str: return f"queries_{seed}_sf{get_scale_factor_string(scale_factor)}" -def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None: +def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: expected_symlink_dpath = ( - dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" + dbgym_workspace.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" ) if expected_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -83,26 +83,26 @@ def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None: return logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}") - real_build_path = dbgym_cfg.cur_task_runs_build_path() + real_build_path = dbgym_workspace.cur_task_runs_build_path() subprocess_run( - f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path() + f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_workspace.cur_source_path() ) - symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit") + symlink_dpath = link_result(dbgym_workspace, real_build_path / "tpch-kit") assert expected_symlink_dpath.samefile(symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}") -def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path: - tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve() +def _get_tpch_kit_dpath(dbgym_workspace: DBGymWorkspace) -> Path: + tpch_kit_dpath = (dbgym_workspace.cur_symlinks_build_path() / "tpch-kit.link").resolve() assert is_fully_resolved(tpch_kit_dpath) return tpch_kit_dpath def _generate_tpch_queries( - dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float + dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float ) -> None: - tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg) - data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True) + tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace) + data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating queries: {data_path} [{seed_start}, {seed_end}]" ) @@ -113,7 +113,7 @@ def _generate_tpch_queries( if expected_queries_symlink_dpath.exists(): continue - real_dir = dbgym_cfg.cur_task_runs_data_path( + real_dir = dbgym_workspace.cur_task_runs_data_path( _get_queries_dname(seed, scale_factor), mkdir=True ) for i in range(1, NUM_TPCH_QUERIES + 1): @@ -123,16 +123,16 @@ def _generate_tpch_queries( cwd=tpch_kit_dpath / "dbgen", verbose=False, ) - queries_symlink_dpath = link_result(dbgym_cfg, real_dir) + queries_symlink_dpath = link_result(dbgym_workspace, real_dir) assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated queries: {data_path} [{seed_start}, {seed_end}]" ) -def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: - tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg) - data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True) +def _generate_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace) + data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True) expected_tables_symlink_dpath = ( data_path / f"{get_default_tables_dname(scale_factor)}.link" ) @@ -146,12 +146,12 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: f"Generating: {expected_tables_symlink_dpath}" ) subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen") - real_dir = dbgym_cfg.cur_task_runs_data_path( + real_dir = dbgym_workspace.cur_task_runs_data_path( get_default_tables_dname(scale_factor), mkdir=True ) subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen") - tables_symlink_dpath = link_result(dbgym_cfg, real_dir) + tables_symlink_dpath = link_result(dbgym_workspace, real_dir) assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_tables_symlink_dpath}" @@ -159,13 +159,13 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: def _generate_tpch_workload( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, query_subset: str, scale_factor: float, ) -> None: - symlink_data_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True) + symlink_data_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True) workload_name = get_workload_name( scale_factor, f"{seed_start}_{seed_end}_{query_subset}" ) @@ -174,7 +174,7 @@ def _generate_tpch_workload( logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_dpath}" ) - real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True) + real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) query_names = None if query_subset == "all": @@ -199,7 +199,7 @@ def _generate_tpch_workload( f.write(f"S{seed}-Q{qname},{sql_fpath}\n") # TODO(WAN): add option to deep-copy the workload. - workload_symlink_dpath = link_result(dbgym_cfg, real_dpath) + workload_symlink_dpath = link_result(dbgym_workspace, real_dpath) assert workload_symlink_dpath == expected_workload_symlink_dpath logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_workload_symlink_dpath}" diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index fb660906..5162b8b1 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -2,7 +2,7 @@ from typing import Optional from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymConfig, get_default_tables_dname, is_fully_resolved +from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved TPCH_SCHEMA_FNAME = "tpch_schema.sql" TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" @@ -24,9 +24,9 @@ class TpchLoadInfo(LoadInfoBaseClass): "lineitem", ] - def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float): + def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): # schema and constraints - schema_root_dpath = dbgym_cfg.dbgym_repo_path + schema_root_dpath = dbgym_workspace.dbgym_repo_path for component in TpchLoadInfo.CODEBASE_PATH_COMPONENTS[ 1: ]: # [1:] to skip "dbgym" @@ -42,7 +42,7 @@ def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float): # tables data_root_dpath = ( - dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" + dbgym_workspace.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" ) tables_symlink_dpath = ( data_root_dpath / f"{get_default_tables_dname(scale_factor)}.link" diff --git a/dbms/cli.py b/dbms/cli.py index b3984637..6d97c5e2 100644 --- a/dbms/cli.py +++ b/dbms/cli.py @@ -1,13 +1,13 @@ import click from dbms.postgres.cli import postgres_group -from util.workspace import DBGymConfig +from util.workspace import DBGymWorkspace @click.group(name="dbms") @click.pass_obj -def dbms_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("dbms") +def dbms_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("dbms") dbms_group.add_command(postgres_group) diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 8d34e366..18d8f3f0 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -31,7 +31,7 @@ from util.shell import subprocess_run from util.workspace import ( WORKSPACE_PATH_PLACEHOLDER, - DBGymConfig, + DBGymWorkspace, fully_resolve_path, get_dbdata_tgz_name, get_default_dbdata_parent_dpath, @@ -46,8 +46,8 @@ @click.group(name="postgres") @click.pass_obj -def postgres_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("postgres") +def postgres_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("postgres") @postgres_group.command( @@ -60,8 +60,8 @@ def postgres_group(dbgym_cfg: DBGymConfig) -> None: is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.", ) -def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: - _build_repo(dbgym_cfg, rebuild) +def postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: + _build_repo(dbgym_workspace, rebuild) @postgres_group.command( @@ -90,7 +90,7 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {get_default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", ) def postgres_dbdata( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float, pgbin_path: Optional[Path], @@ -99,15 +99,15 @@ def postgres_dbdata( ) -> None: # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path is None: - pgbin_path = get_default_pgbin_path(dbgym_cfg.dbgym_workspace_path) + pgbin_path = get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path) if dbdata_parent_dpath is None: dbdata_parent_dpath = get_default_dbdata_parent_dpath( - dbgym_cfg.dbgym_workspace_path + dbgym_workspace.dbgym_workspace_path ) # Fully resolve all input paths. - pgbin_path = fully_resolve_path(dbgym_cfg, pgbin_path) - dbdata_parent_dpath = fully_resolve_path(dbgym_cfg, dbdata_parent_dpath) + pgbin_path = fully_resolve_path(dbgym_workspace, pgbin_path) + dbdata_parent_dpath = fully_resolve_path(dbgym_workspace, dbdata_parent_dpath) # Check assertions on args if intended_dbdata_hardware == "hdd": @@ -123,22 +123,22 @@ def postgres_dbdata( # Create dbdata _create_dbdata( - dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath + dbgym_workspace, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath ) -def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path: - return dbgym_cfg.cur_symlinks_build_path( +def _get_pgbin_symlink_path(dbgym_workspace: DBGymWorkspace) -> Path: + return dbgym_workspace.cur_symlinks_build_path( "repo.link", "boot", "build", "postgres", "bin" ) -def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path: - return dbgym_cfg.cur_symlinks_build_path("repo.link") +def _get_repo_symlink_path(dbgym_workspace: DBGymWorkspace) -> Path: + return dbgym_workspace.cur_symlinks_build_path("repo.link") -def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: - expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg) +def _build_repo(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: + expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_workspace) if not rebuild and expected_repo_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( f"Skipping _build_repo: {expected_repo_symlink_dpath}" @@ -148,13 +148,13 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: logging.getLogger(DBGYM_LOGGER_NAME).info( f"Setting up repo in {expected_repo_symlink_dpath}" ) - repo_real_dpath = dbgym_cfg.cur_task_runs_build_path("repo", mkdir=True) + repo_real_dpath = dbgym_workspace.cur_task_runs_build_path("repo", mkdir=True) subprocess_run( - f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_cfg.cur_source_path() + f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_workspace.cur_source_path() ) # only link at the end so that the link only ever points to a complete repo - repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath) + repo_symlink_dpath = link_result(dbgym_workspace, repo_real_dpath) assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Set up repo in {expected_repo_symlink_dpath}" @@ -162,7 +162,7 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: def _create_dbdata( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float, pgbin_path: Path, @@ -184,23 +184,23 @@ def _create_dbdata( # Call initdb. # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run. - save_file(dbgym_cfg, pgbin_path / "initdb") + save_file(dbgym_workspace, pgbin_path / "initdb") subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path) # Start Postgres (all other dbdata setup requires postgres to be started). # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead. - start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) + start_postgres(dbgym_workspace, pgbin_path, dbdata_dpath) # Set up Postgres. - _generic_dbdata_setup(dbgym_cfg) - _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor) + _generic_dbdata_setup(dbgym_workspace) + _load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor) # Stop Postgres so that we don't "leak" processes. - stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) + stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath) # Create .tgz file. # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir. - dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path( + dbdata_tgz_real_fpath = dbgym_workspace.cur_task_runs_data_path( mkdir=True ) / get_dbdata_tgz_name(benchmark_name, scale_factor) # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath. @@ -208,22 +208,22 @@ def _create_dbdata( # Create symlink. # Only link at the end so that the link only ever points to a complete dbdata. - dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath) + dbdata_tgz_symlink_path = link_result(dbgym_workspace, dbdata_tgz_real_fpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Created dbdata in {dbdata_tgz_symlink_path}" ) -def _generic_dbdata_setup(dbgym_cfg: DBGymConfig) -> None: +def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: # get necessary vars - pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve() + pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_workspace).resolve() assert pgbin_real_dpath.exists() dbgym_pguser = DBGYM_POSTGRES_USER dbgym_pgpass = DBGYM_POSTGRES_PASS pgport = DEFAULT_POSTGRES_PORT # Create user - save_file(dbgym_cfg, pgbin_real_dpath / "psql") + save_file(dbgym_workspace, pgbin_real_dpath / "psql") subprocess_run( f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", cwd=pgbin_real_dpath, @@ -251,34 +251,34 @@ def _generic_dbdata_setup(dbgym_cfg: DBGymConfig) -> None: def _load_benchmark_into_dbdata( - dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float + dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float ) -> None: load_info: LoadInfoBaseClass with create_sqlalchemy_conn() as conn: if benchmark_name == "tpch": - load_info = TpchLoadInfo(dbgym_cfg, scale_factor) + load_info = TpchLoadInfo(dbgym_workspace, scale_factor) elif benchmark_name == "job": - load_info = JobLoadInfo(dbgym_cfg) + load_info = JobLoadInfo(dbgym_workspace) else: raise AssertionError( f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented" ) - _load_into_dbdata(dbgym_cfg, conn, load_info) + _load_into_dbdata(dbgym_workspace, conn, load_info) def _load_into_dbdata( - dbgym_cfg: DBGymConfig, conn: sqlalchemy.Connection, load_info: LoadInfoBaseClass + dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, load_info: LoadInfoBaseClass ) -> None: - sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath()) + sql_file_execute(dbgym_workspace, conn, load_info.get_schema_fpath()) # Truncate all tables first before even loading a single one. for table, _ in load_info.get_tables_and_fpaths(): sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") # Then, load the tables. for table, table_fpath in load_info.get_tables_and_fpaths(): - with open_and_save(dbgym_cfg, table_fpath, "r") as table_csv: + with open_and_save(dbgym_workspace, table_fpath, "r") as table_csv: assert conn.connection.dbapi_connection is not None cur = conn.connection.dbapi_connection.cursor() try: @@ -292,7 +292,7 @@ def _load_into_dbdata( constraints_fpath = load_info.get_constraints_fpath() if constraints_fpath is not None: - sql_file_execute(dbgym_cfg, conn, constraints_fpath) + sql_file_execute(dbgym_workspace, conn, constraints_fpath) # The start and stop functions slightly duplicate functionality from pg_conn.py. However, I chose to do it this way @@ -301,23 +301,23 @@ def _load_into_dbdata( # even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the # tuning environment. def start_postgres( - dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path ) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True) + _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, True) -def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False) +def stop_postgres(dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path) -> None: + _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, False) def _start_or_stop_postgres( - dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path, is_start: bool ) -> None: # They should be absolute paths and should exist assert is_fully_resolved(pgbin_path) assert is_fully_resolved(dbdata_dpath) pgport = DEFAULT_POSTGRES_PORT - save_file(dbgym_cfg, pgbin_path / "pg_ctl") + save_file(dbgym_workspace, pgbin_path / "pg_ctl") if is_start: # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". diff --git a/env/integtest_pg_conn.py b/env/integtest_pg_conn.py index 3e6655de..83a09847 100644 --- a/env/integtest_pg_conn.py +++ b/env/integtest_pg_conn.py @@ -38,7 +38,7 @@ def tearDown(self) -> None: def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: return PostgresConn( - IntegtestWorkspace.get_dbgym_cfg(), + IntegtestWorkspace.get_dbgym_workspace(), pgport, self.metadata.pristine_dbdata_snapshot_path, self.metadata.dbdata_parent_path, diff --git a/env/integtest_replay.py b/env/integtest_replay.py index 49af2909..e318cf60 100644 --- a/env/integtest_replay.py +++ b/env/integtest_replay.py @@ -19,7 +19,7 @@ def setUpClass() -> None: def test_replay(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), + IntegtestWorkspace.get_dbgym_workspace(), IntegtestWorkspace.get_default_metadata(), ) writer.write_step( @@ -41,7 +41,7 @@ def test_replay(self) -> None: ) ) replay_data = replay( - IntegtestWorkspace.get_dbgym_cfg(), writer.tuning_artifacts_dpath + IntegtestWorkspace.get_dbgym_workspace(), writer.tuning_artifacts_dpath ) # We do some very simple sanity checks here due to the inherent randomness of executing a workload. diff --git a/env/integtest_tuning_artifacts.py b/env/integtest_tuning_artifacts.py index 1e8760f1..91f86564 100644 --- a/env/integtest_tuning_artifacts.py +++ b/env/integtest_tuning_artifacts.py @@ -26,7 +26,7 @@ def make_config(letter: str) -> DBMSConfigDelta: def test_get_delta_at_step(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), + IntegtestWorkspace.get_dbgym_workspace(), IntegtestWorkspace.get_default_metadata(), ) @@ -51,7 +51,7 @@ def test_get_delta_at_step(self) -> None: def test_get_all_deltas_in_order(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), + IntegtestWorkspace.get_dbgym_workspace(), IntegtestWorkspace.get_default_metadata(), ) @@ -72,7 +72,7 @@ def test_get_all_deltas_in_order(self) -> None: def test_get_metadata(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), + IntegtestWorkspace.get_dbgym_workspace(), IntegtestWorkspace.get_default_metadata(), ) reader = TuningArtifactsReader(writer.tuning_artifacts_dpath) diff --git a/env/integtest_util.py b/env/integtest_util.py index c996d83f..5ed296f6 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -6,7 +6,7 @@ from env.tuning_artifacts import DBMSConfigDelta, TuningMetadata from util.workspace import ( - DBGymConfig, + DBGymWorkspace, fully_resolve_path, get_default_dbdata_parent_dpath, get_default_pgbin_path, @@ -25,11 +25,11 @@ class IntegtestWorkspace: """ This is essentially a singleton class. This avoids multiple integtest_*.py files creating - the workspace and/or the DBGymConfig redundantly. + the workspace and/or the DBGymWorkspace redundantly. """ ENV_INTEGTESTS_DBGYM_CONFIG_FPATH = Path("env/env_integtests_dbgym_config.yaml") - INTEGTEST_DBGYM_CFG: Optional[DBGymConfig] = None + INTEGTEST_DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None @staticmethod def set_up_workspace() -> None: @@ -38,18 +38,18 @@ def set_up_workspace() -> None: subprocess.run(["./env/set_up_env_integtests.sh"], check=True) # Once we get here, we have an invariant that the workspace exists. We need this - # invariant to be true in order to create the DBGymConfig. + # invariant to be true in order to create the DBGymWorkspace. # # However, it also can't be created more than once so we need to check `is None`. - if IntegtestWorkspace.INTEGTEST_DBGYM_CFG is None: - IntegtestWorkspace.INTEGTEST_DBGYM_CFG = DBGymConfig( + if IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE is None: + IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE = DBGymWorkspace( IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH ) @staticmethod - def get_dbgym_cfg() -> DBGymConfig: - assert IntegtestWorkspace.INTEGTEST_DBGYM_CFG is not None - return IntegtestWorkspace.INTEGTEST_DBGYM_CFG + def get_dbgym_workspace() -> DBGymWorkspace: + assert IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE is not None + return IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE @staticmethod def get_workspace_path() -> Path: @@ -58,13 +58,13 @@ def get_workspace_path() -> Path: @staticmethod def get_default_metadata() -> TuningMetadata: - dbgym_cfg = IntegtestWorkspace.get_dbgym_cfg() + dbgym_workspace = IntegtestWorkspace.get_dbgym_workspace() workspace_path = fully_resolve_path( - dbgym_cfg, IntegtestWorkspace.get_workspace_path() + dbgym_workspace, IntegtestWorkspace.get_workspace_path() ) return TuningMetadata( workload_path=fully_resolve_path( - dbgym_cfg, + dbgym_workspace, get_default_workload_path( workspace_path, INTEGTEST_BENCHMARK, @@ -75,15 +75,15 @@ def get_default_metadata() -> TuningMetadata: ), ), pristine_dbdata_snapshot_path=fully_resolve_path( - dbgym_cfg, + dbgym_workspace, get_default_pristine_dbdata_snapshot_path( workspace_path, INTEGTEST_BENCHMARK, INTEGTEST_SCALE_FACTOR ), ), dbdata_parent_path=fully_resolve_path( - dbgym_cfg, get_default_dbdata_parent_dpath(workspace_path) + dbgym_workspace, get_default_dbdata_parent_dpath(workspace_path) ), pgbin_path=fully_resolve_path( - dbgym_cfg, get_default_pgbin_path(workspace_path) + dbgym_workspace, get_default_pgbin_path(workspace_path) ), ) diff --git a/env/integtest_workload.py b/env/integtest_workload.py index b1f7c35d..be6af8c3 100644 --- a/env/integtest_workload.py +++ b/env/integtest_workload.py @@ -22,7 +22,7 @@ def setUpClass() -> None: def test_workload(self) -> None: workload_dpath = fully_resolve_path( - IntegtestWorkspace.get_dbgym_cfg(), + IntegtestWorkspace.get_dbgym_workspace(), get_default_workload_path( IntegtestWorkspace.get_workspace_path(), INTEGTEST_BENCHMARK, @@ -33,7 +33,7 @@ def test_workload(self) -> None: ), ) - workload = Workload(IntegtestWorkspace.get_dbgym_cfg(), workload_dpath) + workload = Workload(IntegtestWorkspace.get_dbgym_workspace(), workload_dpath) # Check the order of query IDs. self.assertEqual( diff --git a/env/pg_conn.py b/env/pg_conn.py index d28cf1d4..17b36499 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -24,7 +24,7 @@ from util.log import DBGYM_LOGGER_NAME from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr -from util.workspace import DBGymConfig, open_and_save, parent_dpath_of_path +from util.workspace import DBGymWorkspace, open_and_save, parent_dpath_of_path CONNECT_TIMEOUT = 300 @@ -35,7 +35,7 @@ class PostgresConn: # are organized in the workspace. def __init__( self, - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, pgport: int, pristine_dbdata_snapshot_fpath: Path, dbdata_parent_dpath: Path, @@ -44,7 +44,7 @@ def __init__( boot_config_fpath: Optional[Path], ) -> None: - self.dbgym_cfg = dbgym_cfg + self.dbgym_workspace = dbgym_workspace self.pgport = pgport self.pgbin_path = pgbin_path self.boot_config_fpath = boot_config_fpath @@ -59,7 +59,7 @@ def __init__( # state of the database as it is being tuned. It is generated while tuning and is # discarded once tuning is completed. self.checkpoint_dbdata_snapshot_fpath = ( - dbgym_cfg.dbgym_tmp_path / "checkpoint_dbdata.tgz" + dbgym_workspace.dbgym_tmp_path / "checkpoint_dbdata.tgz" ) # dbdata_parent_dpath is the parent directory of the dbdata that is *actively being tuned*. # It is *not* the parent directory of pristine_dbdata_snapshot_fpath. @@ -103,11 +103,11 @@ def disconnect(self) -> None: def move_log(self) -> None: pglog_fpath = ( - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log" ) pglog_this_step_fpath = ( - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log.{self.log_step}" ) if pglog_fpath.exists(): @@ -299,7 +299,7 @@ def restart_with_changes( "-l", # We log to pg{self.pgport}.log instead of pg.log so that different PostgresConn objects # don't all try to write to the same file. - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log", "start", ].run(retcode=None) @@ -346,7 +346,7 @@ def restart_with_changes( # Set up Boot if we're told to do so if self.boot_config_fpath is not None: - with open_and_save(self.dbgym_cfg, self.boot_config_fpath) as f: + with open_and_save(self.dbgym_workspace, self.boot_config_fpath) as f: boot_config = yaml.safe_load(f) self._set_up_boot( diff --git a/env/replay.py b/env/replay.py index ea364c31..8dcfe8e4 100644 --- a/env/replay.py +++ b/env/replay.py @@ -5,11 +5,11 @@ from env.tuning_artifacts import TuningArtifactsReader from env.workload import Workload from util.pg import DEFAULT_POSTGRES_PORT -from util.workspace import DBGymConfig +from util.workspace import DBGymWorkspace def replay( - dbgym_cfg: DBGymConfig, tuning_artifacts_dpath: Path + dbgym_workspace: DBGymWorkspace, tuning_artifacts_dpath: Path ) -> list[tuple[float, int]]: """ Returns the total runtime and the number of timed out queries for each step. @@ -20,7 +20,7 @@ def replay( reader = TuningArtifactsReader(tuning_artifacts_dpath) pg_conn = PostgresConn( - dbgym_cfg, + dbgym_workspace, DEFAULT_POSTGRES_PORT, reader.get_metadata().pristine_dbdata_snapshot_path, reader.get_metadata().dbdata_parent_path, @@ -28,7 +28,7 @@ def replay( None, ) workload = Workload( - dbgym_cfg, + dbgym_workspace, reader.get_metadata().workload_path, ) diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index cac3c4b8..a999ac98 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any, NewType, TypedDict -from util.workspace import DBGymConfig, is_fully_resolved +from util.workspace import DBGymWorkspace, is_fully_resolved # PostgresConn doesn't use these types because PostgresConn is used internally by tuning agents # while these types are only used in the interface between the orchestrator and the tuning agents. @@ -77,9 +77,9 @@ def get_metadata_fpath(tuning_artifacts_dpath: Path) -> Path: class TuningArtifactsWriter: - def __init__(self, dbgym_cfg: DBGymConfig, metadata: TuningMetadata) -> None: - self.dbgym_cfg = dbgym_cfg - self.tuning_artifacts_dpath = self.dbgym_cfg.cur_task_runs_artifacts_path( + def __init__(self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata) -> None: + self.dbgym_workspace = dbgym_workspace + self.tuning_artifacts_dpath = self.dbgym_workspace.cur_task_runs_artifacts_path( "tuning_artifacts", mkdir=True ) assert is_fully_resolved(self.tuning_artifacts_dpath) diff --git a/env/workload.py b/env/workload.py index 669594ae..89dde426 100644 --- a/env/workload.py +++ b/env/workload.py @@ -1,11 +1,11 @@ from pathlib import Path -from util.workspace import DBGymConfig, is_fully_resolved, open_and_save +from util.workspace import DBGymWorkspace, is_fully_resolved, open_and_save class Workload: - def __init__(self, dbgym_cfg: DBGymConfig, workload_dpath: Path) -> None: - self.dbgym_cfg = dbgym_cfg + def __init__(self, dbgym_workspace: DBGymWorkspace, workload_dpath: Path) -> None: + self.dbgym_workspace = dbgym_workspace self.workload_dpath = workload_dpath assert is_fully_resolved(self.workload_dpath) @@ -15,13 +15,13 @@ def __init__(self, dbgym_cfg: DBGymConfig, workload_dpath: Path) -> None: assert order_fpath.exists() - with open_and_save(self.dbgym_cfg, order_fpath) as f: + with open_and_save(self.dbgym_workspace, order_fpath) as f: for line in f: qid, qpath = line.strip().split(",") qpath = Path(qpath) assert is_fully_resolved(qpath) - with open_and_save(self.dbgym_cfg, qpath) as qf: + with open_and_save(self.dbgym_workspace, qpath) as qf: self.queries[qid] = qf.read() self.query_order.append(qid) diff --git a/manage/cli.py b/manage/cli.py index 123f50d4..2da52481 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -8,7 +8,7 @@ from util.log import DBGYM_LOGGER_NAME, DBGYM_OUTPUT_LOGGER_NAME from util.workspace import ( - DBGymConfig, + DBGymWorkspace, get_runs_path_from_workspace_path, get_symlinks_path_from_workspace_path, is_child_path, @@ -17,7 +17,7 @@ # This is used in test_clean.py. It's defined here to avoid a circular import. -class MockDBGymConfig: +class MockDBGymWorkspace: def __init__(self, scratchspace_path: Path): self.dbgym_workspace_path = scratchspace_path self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path( @@ -39,16 +39,16 @@ def manage_group() -> None: default="safe", help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."', ) -def manage_clean(dbgym_cfg: DBGymConfig, mode: str) -> None: - clean_workspace(dbgym_cfg, mode=mode, verbose=True) +def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None: + clean_workspace(dbgym_workspace, mode=mode, verbose=True) @click.command("count") @click.pass_obj -def manage_count(dbgym_cfg: DBGymConfig) -> None: - num_files = _count_files_in_workspace(dbgym_cfg) +def manage_count(dbgym_workspace: DBGymWorkspace) -> None: + num_files = _count_files_in_workspace(dbgym_workspace) logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info( - f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." + f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." ) @@ -68,13 +68,13 @@ def add_symlinks_in_dpath( processed_symlinks.add(file_path) -def _count_files_in_workspace(dbgym_cfg: DBGymConfig | MockDBGymConfig) -> int: +def _count_files_in_workspace(dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace) -> int: """ Counts the number of files (regular file or dir or symlink) in the workspace. """ total_count = 0 for dirpath, dirnames, filenames in os.walk( - dbgym_cfg.dbgym_workspace_path, followlinks=False + dbgym_workspace.dbgym_workspace_path, followlinks=False ): # Check if any of the directories are symbolic links and remove them from dirnames dirnames[:] = [ @@ -88,7 +88,7 @@ def _count_files_in_workspace(dbgym_cfg: DBGymConfig | MockDBGymConfig) -> int: def clean_workspace( - dbgym_cfg: DBGymConfig | MockDBGymConfig, mode: str = "safe", verbose: bool = False + dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, mode: str = "safe", verbose: bool = False ) -> None: """ Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks". @@ -102,9 +102,9 @@ def clean_workspace( processed_symlinks: set[Path] = set() # 1. Initialize paths to process - if dbgym_cfg.dbgym_symlinks_path.exists(): + if dbgym_workspace.dbgym_symlinks_path.exists(): add_symlinks_in_dpath( - symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks + symlink_fpaths_to_process, dbgym_workspace.dbgym_symlinks_path, processed_symlinks ) # 2. Go through symlinks, figuring out which "children of task runs" to keep @@ -114,7 +114,7 @@ def clean_workspace( # instead of "run_dpaths". task_run_child_fordpaths_to_keep = set() - if dbgym_cfg.dbgym_runs_path.exists(): + if dbgym_workspace.dbgym_runs_path.exists(): while symlink_fpaths_to_process: symlink_fpath: Path = symlink_fpaths_to_process.pop() assert symlink_fpath.is_symlink() @@ -132,14 +132,14 @@ def clean_workspace( continue # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't # even a descendant, we don't care about it. - if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path): + if not is_child_path(real_fordpath, dbgym_workspace.dbgym_runs_path): continue - assert not real_fordpath.samefile(dbgym_cfg.dbgym_runs_path) + assert not real_fordpath.samefile(dbgym_workspace.dbgym_runs_path) # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep task_run_child_fordpath = None - if parent_dpath_of_path(real_fordpath).samefile(dbgym_cfg.dbgym_runs_path): + if parent_dpath_of_path(real_fordpath).samefile(dbgym_workspace.dbgym_runs_path): # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, # we'll just not delete it if the user happens to have one like this. Even if the user messed up # the structure somehow, it's just a good idea not to delete it. @@ -150,15 +150,15 @@ def clean_workspace( # some reason. task_run_child_fordpath = real_fordpath while not parent_dpath_of_path(task_run_child_fordpath).samefile( - dbgym_cfg.dbgym_runs_path + dbgym_workspace.dbgym_runs_path ): task_run_child_fordpath = parent_dpath_of_path( task_run_child_fordpath ) assert task_run_child_fordpath != None assert parent_dpath_of_path(task_run_child_fordpath).samefile( - dbgym_cfg.dbgym_runs_path - ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path" + dbgym_workspace.dbgym_runs_path + ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_workspace.dbgym_runs_path" task_run_child_fordpaths_to_keep.add(task_run_child_fordpath) # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed @@ -171,15 +171,15 @@ def clean_workspace( # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep # It's true that symlinks might link outside of task_runs/*. We'll just not care about those - starting_num_files = _count_files_in_workspace(dbgym_cfg) - if dbgym_cfg.dbgym_runs_path.exists(): - for child_fordpath in dbgym_cfg.dbgym_runs_path.iterdir(): + starting_num_files = _count_files_in_workspace(dbgym_workspace) + if dbgym_workspace.dbgym_runs_path.exists(): + for child_fordpath in dbgym_workspace.dbgym_runs_path.iterdir(): if child_fordpath not in task_run_child_fordpaths_to_keep: if child_fordpath.is_dir(): shutil.rmtree(child_fordpath) else: os.remove(child_fordpath) - ending_num_files = _count_files_in_workspace(dbgym_cfg) + ending_num_files = _count_files_in_workspace(dbgym_workspace) if verbose: logging.getLogger(DBGYM_LOGGER_NAME).info( diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 056d9eb7..b565d914 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, NewType, cast -from manage.cli import MockDBGymConfig, clean_workspace +from manage.cli import MockDBGymWorkspace, clean_workspace from util.workspace import path_exists_dont_follow_symlinks # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which @@ -218,13 +218,13 @@ def test_structure_helpers(self) -> None: ) def test_nonexistent_workspace(self) -> None: - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) def test_no_symlinks_dir_and_no_task_runs_dir(self) -> None: starting_structure = FilesystemStructure({}) ending_structure = FilesystemStructure({}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -235,7 +235,7 @@ def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: ) ending_structure = FilesystemStructure({"task_runs": {}}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -244,7 +244,7 @@ def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: starting_structure = FilesystemStructure({"symlinks": {}}) ending_structure = FilesystemStructure({"symlinks": {}}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -262,7 +262,7 @@ def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -280,7 +280,7 @@ def test_no_links_in_symlinks(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -304,7 +304,7 @@ def test_link_to_file_directly_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -333,7 +333,7 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -362,7 +362,7 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -393,7 +393,7 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -414,7 +414,7 @@ def test_link_to_link_crashes(self) -> None: CleanTests.create_structure(self.scratchspace_path, starting_structure) with self.assertRaises(AssertionError): - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -444,7 +444,7 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -483,7 +483,7 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -520,7 +520,7 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -552,7 +552,7 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="aggressive") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="aggressive") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -575,7 +575,7 @@ def test_link_to_link_to_file_gives_error(self) -> None: # We disallow links to links so it's an AssertionError with self.assertRaises(AssertionError): - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") def test_multi_link_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -594,7 +594,7 @@ def test_multi_link_loop_gives_error(self) -> None: # pathlib disallows multi-link loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") def test_link_self_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -609,7 +609,7 @@ def test_link_self_loop_gives_error(self) -> None: # pathlib disallows link self-loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs( self, @@ -652,7 +652,7 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -690,7 +690,7 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -726,7 +726,7 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -758,7 +758,7 @@ def test_broken_symlink_has_no_effect(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -809,7 +809,7 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save } CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -829,7 +829,7 @@ def test_outside_task_runs_doesnt_get_deleted(self) -> None: ending_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) diff --git a/task.py b/task.py index 0b59cae5..1205999b 100644 --- a/task.py +++ b/task.py @@ -4,7 +4,7 @@ from dbms.cli import dbms_group from manage.cli import manage_group from util.log import set_up_loggers, set_up_warnings -from util.workspace import make_standard_dbgym_cfg +from util.workspace import make_standard_dbgym_workspace # TODO(phw2): Save commit, git diff, and run command. # TODO(phw2): Remove write permissions on old run_*/ dirs to enforce that they are immutable. @@ -15,10 +15,10 @@ @click.pass_context def task(ctx: click.Context) -> None: """🛢️ CMU-DB Database Gym: github.com/cmu-db/dbgym 🏋️""" - dbgym_cfg = make_standard_dbgym_cfg() - ctx.obj = dbgym_cfg + dbgym_workspace = make_standard_dbgym_workspace() + ctx.obj = dbgym_workspace - log_dpath = dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + log_dpath = dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) set_up_loggers(log_dpath) set_up_warnings(log_dpath) diff --git a/util/pg.py b/util/pg.py index 43252727..32f2fdee 100644 --- a/util/pg.py +++ b/util/pg.py @@ -11,7 +11,7 @@ import sqlalchemy from sqlalchemy import create_engine, text -from util.workspace import DBGymConfig, open_and_save +from util.workspace import DBGymWorkspace, open_and_save DBGYM_POSTGRES_USER = "dbgym_user" DBGYM_POSTGRES_PASS = "dbgym_pass" @@ -27,8 +27,8 @@ def sqlalchemy_conn_execute( return conn.execute(text(sql)) -def sql_file_queries(dbgym_cfg: DBGymConfig, filepath: Path) -> list[str]: - with open_and_save(dbgym_cfg, filepath) as f: +def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: + with open_and_save(dbgym_workspace, filepath) as f: lines: list[str] = [] for line in f: if line.startswith("--"): @@ -42,9 +42,9 @@ def sql_file_queries(dbgym_cfg: DBGymConfig, filepath: Path) -> list[str]: def sql_file_execute( - dbgym_cfg: DBGymConfig, conn: sqlalchemy.Connection, filepath: Path + dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path ) -> None: - for sql in sql_file_queries(dbgym_cfg, filepath): + for sql in sql_file_queries(dbgym_workspace, filepath): sqlalchemy_conn_execute(conn, sql) diff --git a/util/workspace.py b/util/workspace.py index 3c580a7f..52d5ed23 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -99,7 +99,7 @@ def get_default_tables_dname(scale_factor: float | str) -> str: # Paths of dependencies in the workspace. These are named "*_path" because they will be an absolute path # The reason these _cannot_ be relative paths is because relative paths are relative to the codebase root, not the workspace root # Note that it's okay to hardcode the codebase paths (like dbgym_dbms_postgres) here. In the worst case, we'll just break an -# integration test. The "source of truth" of codebase paths is based on DBGymConfig.cur_source_path(), which will always +# integration test. The "source of truth" of codebase paths is based on DBGymWorkspace.cur_source_path(), which will always # reflect the actual codebase structure. As long as we automatically enforce getting the right codebase paths when writing, it's # ok to have to hardcode them when reading. # Details @@ -159,7 +159,7 @@ def get_default_pgbin_path(workspace_path: Path) -> Path: return get_default_repo_path(workspace_path) / "boot" / "build" / "postgres" / "bin" -class DBGymConfig: +class DBGymWorkspace: """ Global configurations that apply to all parts of DB-Gym """ @@ -167,11 +167,11 @@ class DBGymConfig: num_times_created_this_run: int = 0 def __init__(self, dbgym_config_path: Path): - # The logic around dbgym_tmp_path assumes that DBGymConfig is only constructed once. - DBGymConfig.num_times_created_this_run += 1 + # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. + DBGymWorkspace.num_times_created_this_run += 1 assert ( - DBGymConfig.num_times_created_this_run == 1 - ), f"DBGymConfig has been created {DBGymConfig.num_times_created_this_run} times. It should only be created once per run." + DBGymWorkspace.num_times_created_this_run == 1 + ), f"DBGymWorkspace has been created {DBGymWorkspace.num_times_created_this_run} times. It should only be created once per run." assert is_base_git_dir( os.getcwd() @@ -211,8 +211,8 @@ def __init__(self, dbgym_config_path: Path): self.dbgym_tmp_path = get_tmp_path_from_workspace_path( self.dbgym_workspace_path ) - # The best place to delete the old dbgym_tmp_path is in DBGymConfig.__init__(). - # This is better than deleting the dbgym_tmp_path is in DBGymConfig.__del__() because DBGymConfig may get deleted before execution has completed. + # The best place to delete the old dbgym_tmp_path is in DBGymWorkspace.__init__(). + # This is better than deleting the dbgym_tmp_path is in DBGymWorkspace.__del__() because DBGymWorkspace may get deleted before execution has completed. # Also, by keeping the tmp directory around, you can look at it to debug issues. if self.dbgym_tmp_path.exists(): shutil.rmtree(self.dbgym_tmp_path) @@ -292,17 +292,17 @@ def cur_task_runs_artifacts_path(self, *dirs: str, mkdir: bool = False) -> Path: return self.cur_task_runs_path("artifacts", *dirs, mkdir=mkdir) -def make_standard_dbgym_cfg() -> DBGymConfig: +def make_standard_dbgym_workspace() -> DBGymWorkspace: """ - The "standard" way to make a DBGymConfig using the DBGYM_CONFIG_PATH envvar and the + The "standard" way to make a DBGymWorkspace using the DBGYM_CONFIG_PATH envvar and the default path of dbgym_config.yaml. """ dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) - dbgym_cfg = DBGymConfig(dbgym_config_path) - return dbgym_cfg + dbgym_workspace = DBGymWorkspace(dbgym_config_path) + return dbgym_workspace -def fully_resolve_path(dbgym_cfg: DBGymConfig, inputpath: os.PathLike[str]) -> Path: +def fully_resolve_path(dbgym_workspace: DBGymWorkspace, inputpath: os.PathLike[str]) -> Path: """ Fully resolve any path to a real, absolute path. @@ -320,10 +320,10 @@ def fully_resolve_path(dbgym_cfg: DBGymConfig, inputpath: os.PathLike[str]) -> P realabspath = Path(inputpath) # `expanduser()` is always "ok" to call first. realabspath = realabspath.expanduser() - # The reason we don't call Path.absolute() is because the path should be relative to dbgym_cfg.dbgym_repo_path, + # The reason we don't call Path.absolute() is because the path should be relative to dbgym_workspace.dbgym_repo_path, # which is not necessary where cwd() points at the time of calling this function. if not realabspath.is_absolute(): - realabspath = dbgym_cfg.dbgym_repo_path / realabspath + realabspath = dbgym_workspace.dbgym_repo_path / realabspath # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks. # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does it this # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs) @@ -436,7 +436,7 @@ def is_child_path(child_path: os.PathLike[str], parent_dpath: os.PathLike[str]) ) -def open_and_save(dbgym_cfg: DBGymConfig, open_fpath: Path, mode: str = "r") -> IO[Any]: +def open_and_save(dbgym_workspace: DBGymWorkspace, open_fpath: Path, mode: str = "r") -> IO[Any]: """ Open a file and "save" it to [workspace]/task_runs/run_*/. It takes in a str | Path to match the interface of open(). @@ -468,14 +468,14 @@ def open_and_save(dbgym_cfg: DBGymConfig, open_fpath: Path, mode: str = "r") -> assert os.path.isfile(open_fpath), f"open_fpath ({open_fpath}) is not a file" # save - save_file(dbgym_cfg, open_fpath) + save_file(dbgym_workspace, open_fpath) # open return open(open_fpath, mode=mode) def extract_from_task_run_fordpath( - dbgym_cfg: DBGymConfig, task_run_fordpath: Path + dbgym_workspace: DBGymWorkspace, task_run_fordpath: Path ) -> tuple[Path, str, Path, str]: """ The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want. @@ -486,19 +486,19 @@ def extract_from_task_run_fordpath( parent_dpath = task_run_fordpath.parent # TODO(phw2): make this a common function assert not parent_dpath.samefile( - dbgym_cfg.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_cfg.dbgym_runs_path ({dbgym_cfg.dbgym_runs_path})" + dbgym_workspace.dbgym_runs_path + ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" assert not parent_dpath_of_path(parent_dpath).samefile( - dbgym_cfg.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})" + dbgym_workspace.dbgym_runs_path + ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" assert not parent_dpath_of_path(parent_dpath_of_path(parent_dpath)).samefile( - dbgym_cfg.dbgym_runs_path - ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})" + dbgym_workspace.dbgym_runs_path + ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in org_dpath = parent_dpath while not parent_dpath_of_path( parent_dpath_of_path(parent_dpath_of_path(org_dpath)) - ).samefile(dbgym_cfg.dbgym_runs_path): + ).samefile(dbgym_workspace.dbgym_runs_path): org_dpath = parent_dpath_of_path(org_dpath) org_dname = basename_of_path(org_dpath) codebase_dpath = parent_dpath_of_path(org_dpath) @@ -509,7 +509,7 @@ def extract_from_task_run_fordpath( # TODO(phw2): really look at the clean PR to see what it changed # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save() -def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None: +def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: """ If an external function takes in a file/directory as input, you will not be able to call open_and_save(). In these situations, just call save_file(). @@ -525,19 +525,19 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None: assert os.path.exists(fpath), f"fpath ({fpath}) does not exist" assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file" assert not is_child_path( - fpath, dbgym_cfg.dbgym_this_run_path - ), f"fpath ({fpath}) was generated in this task run ({dbgym_cfg.dbgym_this_run_path}). You do not need to save it" + fpath, dbgym_workspace.dbgym_this_run_path + ), f"fpath ({fpath}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it" # save _something_ to dbgym_this_run_path # save a symlink if the opened file was generated by a run. this is for two reasons: # 1. files or dirs generated by a run are supposed to be immutable so saving a symlink is safe # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them - if is_child_path(fpath, dbgym_cfg.dbgym_runs_path): + if is_child_path(fpath, dbgym_workspace.dbgym_runs_path): # get paths we'll need later. _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath( - dbgym_cfg, fpath + dbgym_workspace, fpath ) - this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / codebase_dname / org_dname + this_run_save_dpath = dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname os.makedirs(this_run_save_dpath, exist_ok=True) # if the fpath file is directly in org_dpath, we symlink the file directly @@ -562,7 +562,7 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None: # if it wasn't generated by a run else: # since we don't know where the file is at all, the location is "unknown" and the org is "all" - this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / "unknown" / "all" + this_run_save_dpath = dbgym_workspace.dbgym_this_run_path / "unknown" / "all" os.makedirs(this_run_save_dpath, exist_ok=True) fname = basename_of_path(fpath) # in this case, we want to copy instead of symlinking since it might disappear in the future @@ -572,12 +572,12 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None: # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead def link_result( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, result_fordpath: Path, custom_result_name: Optional[str] = None, ) -> Path: """ - result_fordpath must be a "result", meaning it was generated inside dbgym_cfg.dbgym_this_run_path. + result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. Further, result_fordpath must have been generated by this invocation to task.py. This also means that result_fordpath itself can be a file or a dir but not a symlink. Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside @@ -590,8 +590,8 @@ def link_result( assert is_fully_resolved( result_fordpath ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - result_fordpath = fully_resolve_path(dbgym_cfg, result_fordpath) - assert is_child_path(result_fordpath, dbgym_cfg.dbgym_this_run_path) + result_fordpath = fully_resolve_path(dbgym_workspace, result_fordpath) + assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path) assert not os.path.islink(result_fordpath) if type(custom_result_name) is str: @@ -606,13 +606,13 @@ def link_result( # Figure out the parent directory path of the symlink codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath( - dbgym_cfg, result_fordpath + dbgym_workspace, result_fordpath ) # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() assert codebase_dpath.samefile( - dbgym_cfg.cur_task_runs_path() + dbgym_workspace.cur_task_runs_path() ), f"link_result should only be called on files generated by this invocation to task.py" - symlink_parent_dpath = dbgym_cfg.dbgym_symlinks_path / codebase_dname / org_dname + symlink_parent_dpath = dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname symlink_parent_dpath.mkdir(parents=True, exist_ok=True) # Remove the old symlink ("old" meaning created in an earlier run) if there is one From 58937bf3891be4244930abf9a79842fb26883cf2 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 14:44:07 -0500 Subject: [PATCH 02/43] fmt --- benchmark/job/cli.py | 6 +++--- benchmark/tpch/cli.py | 8 ++++++-- dbms/postgres/cli.py | 13 ++++++++++--- env/integtest_util.py | 4 ++-- env/tuning_artifacts.py | 4 +++- manage/cli.py | 16 ++++++++++++---- util/workspace.py | 16 ++++++++++++---- 7 files changed, 48 insertions(+), 19 deletions(-) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index 3cf2e60d..84df02ef 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -236,9 +236,9 @@ def _generate_job_workload( query_subset: str, ) -> None: workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset) - expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True) / ( - workload_name + ".link" - ) + expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path( + mkdir=True + ) / (workload_name + ".link") logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_dpath}" diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 5d6ce80d..eea9dc5c 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -65,7 +65,9 @@ def tpch_workload( ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" _clone_tpch_kit(dbgym_workspace) _generate_tpch_queries(dbgym_workspace, seed_start, seed_end, scale_factor) - _generate_tpch_workload(dbgym_workspace, seed_start, seed_end, query_subset, scale_factor) + _generate_tpch_workload( + dbgym_workspace, seed_start, seed_end, query_subset, scale_factor + ) def _get_queries_dname(seed: int, scale_factor: float) -> str: @@ -93,7 +95,9 @@ def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: def _get_tpch_kit_dpath(dbgym_workspace: DBGymWorkspace) -> Path: - tpch_kit_dpath = (dbgym_workspace.cur_symlinks_build_path() / "tpch-kit.link").resolve() + tpch_kit_dpath = ( + dbgym_workspace.cur_symlinks_build_path() / "tpch-kit.link" + ).resolve() assert is_fully_resolved(tpch_kit_dpath) return tpch_kit_dpath diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 18d8f3f0..33c2ec0a 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -269,7 +269,9 @@ def _load_benchmark_into_dbdata( def _load_into_dbdata( - dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, load_info: LoadInfoBaseClass + dbgym_workspace: DBGymWorkspace, + conn: sqlalchemy.Connection, + load_info: LoadInfoBaseClass, ) -> None: sql_file_execute(dbgym_workspace, conn, load_info.get_schema_fpath()) @@ -306,12 +308,17 @@ def start_postgres( _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, True) -def stop_postgres(dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path) -> None: +def stop_postgres( + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path +) -> None: _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, False) def _start_or_stop_postgres( - dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path, is_start: bool + dbgym_workspace: DBGymWorkspace, + pgbin_path: Path, + dbdata_dpath: Path, + is_start: bool, ) -> None: # They should be absolute paths and should exist assert is_fully_resolved(pgbin_path) diff --git a/env/integtest_util.py b/env/integtest_util.py index 5ed296f6..7f01967a 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -1,10 +1,10 @@ import subprocess from pathlib import Path -from typing import Any, Optional +from typing import Optional import yaml -from env.tuning_artifacts import DBMSConfigDelta, TuningMetadata +from env.tuning_artifacts import TuningMetadata from util.workspace import ( DBGymWorkspace, fully_resolve_path, diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index a999ac98..a60d6bbe 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -77,7 +77,9 @@ def get_metadata_fpath(tuning_artifacts_dpath: Path) -> Path: class TuningArtifactsWriter: - def __init__(self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata) -> None: + def __init__( + self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata + ) -> None: self.dbgym_workspace = dbgym_workspace self.tuning_artifacts_dpath = self.dbgym_workspace.cur_task_runs_artifacts_path( "tuning_artifacts", mkdir=True diff --git a/manage/cli.py b/manage/cli.py index 2da52481..5d785d9f 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -68,7 +68,9 @@ def add_symlinks_in_dpath( processed_symlinks.add(file_path) -def _count_files_in_workspace(dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace) -> int: +def _count_files_in_workspace( + dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, +) -> int: """ Counts the number of files (regular file or dir or symlink) in the workspace. """ @@ -88,7 +90,9 @@ def _count_files_in_workspace(dbgym_workspace: DBGymWorkspace | MockDBGymWorkspa def clean_workspace( - dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, mode: str = "safe", verbose: bool = False + dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, + mode: str = "safe", + verbose: bool = False, ) -> None: """ Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks". @@ -104,7 +108,9 @@ def clean_workspace( # 1. Initialize paths to process if dbgym_workspace.dbgym_symlinks_path.exists(): add_symlinks_in_dpath( - symlink_fpaths_to_process, dbgym_workspace.dbgym_symlinks_path, processed_symlinks + symlink_fpaths_to_process, + dbgym_workspace.dbgym_symlinks_path, + processed_symlinks, ) # 2. Go through symlinks, figuring out which "children of task runs" to keep @@ -139,7 +145,9 @@ def clean_workspace( # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep task_run_child_fordpath = None - if parent_dpath_of_path(real_fordpath).samefile(dbgym_workspace.dbgym_runs_path): + if parent_dpath_of_path(real_fordpath).samefile( + dbgym_workspace.dbgym_runs_path + ): # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, # we'll just not delete it if the user happens to have one like this. Even if the user messed up # the structure somehow, it's just a good idea not to delete it. diff --git a/util/workspace.py b/util/workspace.py index 52d5ed23..b8ae3edd 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -302,7 +302,9 @@ def make_standard_dbgym_workspace() -> DBGymWorkspace: return dbgym_workspace -def fully_resolve_path(dbgym_workspace: DBGymWorkspace, inputpath: os.PathLike[str]) -> Path: +def fully_resolve_path( + dbgym_workspace: DBGymWorkspace, inputpath: os.PathLike[str] +) -> Path: """ Fully resolve any path to a real, absolute path. @@ -436,7 +438,9 @@ def is_child_path(child_path: os.PathLike[str], parent_dpath: os.PathLike[str]) ) -def open_and_save(dbgym_workspace: DBGymWorkspace, open_fpath: Path, mode: str = "r") -> IO[Any]: +def open_and_save( + dbgym_workspace: DBGymWorkspace, open_fpath: Path, mode: str = "r" +) -> IO[Any]: """ Open a file and "save" it to [workspace]/task_runs/run_*/. It takes in a str | Path to match the interface of open(). @@ -537,7 +541,9 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath( dbgym_workspace, fpath ) - this_run_save_dpath = dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname + this_run_save_dpath = ( + dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname + ) os.makedirs(this_run_save_dpath, exist_ok=True) # if the fpath file is directly in org_dpath, we symlink the file directly @@ -612,7 +618,9 @@ def link_result( assert codebase_dpath.samefile( dbgym_workspace.cur_task_runs_path() ), f"link_result should only be called on files generated by this invocation to task.py" - symlink_parent_dpath = dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname + symlink_parent_dpath = ( + dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname + ) symlink_parent_dpath.mkdir(parents=True, exist_ok=True) # Remove the old symlink ("old" meaning created in an earlier run) if there is one From b6d580c3fa2295077ba3151dd04ba5b8dd0a4fc7 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 14:57:12 -0500 Subject: [PATCH 03/43] renamed IntegtestWorkspace --- .github/workflows/tests.yaml | 2 +- ...aml => gymlib_integtest_dbgym_config.yaml} | 0 env/integtest_pg_conn.py | 8 ++--- env/integtest_replay.py | 11 +++--- env/integtest_tuning_artifacts.py | 18 +++++----- env/integtest_util.py | 36 +++++++++++-------- env/integtest_workload.py | 12 ++++--- ...h => set_up_gymlib_integtest_workspace.sh} | 2 +- 8 files changed, 49 insertions(+), 40 deletions(-) rename env/{env_integtests_dbgym_config.yaml => gymlib_integtest_dbgym_config.yaml} (100%) rename env/{set_up_env_integtests.sh => set_up_gymlib_integtest_workspace.sh} (90%) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index ed0f0d3e..c987172c 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -52,7 +52,7 @@ jobs: # Integration tests do require external systems to be running (most commonly a database instance). # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. env: - # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `./env/set_up_env_integtests.sh`. + # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `./env/set_up_gymlib_integtest_workspace.sh`. INTENDED_DBDATA_HARDWARE: ssd run: | . "$HOME/.cargo/env" diff --git a/env/env_integtests_dbgym_config.yaml b/env/gymlib_integtest_dbgym_config.yaml similarity index 100% rename from env/env_integtests_dbgym_config.yaml rename to env/gymlib_integtest_dbgym_config.yaml diff --git a/env/integtest_pg_conn.py b/env/integtest_pg_conn.py index 83a09847..d7ea043e 100644 --- a/env/integtest_pg_conn.py +++ b/env/integtest_pg_conn.py @@ -3,7 +3,7 @@ import psycopg -from env.integtest_util import IntegtestWorkspace +from env.integtest_util import GymlibIntegtestWorkspaceManager from env.pg_conn import PostgresConn from util.pg import ( DEFAULT_POSTGRES_PORT, @@ -15,7 +15,7 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestWorkspaceManager.set_up_workspace() def setUp(self) -> None: self.assertFalse( @@ -23,7 +23,7 @@ def setUp(self) -> None: "Make sure Postgres isn't running before starting the integration test. `pkill postgres` is one way " + "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.", ) - self.metadata = IntegtestWorkspace.get_default_metadata() + self.metadata = GymlibIntegtestWorkspaceManager.get_default_metadata() # The reason we restart Postgres every time is to ensure a "clean" starting point # so that all tests are independent of each other. @@ -38,7 +38,7 @@ def tearDown(self) -> None: def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: return PostgresConn( - IntegtestWorkspace.get_dbgym_workspace(), + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), pgport, self.metadata.pristine_dbdata_snapshot_path, self.metadata.dbdata_parent_path, diff --git a/env/integtest_replay.py b/env/integtest_replay.py index e318cf60..2b49389d 100644 --- a/env/integtest_replay.py +++ b/env/integtest_replay.py @@ -1,7 +1,7 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.integtest_util import IntegtestWorkspace +from env.integtest_util import GymlibIntegtestWorkspaceManager from env.replay import replay from env.tuning_artifacts import ( DBMSConfigDelta, @@ -15,12 +15,12 @@ class ReplayTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestWorkspaceManager.set_up_workspace() def test_replay(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_workspace(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestWorkspaceManager.get_default_metadata(), ) writer.write_step( DBMSConfigDelta( @@ -41,7 +41,8 @@ def test_replay(self) -> None: ) ) replay_data = replay( - IntegtestWorkspace.get_dbgym_workspace(), writer.tuning_artifacts_dpath + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + writer.tuning_artifacts_dpath, ) # We do some very simple sanity checks here due to the inherent randomness of executing a workload. diff --git a/env/integtest_tuning_artifacts.py b/env/integtest_tuning_artifacts.py index 91f86564..b0a9fe91 100644 --- a/env/integtest_tuning_artifacts.py +++ b/env/integtest_tuning_artifacts.py @@ -1,6 +1,6 @@ import unittest -from env.integtest_util import IntegtestWorkspace +from env.integtest_util import GymlibIntegtestWorkspaceManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, @@ -14,7 +14,7 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestWorkspaceManager.set_up_workspace() @staticmethod def make_config(letter: str) -> DBMSConfigDelta: @@ -26,8 +26,8 @@ def make_config(letter: str) -> DBMSConfigDelta: def test_get_delta_at_step(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_workspace(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestWorkspaceManager.get_default_metadata(), ) writer.write_step(PostgresConnTests.make_config("a")) @@ -51,8 +51,8 @@ def test_get_delta_at_step(self) -> None: def test_get_all_deltas_in_order(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_workspace(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestWorkspaceManager.get_default_metadata(), ) writer.write_step(PostgresConnTests.make_config("a")) @@ -72,12 +72,12 @@ def test_get_all_deltas_in_order(self) -> None: def test_get_metadata(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_workspace(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestWorkspaceManager.get_default_metadata(), ) reader = TuningArtifactsReader(writer.tuning_artifacts_dpath) metadata = reader.get_metadata() - expected_metadata = IntegtestWorkspace.get_default_metadata() + expected_metadata = GymlibIntegtestWorkspaceManager.get_default_metadata() self.assertEqual(metadata, expected_metadata) diff --git a/env/integtest_util.py b/env/integtest_util.py index 7f01967a..1927cd85 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -16,51 +16,57 @@ get_workload_name, ) -# These are the values used by set_up_env_integtests.sh. -# TODO: make set_up_env_integtests.sh take in these values directly as envvars. +# These are the values used by set_up_gymlib_integtest_workspace.sh. +# TODO: make set_up_gymlib_integtest_workspace.sh take in these values directly as envvars. INTEGTEST_BENCHMARK = "tpch" INTEGTEST_SCALE_FACTOR = 0.01 -class IntegtestWorkspace: +class GymlibIntegtestWorkspaceManager: """ This is essentially a singleton class. This avoids multiple integtest_*.py files creating the workspace and/or the DBGymWorkspace redundantly. + + The reason I put all these static methods in a class instead of directly in the module is + that the functions have very generic names (e.g. set_up_workspace()) but having them + inside a class makes it clear that they are related to the gymlib integration tests. """ - ENV_INTEGTESTS_DBGYM_CONFIG_FPATH = Path("env/env_integtests_dbgym_config.yaml") - INTEGTEST_DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None + DBGYM_CONFIG_FPATH = Path("env/gymlib_integtest_dbgym_config.yaml") + DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None @staticmethod def set_up_workspace() -> None: # This if statement prevents us from setting up the workspace twice, which saves time. - if not IntegtestWorkspace.get_workspace_path().exists(): - subprocess.run(["./env/set_up_env_integtests.sh"], check=True) + if not GymlibIntegtestWorkspaceManager.get_workspace_path().exists(): + subprocess.run(["./env/set_up_gymlib_integtest_workspace.sh"], check=True) # Once we get here, we have an invariant that the workspace exists. We need this # invariant to be true in order to create the DBGymWorkspace. # # However, it also can't be created more than once so we need to check `is None`. - if IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE is None: - IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE = DBGymWorkspace( - IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH + if GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE is None: + GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE = DBGymWorkspace( + GymlibIntegtestWorkspaceManager.DBGYM_CONFIG_FPATH ) @staticmethod def get_dbgym_workspace() -> DBGymWorkspace: - assert IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE is not None - return IntegtestWorkspace.INTEGTEST_DBGYM_WORKSPACE + assert GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE is not None + return GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE @staticmethod def get_workspace_path() -> Path: - with open(IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH) as f: + with open( + GymlibIntegtestWorkspaceManager.DBGYM_CONFIG_FPATH + ) as f: return Path(yaml.safe_load(f)["dbgym_workspace_path"]) @staticmethod def get_default_metadata() -> TuningMetadata: - dbgym_workspace = IntegtestWorkspace.get_dbgym_workspace() + dbgym_workspace = GymlibIntegtestWorkspaceManager.get_dbgym_workspace() workspace_path = fully_resolve_path( - dbgym_workspace, IntegtestWorkspace.get_workspace_path() + dbgym_workspace, GymlibIntegtestWorkspaceManager.get_workspace_path() ) return TuningMetadata( workload_path=fully_resolve_path( diff --git a/env/integtest_workload.py b/env/integtest_workload.py index be6af8c3..9ee24f63 100644 --- a/env/integtest_workload.py +++ b/env/integtest_workload.py @@ -4,7 +4,7 @@ from env.integtest_util import ( INTEGTEST_BENCHMARK, INTEGTEST_SCALE_FACTOR, - IntegtestWorkspace, + GymlibIntegtestWorkspaceManager, ) from env.workload import Workload from util.workspace import ( @@ -18,13 +18,13 @@ class WorkloadTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestWorkspaceManager.set_up_workspace() def test_workload(self) -> None: workload_dpath = fully_resolve_path( - IntegtestWorkspace.get_dbgym_workspace(), + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), get_default_workload_path( - IntegtestWorkspace.get_workspace_path(), + GymlibIntegtestWorkspaceManager.get_workspace_path(), INTEGTEST_BENCHMARK, get_workload_name( INTEGTEST_SCALE_FACTOR, @@ -33,7 +33,9 @@ def test_workload(self) -> None: ), ) - workload = Workload(IntegtestWorkspace.get_dbgym_workspace(), workload_dpath) + workload = Workload( + GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), workload_dpath + ) # Check the order of query IDs. self.assertEqual( diff --git a/env/set_up_env_integtests.sh b/env/set_up_gymlib_integtest_workspace.sh similarity index 90% rename from env/set_up_env_integtests.sh rename to env/set_up_gymlib_integtest_workspace.sh index a536a2ac..fae23e66 100755 --- a/env/set_up_env_integtests.sh +++ b/env/set_up_gymlib_integtest_workspace.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Environment tests relies on Postgres being built and workloads/dbdata being generated. This script does this. +# Gymlib integration tests relies on Postgres being built and workloads/dbdata being generated. # Generating these things is not considered a part of the test which is why it's in its own shell script. # The reason there's a shell script generating them instead of them just being in the repo is because (a) # the Postgres repo is very large and (b) the built binary will be different for different machines. From 2a9b081cc186da989633041ed218a01af433b815 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 14:59:06 -0500 Subject: [PATCH 04/43] shortened GymlibIntegtestWorkspaceManager --- env/integtest_pg_conn.py | 8 ++++---- env/integtest_replay.py | 10 +++++----- env/integtest_tuning_artifacts.py | 18 +++++++++--------- env/integtest_util.py | 24 +++++++++++------------- env/integtest_workload.py | 10 +++++----- 5 files changed, 34 insertions(+), 36 deletions(-) diff --git a/env/integtest_pg_conn.py b/env/integtest_pg_conn.py index d7ea043e..8bb72b5f 100644 --- a/env/integtest_pg_conn.py +++ b/env/integtest_pg_conn.py @@ -3,7 +3,7 @@ import psycopg -from env.integtest_util import GymlibIntegtestWorkspaceManager +from env.integtest_util import GymlibIntegtestManager from env.pg_conn import PostgresConn from util.pg import ( DEFAULT_POSTGRES_PORT, @@ -15,7 +15,7 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - GymlibIntegtestWorkspaceManager.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() def setUp(self) -> None: self.assertFalse( @@ -23,7 +23,7 @@ def setUp(self) -> None: "Make sure Postgres isn't running before starting the integration test. `pkill postgres` is one way " + "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.", ) - self.metadata = GymlibIntegtestWorkspaceManager.get_default_metadata() + self.metadata = GymlibIntegtestManager.get_default_metadata() # The reason we restart Postgres every time is to ensure a "clean" starting point # so that all tests are independent of each other. @@ -38,7 +38,7 @@ def tearDown(self) -> None: def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: return PostgresConn( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_dbgym_workspace(), pgport, self.metadata.pristine_dbdata_snapshot_path, self.metadata.dbdata_parent_path, diff --git a/env/integtest_replay.py b/env/integtest_replay.py index 2b49389d..8b7ab12e 100644 --- a/env/integtest_replay.py +++ b/env/integtest_replay.py @@ -1,7 +1,7 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.integtest_util import GymlibIntegtestWorkspaceManager +from env.integtest_util import GymlibIntegtestManager from env.replay import replay from env.tuning_artifacts import ( DBMSConfigDelta, @@ -15,12 +15,12 @@ class ReplayTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - GymlibIntegtestWorkspaceManager.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() def test_replay(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), - GymlibIntegtestWorkspaceManager.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) writer.write_step( DBMSConfigDelta( @@ -41,7 +41,7 @@ def test_replay(self) -> None: ) ) replay_data = replay( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_dbgym_workspace(), writer.tuning_artifacts_dpath, ) diff --git a/env/integtest_tuning_artifacts.py b/env/integtest_tuning_artifacts.py index b0a9fe91..9a700e30 100644 --- a/env/integtest_tuning_artifacts.py +++ b/env/integtest_tuning_artifacts.py @@ -1,6 +1,6 @@ import unittest -from env.integtest_util import GymlibIntegtestWorkspaceManager +from env.integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, @@ -14,7 +14,7 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - GymlibIntegtestWorkspaceManager.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() @staticmethod def make_config(letter: str) -> DBMSConfigDelta: @@ -26,8 +26,8 @@ def make_config(letter: str) -> DBMSConfigDelta: def test_get_delta_at_step(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), - GymlibIntegtestWorkspaceManager.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) writer.write_step(PostgresConnTests.make_config("a")) @@ -51,8 +51,8 @@ def test_get_delta_at_step(self) -> None: def test_get_all_deltas_in_order(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), - GymlibIntegtestWorkspaceManager.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) writer.write_step(PostgresConnTests.make_config("a")) @@ -72,12 +72,12 @@ def test_get_all_deltas_in_order(self) -> None: def test_get_metadata(self) -> None: writer = TuningArtifactsWriter( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), - GymlibIntegtestWorkspaceManager.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) reader = TuningArtifactsReader(writer.tuning_artifacts_dpath) metadata = reader.get_metadata() - expected_metadata = GymlibIntegtestWorkspaceManager.get_default_metadata() + expected_metadata = GymlibIntegtestManager.get_default_metadata() self.assertEqual(metadata, expected_metadata) diff --git a/env/integtest_util.py b/env/integtest_util.py index 1927cd85..ed4c1a76 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -22,10 +22,10 @@ INTEGTEST_SCALE_FACTOR = 0.01 -class GymlibIntegtestWorkspaceManager: +class GymlibIntegtestManager: """ This is essentially a singleton class. This avoids multiple integtest_*.py files creating - the workspace and/or the DBGymWorkspace redundantly. + the workspace and/or the DBGymWorkspace object redundantly. The reason I put all these static methods in a class instead of directly in the module is that the functions have very generic names (e.g. set_up_workspace()) but having them @@ -38,35 +38,33 @@ class GymlibIntegtestWorkspaceManager: @staticmethod def set_up_workspace() -> None: # This if statement prevents us from setting up the workspace twice, which saves time. - if not GymlibIntegtestWorkspaceManager.get_workspace_path().exists(): + if not GymlibIntegtestManager.get_workspace_path().exists(): subprocess.run(["./env/set_up_gymlib_integtest_workspace.sh"], check=True) # Once we get here, we have an invariant that the workspace exists. We need this # invariant to be true in order to create the DBGymWorkspace. # # However, it also can't be created more than once so we need to check `is None`. - if GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE is None: - GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE = DBGymWorkspace( - GymlibIntegtestWorkspaceManager.DBGYM_CONFIG_FPATH + if GymlibIntegtestManager.DBGYM_WORKSPACE is None: + GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace( + GymlibIntegtestManager.DBGYM_CONFIG_FPATH ) @staticmethod def get_dbgym_workspace() -> DBGymWorkspace: - assert GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE is not None - return GymlibIntegtestWorkspaceManager.DBGYM_WORKSPACE + assert GymlibIntegtestManager.DBGYM_WORKSPACE is not None + return GymlibIntegtestManager.DBGYM_WORKSPACE @staticmethod def get_workspace_path() -> Path: - with open( - GymlibIntegtestWorkspaceManager.DBGYM_CONFIG_FPATH - ) as f: + with open(GymlibIntegtestManager.DBGYM_CONFIG_FPATH) as f: return Path(yaml.safe_load(f)["dbgym_workspace_path"]) @staticmethod def get_default_metadata() -> TuningMetadata: - dbgym_workspace = GymlibIntegtestWorkspaceManager.get_dbgym_workspace() + dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace() workspace_path = fully_resolve_path( - dbgym_workspace, GymlibIntegtestWorkspaceManager.get_workspace_path() + dbgym_workspace, GymlibIntegtestManager.get_workspace_path() ) return TuningMetadata( workload_path=fully_resolve_path( diff --git a/env/integtest_workload.py b/env/integtest_workload.py index 9ee24f63..5bf64d67 100644 --- a/env/integtest_workload.py +++ b/env/integtest_workload.py @@ -4,7 +4,7 @@ from env.integtest_util import ( INTEGTEST_BENCHMARK, INTEGTEST_SCALE_FACTOR, - GymlibIntegtestWorkspaceManager, + GymlibIntegtestManager, ) from env.workload import Workload from util.workspace import ( @@ -18,13 +18,13 @@ class WorkloadTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - GymlibIntegtestWorkspaceManager.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() def test_workload(self) -> None: workload_dpath = fully_resolve_path( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_dbgym_workspace(), get_default_workload_path( - GymlibIntegtestWorkspaceManager.get_workspace_path(), + GymlibIntegtestManager.get_workspace_path(), INTEGTEST_BENCHMARK, get_workload_name( INTEGTEST_SCALE_FACTOR, @@ -34,7 +34,7 @@ def test_workload(self) -> None: ) workload = Workload( - GymlibIntegtestWorkspaceManager.get_dbgym_workspace(), workload_dpath + GymlibIntegtestManager.get_dbgym_workspace(), workload_dpath ) # Check the order of query IDs. From a98ca869e3722b950a58107375c3873a1c35343a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 15:00:13 -0500 Subject: [PATCH 05/43] moved benchmark and scale factor into class --- env/integtest_util.py | 22 +++++++++++++--------- env/integtest_workload.py | 12 ++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/env/integtest_util.py b/env/integtest_util.py index ed4c1a76..bd62c03f 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -16,11 +16,6 @@ get_workload_name, ) -# These are the values used by set_up_gymlib_integtest_workspace.sh. -# TODO: make set_up_gymlib_integtest_workspace.sh take in these values directly as envvars. -INTEGTEST_BENCHMARK = "tpch" -INTEGTEST_SCALE_FACTOR = 0.01 - class GymlibIntegtestManager: """ @@ -32,6 +27,11 @@ class GymlibIntegtestManager: inside a class makes it clear that they are related to the gymlib integration tests. """ + # These are the values used by set_up_gymlib_integtest_workspace.sh. + # TODO: make set_up_gymlib_integtest_workspace.sh take in these values directly as envvars. + BENCHMARK = "tpch" + SCALE_FACTOR = 0.01 + DBGYM_CONFIG_FPATH = Path("env/gymlib_integtest_dbgym_config.yaml") DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None @@ -71,17 +71,21 @@ def get_default_metadata() -> TuningMetadata: dbgym_workspace, get_default_workload_path( workspace_path, - INTEGTEST_BENCHMARK, + GymlibIntegtestManager.BENCHMARK, get_workload_name( - INTEGTEST_SCALE_FACTOR, - get_default_workload_name_suffix(INTEGTEST_BENCHMARK), + GymlibIntegtestManager.SCALE_FACTOR, + get_default_workload_name_suffix( + GymlibIntegtestManager.BENCHMARK + ), ), ), ), pristine_dbdata_snapshot_path=fully_resolve_path( dbgym_workspace, get_default_pristine_dbdata_snapshot_path( - workspace_path, INTEGTEST_BENCHMARK, INTEGTEST_SCALE_FACTOR + workspace_path, + GymlibIntegtestManager.BENCHMARK, + GymlibIntegtestManager.SCALE_FACTOR, ), ), dbdata_parent_path=fully_resolve_path( diff --git a/env/integtest_workload.py b/env/integtest_workload.py index 5bf64d67..847788ef 100644 --- a/env/integtest_workload.py +++ b/env/integtest_workload.py @@ -1,11 +1,7 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from env.integtest_util import ( - INTEGTEST_BENCHMARK, - INTEGTEST_SCALE_FACTOR, - GymlibIntegtestManager, -) +from env.integtest_util import GymlibIntegtestManager from env.workload import Workload from util.workspace import ( fully_resolve_path, @@ -25,10 +21,10 @@ def test_workload(self) -> None: GymlibIntegtestManager.get_dbgym_workspace(), get_default_workload_path( GymlibIntegtestManager.get_workspace_path(), - INTEGTEST_BENCHMARK, + GymlibIntegtestManager.BENCHMARK, get_workload_name( - INTEGTEST_SCALE_FACTOR, - get_default_workload_name_suffix(INTEGTEST_BENCHMARK), + GymlibIntegtestManager.SCALE_FACTOR, + get_default_workload_name_suffix(GymlibIntegtestManager.BENCHMARK), ), ), ) From 61080fb44b5d227ed5dd614140c74e88fdf3fcfb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 15:07:43 -0500 Subject: [PATCH 06/43] now taking in benchmark and scale factor as envvars --- env/gymlib_integtest_dbgym_config.yaml | 2 +- env/integtest_util.py | 12 ++++++++++-- env/set_up_gymlib_integtest_workspace.sh | 6 ++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/env/gymlib_integtest_dbgym_config.yaml b/env/gymlib_integtest_dbgym_config.yaml index 0de54f97..8b64e125 100644 --- a/env/gymlib_integtest_dbgym_config.yaml +++ b/env/gymlib_integtest_dbgym_config.yaml @@ -1 +1 @@ -dbgym_workspace_path: ../dbgym_env_integtest_workspace/ +dbgym_workspace_path: ../dbgym_gymlib_integtest_workspace/ diff --git a/env/integtest_util.py b/env/integtest_util.py index bd62c03f..a80afc20 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -1,3 +1,4 @@ +import os import subprocess from pathlib import Path from typing import Optional @@ -28,7 +29,6 @@ class GymlibIntegtestManager: """ # These are the values used by set_up_gymlib_integtest_workspace.sh. - # TODO: make set_up_gymlib_integtest_workspace.sh take in these values directly as envvars. BENCHMARK = "tpch" SCALE_FACTOR = 0.01 @@ -39,7 +39,15 @@ class GymlibIntegtestManager: def set_up_workspace() -> None: # This if statement prevents us from setting up the workspace twice, which saves time. if not GymlibIntegtestManager.get_workspace_path().exists(): - subprocess.run(["./env/set_up_gymlib_integtest_workspace.sh"], check=True) + subprocess.run( + ["./env/set_up_gymlib_integtest_workspace.sh"], + env={ + "BENCHMARK": GymlibIntegtestManager.BENCHMARK, + "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), + **os.environ, + }, + check=True, + ) # Once we get here, we have an invariant that the workspace exists. We need this # invariant to be true in order to create the DBGymWorkspace. diff --git a/env/set_up_gymlib_integtest_workspace.sh b/env/set_up_gymlib_integtest_workspace.sh index fae23e66..62bddb1b 100755 --- a/env/set_up_gymlib_integtest_workspace.sh +++ b/env/set_up_gymlib_integtest_workspace.sh @@ -6,13 +6,11 @@ # the Postgres repo is very large and (b) the built binary will be different for different machines. # This script should be run from the base dbgym/ directory. -set -euxo pipefail +set -euo pipefail # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" -BENCHMARK=tpch -SCALE_FACTOR=0.01 -export DBGYM_CONFIG_PATH=env/env_integtests_dbgym_config.yaml # Note that this envvar needs to be exported. +export DBGYM_CONFIG_PATH=env/gymlib_integtest_dbgym_config.yaml # Note that this envvar needs to be exported. WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //') python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR From 9a89bb1dde29fef2b8c0d9299aee49539d89fbeb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 15:29:02 -0500 Subject: [PATCH 07/43] DBGymWorkspace now only takes in the workspace path in init --- env/integtest_util.py | 31 +++++++++++++------------------ env/integtest_workload.py | 2 +- util/workspace.py | 30 ++++++++++++++---------------- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/env/integtest_util.py b/env/integtest_util.py index a80afc20..5cfe71e6 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -3,8 +3,6 @@ from pathlib import Path from typing import Optional -import yaml - from env.tuning_artifacts import TuningMetadata from util.workspace import ( DBGymWorkspace, @@ -15,6 +13,7 @@ get_default_workload_name_suffix, get_default_workload_path, get_workload_name, + get_workspace_path_from_config, ) @@ -37,8 +36,12 @@ class GymlibIntegtestManager: @staticmethod def set_up_workspace() -> None: + workspace_path = get_workspace_path_from_config( + GymlibIntegtestManager.DBGYM_CONFIG_FPATH + ) + # This if statement prevents us from setting up the workspace twice, which saves time. - if not GymlibIntegtestManager.get_workspace_path().exists(): + if not workspace_path.exists(): subprocess.run( ["./env/set_up_gymlib_integtest_workspace.sh"], env={ @@ -54,31 +57,21 @@ def set_up_workspace() -> None: # # However, it also can't be created more than once so we need to check `is None`. if GymlibIntegtestManager.DBGYM_WORKSPACE is None: - GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace( - GymlibIntegtestManager.DBGYM_CONFIG_FPATH - ) + GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace(workspace_path) @staticmethod def get_dbgym_workspace() -> DBGymWorkspace: assert GymlibIntegtestManager.DBGYM_WORKSPACE is not None return GymlibIntegtestManager.DBGYM_WORKSPACE - @staticmethod - def get_workspace_path() -> Path: - with open(GymlibIntegtestManager.DBGYM_CONFIG_FPATH) as f: - return Path(yaml.safe_load(f)["dbgym_workspace_path"]) - @staticmethod def get_default_metadata() -> TuningMetadata: dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace() - workspace_path = fully_resolve_path( - dbgym_workspace, GymlibIntegtestManager.get_workspace_path() - ) return TuningMetadata( workload_path=fully_resolve_path( dbgym_workspace, get_default_workload_path( - workspace_path, + dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, get_workload_name( GymlibIntegtestManager.SCALE_FACTOR, @@ -91,15 +84,17 @@ def get_default_metadata() -> TuningMetadata: pristine_dbdata_snapshot_path=fully_resolve_path( dbgym_workspace, get_default_pristine_dbdata_snapshot_path( - workspace_path, + dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, GymlibIntegtestManager.SCALE_FACTOR, ), ), dbdata_parent_path=fully_resolve_path( - dbgym_workspace, get_default_dbdata_parent_dpath(workspace_path) + dbgym_workspace, + get_default_dbdata_parent_dpath(dbgym_workspace.dbgym_workspace_path), ), pgbin_path=fully_resolve_path( - dbgym_workspace, get_default_pgbin_path(workspace_path) + dbgym_workspace, + get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path), ), ) diff --git a/env/integtest_workload.py b/env/integtest_workload.py index 847788ef..d3431c2f 100644 --- a/env/integtest_workload.py +++ b/env/integtest_workload.py @@ -20,7 +20,7 @@ def test_workload(self) -> None: workload_dpath = fully_resolve_path( GymlibIntegtestManager.get_dbgym_workspace(), get_default_workload_path( - GymlibIntegtestManager.get_workspace_path(), + GymlibIntegtestManager.get_dbgym_workspace().dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, get_workload_name( GymlibIntegtestManager.SCALE_FACTOR, diff --git a/util/workspace.py b/util/workspace.py index b8ae3edd..cb8d721b 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -166,7 +166,9 @@ class DBGymWorkspace: num_times_created_this_run: int = 0 - def __init__(self, dbgym_config_path: Path): + def __init__(self, dbgym_workspace_path: Path): + assert is_fully_resolved(dbgym_workspace_path) + # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. DBGymWorkspace.num_times_created_this_run += 1 assert ( @@ -177,20 +179,7 @@ def __init__(self, dbgym_config_path: Path): os.getcwd() ), "This script should be invoked from the root of the dbgym repo." - # Parse the YAML file. - contents: str = dbgym_config_path.read_text() - yaml_config: dict[str, Any] = yaml.safe_load(contents) - - # Require dbgym_workspace_path to be absolute. - # All future paths should be constructed from dbgym_workspace_path. - dbgym_workspace_path = ( - Path(yaml_config["dbgym_workspace_path"]).resolve().absolute() - ) - - self.path: Path = dbgym_config_path self.cur_path_list: list[str] = ["dbgym"] - self.root_yaml: dict[str, Any] = yaml_config - self.cur_yaml: dict[str, Any] = self.root_yaml # Set and create paths. self.dbgym_repo_path = Path(os.getcwd()) @@ -244,7 +233,6 @@ def __init__(self, dbgym_config_path: Path): # explained further in the documentation. def append_group(self, name: str) -> None: self.cur_path_list.append(name) - self.cur_yaml = self.cur_yaml.get(name, {}) def cur_source_path(self, *dirs: str) -> Path: cur_path = self.dbgym_repo_path @@ -292,13 +280,23 @@ def cur_task_runs_artifacts_path(self, *dirs: str, mkdir: bool = False) -> Path: return self.cur_task_runs_path("artifacts", *dirs, mkdir=mkdir) +def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: + """ + Returns the workspace path (as a fully resolved path) from the config file. + """ + with open(dbgym_config_path) as f: + # TODO: use fully_resolve_path() + return Path(yaml.safe_load(f)["dbgym_workspace_path"]).resolve().absolute() + + def make_standard_dbgym_workspace() -> DBGymWorkspace: """ The "standard" way to make a DBGymWorkspace using the DBGYM_CONFIG_PATH envvar and the default path of dbgym_config.yaml. """ dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) - dbgym_workspace = DBGymWorkspace(dbgym_config_path) + dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path) + dbgym_workspace = DBGymWorkspace(dbgym_workspace_path) return dbgym_workspace From 70906282abcb7ae214481c84d39667fd5df1107f Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 15:36:43 -0500 Subject: [PATCH 08/43] wrote get_base_dbgym_repo_dpath function --- benchmark/job/load_info.py | 2 +- benchmark/tpch/load_info.py | 2 +- util/workspace.py | 32 +++++++++++++++++++------------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index 58482b4d..e205847d 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -37,7 +37,7 @@ class JobLoadInfo(LoadInfoBaseClass): def __init__(self, dbgym_workspace: DBGymWorkspace): # schema and constraints - schema_root_dpath = dbgym_workspace.dbgym_repo_path + schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath for component in JobLoadInfo.CODEBASE_PATH_COMPONENTS[ 1: ]: # [1:] to skip "dbgym" diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index 5162b8b1..f9710b04 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -26,7 +26,7 @@ class TpchLoadInfo(LoadInfoBaseClass): def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): # schema and constraints - schema_root_dpath = dbgym_workspace.dbgym_repo_path + schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath for component in TpchLoadInfo.CODEBASE_PATH_COMPONENTS[ 1: ]: # [1:] to skip "dbgym" diff --git a/util/workspace.py b/util/workspace.py index cb8d721b..420f88eb 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -175,14 +175,10 @@ def __init__(self, dbgym_workspace_path: Path): DBGymWorkspace.num_times_created_this_run == 1 ), f"DBGymWorkspace has been created {DBGymWorkspace.num_times_created_this_run} times. It should only be created once per run." - assert is_base_git_dir( - os.getcwd() - ), "This script should be invoked from the root of the dbgym repo." - + self.base_dbgym_repo_dpath = get_base_dbgym_repo_dpath() self.cur_path_list: list[str] = ["dbgym"] # Set and create paths. - self.dbgym_repo_path = Path(os.getcwd()) self.dbgym_workspace_path = dbgym_workspace_path self.dbgym_workspace_path.mkdir(parents=True, exist_ok=True) self.dbgym_runs_path = get_runs_path_from_workspace_path( @@ -235,7 +231,7 @@ def append_group(self, name: str) -> None: self.cur_path_list.append(name) def cur_source_path(self, *dirs: str) -> Path: - cur_path = self.dbgym_repo_path + cur_path = self.base_dbgym_repo_dpath assert self.cur_path_list[0] == "dbgym" for folder in self.cur_path_list[1:]: cur_path = cur_path / folder @@ -320,12 +316,12 @@ def fully_resolve_path( realabspath = Path(inputpath) # `expanduser()` is always "ok" to call first. realabspath = realabspath.expanduser() - # The reason we don't call Path.absolute() is because the path should be relative to dbgym_workspace.dbgym_repo_path, + # The reason we don't call Path.absolute() is because the path should be relative to get_base_dbgym_repo_dpath(), # which is not necessary where cwd() points at the time of calling this function. if not realabspath.is_absolute(): - realabspath = dbgym_workspace.dbgym_repo_path / realabspath + realabspath = get_base_dbgym_repo_dpath() / realabspath # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks. - # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does it this + # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does these together this # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs) realabspath = realabspath.resolve() assert is_fully_resolved( @@ -334,7 +330,15 @@ def fully_resolve_path( return realabspath -def is_base_git_dir(cwd: str) -> bool: +def get_base_dbgym_repo_dpath() -> Path: + path = Path(os.getcwd()) + assert _is_base_dbgym_repo_dpath( + path + ), "This script should be invoked from the root of the dbgym repo." + return path + + +def _is_base_dbgym_repo_dpath(path: Path) -> bool: """ Returns whether we are in the base directory of some git repository """ @@ -342,10 +346,12 @@ def is_base_git_dir(cwd: str) -> bool: git_toplevel = subprocess.check_output( ["git", "rev-parse", "--show-toplevel"], encoding="utf-8" ).strip() - return git_toplevel == cwd - except subprocess.CalledProcessError as e: - # this means we are not in _any_ git repo + return Path(git_toplevel) == path + except subprocess.CalledProcessError: + # This means we are not in _any_ git repo return False + except Exception as e: + raise e def is_fully_resolved(path: Path) -> bool: From ab73156a6240a331b97d9b0500bb5724155c6a0f Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 15:38:53 -0500 Subject: [PATCH 09/43] fully_resolve_path no longer takes in dbgym_workspace --- dbms/postgres/cli.py | 4 ++-- env/integtest_util.py | 4 ---- env/integtest_workload.py | 1 - util/workspace.py | 9 +++------ 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 33c2ec0a..393689a1 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -106,8 +106,8 @@ def postgres_dbdata( ) # Fully resolve all input paths. - pgbin_path = fully_resolve_path(dbgym_workspace, pgbin_path) - dbdata_parent_dpath = fully_resolve_path(dbgym_workspace, dbdata_parent_dpath) + pgbin_path = fully_resolve_path(pgbin_path) + dbdata_parent_dpath = fully_resolve_path(dbdata_parent_dpath) # Check assertions on args if intended_dbdata_hardware == "hdd": diff --git a/env/integtest_util.py b/env/integtest_util.py index 5cfe71e6..e37e2b65 100644 --- a/env/integtest_util.py +++ b/env/integtest_util.py @@ -69,7 +69,6 @@ def get_default_metadata() -> TuningMetadata: dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace() return TuningMetadata( workload_path=fully_resolve_path( - dbgym_workspace, get_default_workload_path( dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, @@ -82,7 +81,6 @@ def get_default_metadata() -> TuningMetadata: ), ), pristine_dbdata_snapshot_path=fully_resolve_path( - dbgym_workspace, get_default_pristine_dbdata_snapshot_path( dbgym_workspace.dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, @@ -90,11 +88,9 @@ def get_default_metadata() -> TuningMetadata: ), ), dbdata_parent_path=fully_resolve_path( - dbgym_workspace, get_default_dbdata_parent_dpath(dbgym_workspace.dbgym_workspace_path), ), pgbin_path=fully_resolve_path( - dbgym_workspace, get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path), ), ) diff --git a/env/integtest_workload.py b/env/integtest_workload.py index d3431c2f..7f473bb3 100644 --- a/env/integtest_workload.py +++ b/env/integtest_workload.py @@ -18,7 +18,6 @@ def setUpClass() -> None: def test_workload(self) -> None: workload_dpath = fully_resolve_path( - GymlibIntegtestManager.get_dbgym_workspace(), get_default_workload_path( GymlibIntegtestManager.get_dbgym_workspace().dbgym_workspace_path, GymlibIntegtestManager.BENCHMARK, diff --git a/util/workspace.py b/util/workspace.py index 420f88eb..3da81499 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -281,8 +281,7 @@ def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: Returns the workspace path (as a fully resolved path) from the config file. """ with open(dbgym_config_path) as f: - # TODO: use fully_resolve_path() - return Path(yaml.safe_load(f)["dbgym_workspace_path"]).resolve().absolute() + return fully_resolve_path(Path(yaml.safe_load(f)["dbgym_workspace_path"])) def make_standard_dbgym_workspace() -> DBGymWorkspace: @@ -296,9 +295,7 @@ def make_standard_dbgym_workspace() -> DBGymWorkspace: return dbgym_workspace -def fully_resolve_path( - dbgym_workspace: DBGymWorkspace, inputpath: os.PathLike[str] -) -> Path: +def fully_resolve_path(inputpath: os.PathLike[str]) -> Path: """ Fully resolve any path to a real, absolute path. @@ -600,7 +597,7 @@ def link_result( assert is_fully_resolved( result_fordpath ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - result_fordpath = fully_resolve_path(dbgym_workspace, result_fordpath) + result_fordpath = fully_resolve_path(result_fordpath) assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path) assert not os.path.islink(result_fordpath) From 3add6644c1f46d986e0e736df78b10fc5e8dcaad Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 15:46:04 -0500 Subject: [PATCH 10/43] get_workspace_path_from_config no longer calls fully_resolve_path --- util/workspace.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/util/workspace.py b/util/workspace.py index 3da81499..3167a7d3 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -167,8 +167,6 @@ class DBGymWorkspace: num_times_created_this_run: int = 0 def __init__(self, dbgym_workspace_path: Path): - assert is_fully_resolved(dbgym_workspace_path) - # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. DBGymWorkspace.num_times_created_this_run += 1 assert ( @@ -181,6 +179,10 @@ def __init__(self, dbgym_workspace_path: Path): # Set and create paths. self.dbgym_workspace_path = dbgym_workspace_path self.dbgym_workspace_path.mkdir(parents=True, exist_ok=True) + + # Now that the workspace is guaranteed to be created, we can check if it's fully resolved. + assert is_fully_resolved(self.dbgym_workspace_path) + self.dbgym_runs_path = get_runs_path_from_workspace_path( self.dbgym_workspace_path ) @@ -281,7 +283,8 @@ def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: Returns the workspace path (as a fully resolved path) from the config file. """ with open(dbgym_config_path) as f: - return fully_resolve_path(Path(yaml.safe_load(f)["dbgym_workspace_path"])) + # We do *not* call fully_resolve_path() here because the workspace may not exist yet. + return Path(yaml.safe_load(f)["dbgym_workspace_path"]).resolve().absolute() def make_standard_dbgym_workspace() -> DBGymWorkspace: @@ -355,6 +358,9 @@ def is_fully_resolved(path: Path) -> bool: """ Checks if a path is fully resolved (exists, is absolute, and contains no symlinks in its entire ancestry). + The reason we check for existence is because that's the only way we know that there are no symlinks in its entire ancestry. + If we didn't check for existence, we could later create a new symlink in the path's ancestry. + Even if a path exists, is absolute, and is not itself a symlink, it could still contain symlinks in its parent directories. For example: /home/user/ # Real directory @@ -597,7 +603,6 @@ def link_result( assert is_fully_resolved( result_fordpath ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - result_fordpath = fully_resolve_path(result_fordpath) assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path) assert not os.path.islink(result_fordpath) From 6ca5cb22613a886d6d02a1db6eb7a39574698020 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 18:23:57 -0500 Subject: [PATCH 11/43] comments --- manage/tests/unittest_clean.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index b565d914..461ee601 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -25,6 +25,9 @@ class CleanTests(unittest.TestCase): @staticmethod def create_structure(root_path: Path, structure: FilesystemStructure) -> None: + """ + Create files and directories according to the structure. + """ def create_structure_internal( root_path: Path, cur_path: Path, structure: FilesystemStructure ) -> None: @@ -53,6 +56,9 @@ def create_structure_internal( @staticmethod def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: + """ + Verify that the files and directories match the expected structure. + """ def verify_structure_internal( root_path: Path, cur_path: Path, structure: FilesystemStructure ) -> bool: From 910c2184c17da02f5e6ff09a3790f0fb1e5467a7 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 18:24:07 -0500 Subject: [PATCH 12/43] fmt --- manage/tests/unittest_clean.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 461ee601..78d38494 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -28,6 +28,7 @@ def create_structure(root_path: Path, structure: FilesystemStructure) -> None: """ Create files and directories according to the structure. """ + def create_structure_internal( root_path: Path, cur_path: Path, structure: FilesystemStructure ) -> None: @@ -59,6 +60,7 @@ def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: """ Verify that the files and directories match the expected structure. """ + def verify_structure_internal( root_path: Path, cur_path: Path, structure: FilesystemStructure ) -> bool: From 723a72ac65b4881890283ddec67ef5abe872de04 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 18:27:26 -0500 Subject: [PATCH 13/43] moved env tests to env/tests/ --- .github/workflows/tests.yaml | 2 +- env/tests/__init__.py | 0 env/{ => tests}/gymlib_integtest_dbgym_config.yaml | 0 env/{ => tests}/integtest_pg_conn.py | 2 +- env/{ => tests}/integtest_replay.py | 2 +- env/{ => tests}/integtest_tuning_artifacts.py | 2 +- env/{ => tests}/integtest_util.py | 8 +++++--- env/{ => tests}/integtest_workload.py | 2 +- env/{ => tests}/set_up_gymlib_integtest_workspace.sh | 2 +- 9 files changed, 11 insertions(+), 9 deletions(-) create mode 100644 env/tests/__init__.py rename env/{ => tests}/gymlib_integtest_dbgym_config.yaml (100%) rename env/{ => tests}/integtest_pg_conn.py (99%) rename env/{ => tests}/integtest_replay.py (96%) rename env/{ => tests}/integtest_tuning_artifacts.py (97%) rename env/{ => tests}/integtest_util.py (91%) rename env/{ => tests}/integtest_workload.py (95%) rename env/{ => tests}/set_up_gymlib_integtest_workspace.sh (90%) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c987172c..05fd3dc5 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -52,7 +52,7 @@ jobs: # Integration tests do require external systems to be running (most commonly a database instance). # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. env: - # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `./env/set_up_gymlib_integtest_workspace.sh`. + # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `set_up_gymlib_integtest_workspace.sh`. INTENDED_DBDATA_HARDWARE: ssd run: | . "$HOME/.cargo/env" diff --git a/env/tests/__init__.py b/env/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/env/gymlib_integtest_dbgym_config.yaml b/env/tests/gymlib_integtest_dbgym_config.yaml similarity index 100% rename from env/gymlib_integtest_dbgym_config.yaml rename to env/tests/gymlib_integtest_dbgym_config.yaml diff --git a/env/integtest_pg_conn.py b/env/tests/integtest_pg_conn.py similarity index 99% rename from env/integtest_pg_conn.py rename to env/tests/integtest_pg_conn.py index 8bb72b5f..b3d4b212 100644 --- a/env/integtest_pg_conn.py +++ b/env/tests/integtest_pg_conn.py @@ -3,8 +3,8 @@ import psycopg -from env.integtest_util import GymlibIntegtestManager from env.pg_conn import PostgresConn +from env.tests.integtest_util import GymlibIntegtestManager from util.pg import ( DEFAULT_POSTGRES_PORT, get_is_postgres_running, diff --git a/env/integtest_replay.py b/env/tests/integtest_replay.py similarity index 96% rename from env/integtest_replay.py rename to env/tests/integtest_replay.py index 8b7ab12e..8d6eb183 100644 --- a/env/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -1,8 +1,8 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.integtest_util import GymlibIntegtestManager from env.replay import replay +from env.tests.integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, diff --git a/env/integtest_tuning_artifacts.py b/env/tests/integtest_tuning_artifacts.py similarity index 97% rename from env/integtest_tuning_artifacts.py rename to env/tests/integtest_tuning_artifacts.py index 9a700e30..2138e0f7 100644 --- a/env/integtest_tuning_artifacts.py +++ b/env/tests/integtest_tuning_artifacts.py @@ -1,6 +1,6 @@ import unittest -from env.integtest_util import GymlibIntegtestManager +from env.tests.integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, diff --git a/env/integtest_util.py b/env/tests/integtest_util.py similarity index 91% rename from env/integtest_util.py rename to env/tests/integtest_util.py index e37e2b65..1b621073 100644 --- a/env/integtest_util.py +++ b/env/tests/integtest_util.py @@ -27,11 +27,13 @@ class GymlibIntegtestManager: inside a class makes it clear that they are related to the gymlib integration tests. """ - # These are the values used by set_up_gymlib_integtest_workspace.sh. + # These constants are also used by set_up_gymlib_integtest_workspace.sh. BENCHMARK = "tpch" SCALE_FACTOR = 0.01 + # TODO: make DBGYM_CONFIG_FPATH an envvar. + DBGYM_CONFIG_FPATH = Path("env/tests/gymlib_integtest_dbgym_config.yaml") - DBGYM_CONFIG_FPATH = Path("env/gymlib_integtest_dbgym_config.yaml") + # This is set at most once by set_up_workspace(). DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None @staticmethod @@ -43,7 +45,7 @@ def set_up_workspace() -> None: # This if statement prevents us from setting up the workspace twice, which saves time. if not workspace_path.exists(): subprocess.run( - ["./env/set_up_gymlib_integtest_workspace.sh"], + ["./env/tests/set_up_gymlib_integtest_workspace.sh"], env={ "BENCHMARK": GymlibIntegtestManager.BENCHMARK, "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), diff --git a/env/integtest_workload.py b/env/tests/integtest_workload.py similarity index 95% rename from env/integtest_workload.py rename to env/tests/integtest_workload.py index 7f473bb3..fbfb91a1 100644 --- a/env/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -1,7 +1,7 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from env.integtest_util import GymlibIntegtestManager +from env.tests.integtest_util import GymlibIntegtestManager from env.workload import Workload from util.workspace import ( fully_resolve_path, diff --git a/env/set_up_gymlib_integtest_workspace.sh b/env/tests/set_up_gymlib_integtest_workspace.sh similarity index 90% rename from env/set_up_gymlib_integtest_workspace.sh rename to env/tests/set_up_gymlib_integtest_workspace.sh index 62bddb1b..bb26f3a1 100755 --- a/env/set_up_gymlib_integtest_workspace.sh +++ b/env/tests/set_up_gymlib_integtest_workspace.sh @@ -10,7 +10,7 @@ set -euo pipefail # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" -export DBGYM_CONFIG_PATH=env/gymlib_integtest_dbgym_config.yaml # Note that this envvar needs to be exported. +export DBGYM_CONFIG_PATH=env/tests/gymlib_integtest_dbgym_config.yaml # Note that this envvar needs to be exported. WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //') python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR From 2a5ead18a9f97cf4f0ace8f52aad7f064b9c6100 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 18:45:43 -0500 Subject: [PATCH 14/43] DBGYM_CONFIG_PATH is now only set in one place --- .github/workflows/tests.yaml | 2 +- ...pace.sh => _set_up_gymlib_integtest_workspace.sh} | 7 +++++-- env/tests/integtest_util.py | 12 +++++++----- util/workspace.py | 1 + 4 files changed, 14 insertions(+), 8 deletions(-) rename env/tests/{set_up_gymlib_integtest_workspace.sh => _set_up_gymlib_integtest_workspace.sh} (74%) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 05fd3dc5..a8fec259 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -52,7 +52,7 @@ jobs: # Integration tests do require external systems to be running (most commonly a database instance). # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. env: - # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `set_up_gymlib_integtest_workspace.sh`. + # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `_set_up_gymlib_integtest_workspace.sh`. INTENDED_DBDATA_HARDWARE: ssd run: | . "$HOME/.cargo/env" diff --git a/env/tests/set_up_gymlib_integtest_workspace.sh b/env/tests/_set_up_gymlib_integtest_workspace.sh similarity index 74% rename from env/tests/set_up_gymlib_integtest_workspace.sh rename to env/tests/_set_up_gymlib_integtest_workspace.sh index bb26f3a1..bd2130a7 100755 --- a/env/tests/set_up_gymlib_integtest_workspace.sh +++ b/env/tests/_set_up_gymlib_integtest_workspace.sh @@ -1,5 +1,10 @@ #!/bin/bash +# DO NOT RUN THIS SCRIPT DIRECTLY. +# This script only runs correctly when run by GymlibIntegtestManager.set_up_workspace() as it sets the necessary envvars. +# By allowing GymlibIntegtestManager.set_up_workspace() to set the envvars, we ensure that the envvars are only defined +# in a single location (inside GymlibIntegtestManager). + # Gymlib integration tests relies on Postgres being built and workloads/dbdata being generated. # Generating these things is not considered a part of the test which is why it's in its own shell script. # The reason there's a shell script generating them instead of them just being in the repo is because (a) @@ -10,8 +15,6 @@ set -euo pipefail # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" -export DBGYM_CONFIG_PATH=env/tests/gymlib_integtest_dbgym_config.yaml # Note that this envvar needs to be exported. -WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //') python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR diff --git a/env/tests/integtest_util.py b/env/tests/integtest_util.py index 1b621073..86b3b829 100644 --- a/env/tests/integtest_util.py +++ b/env/tests/integtest_util.py @@ -27,11 +27,10 @@ class GymlibIntegtestManager: inside a class makes it clear that they are related to the gymlib integration tests. """ - # These constants are also used by set_up_gymlib_integtest_workspace.sh. + # These constants are also used by _set_up_gymlib_integtest_workspace.sh. BENCHMARK = "tpch" SCALE_FACTOR = 0.01 - # TODO: make DBGYM_CONFIG_FPATH an envvar. - DBGYM_CONFIG_FPATH = Path("env/tests/gymlib_integtest_dbgym_config.yaml") + DBGYM_CONFIG_PATH = Path("env/tests/gymlib_integtest_dbgym_config.yaml") # This is set at most once by set_up_workspace(). DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None @@ -39,16 +38,19 @@ class GymlibIntegtestManager: @staticmethod def set_up_workspace() -> None: workspace_path = get_workspace_path_from_config( - GymlibIntegtestManager.DBGYM_CONFIG_FPATH + GymlibIntegtestManager.DBGYM_CONFIG_PATH ) # This if statement prevents us from setting up the workspace twice, which saves time. if not workspace_path.exists(): subprocess.run( - ["./env/tests/set_up_gymlib_integtest_workspace.sh"], + ["./env/tests/_set_up_gymlib_integtest_workspace.sh"], env={ "BENCHMARK": GymlibIntegtestManager.BENCHMARK, "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), + # By setting this envvar, we ensure that when running _set_up_gymlib_integtest_workspace.sh, + # make_standard_dbgym_workspace() will use the correct DBGYM_CONFIG_PATH. + "DBGYM_CONFIG_PATH": str(GymlibIntegtestManager.DBGYM_CONFIG_PATH), **os.environ, }, check=True, diff --git a/util/workspace.py b/util/workspace.py index 3167a7d3..56ba2d9f 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -293,6 +293,7 @@ def make_standard_dbgym_workspace() -> DBGymWorkspace: default path of dbgym_config.yaml. """ dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) + assert dbgym_config_path == Path("env/tests/gymlib_integtest_dbgym_config.yaml") dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path) dbgym_workspace = DBGymWorkspace(dbgym_workspace_path) return dbgym_workspace From 71495df0b778677b23cfca5efca61f426b9046fb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 18:51:48 -0500 Subject: [PATCH 15/43] del unused --- util/workspace.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/util/workspace.py b/util/workspace.py index 56ba2d9f..474a7e70 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -8,7 +8,6 @@ import subprocess import time from datetime import datetime -from enum import Enum from pathlib import Path from typing import IO, Any, Optional @@ -19,12 +18,6 @@ from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -# Enums -TuningMode = Enum("TuningMode", ["HPO", "TUNE", "REPLAY"]) - -# Default values -DEFAULT_WORKLOAD_TIMEOUT = 600 - # Relative paths of different folders in the codebase DBMS_PATH = Path("dbms") POSTGRES_PATH = DBMS_PATH / "postgres" From 9845d227324a9c604b1357fa8c89e37c327cf697 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 19:29:44 -0500 Subject: [PATCH 16/43] unittest_workspace basics --- util/tests/__init__.py | 0 util/tests/unittest_workspace.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 util/tests/__init__.py create mode 100644 util/tests/unittest_workspace.py diff --git a/util/tests/__init__.py b/util/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py new file mode 100644 index 00000000..99edac02 --- /dev/null +++ b/util/tests/unittest_workspace.py @@ -0,0 +1,28 @@ +# TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. + +from pathlib import Path +import shutil +import unittest + +from manage.tests.unittest_clean import FilesystemStructure + + +class WorkspaceTests(unittest.TestCase): + scratchspace_path: Path = Path() + workspace_path: Path = Path() + + @classmethod + def setUpClass(cls) -> None: + cls.scratchspace_path = Path.cwd() / "util/tests/test_workspace_scratchspace/" + cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" + + def setUp(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + def tearDown(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + def test_workspace_init(self) -> None: + pass From d4c3206bf81344a7c7e48531d0ec5d8c73068318 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 26 Dec 2024 19:39:56 -0500 Subject: [PATCH 17/43] refactored clean unittests to have a workspace in the scratch space (this fixes the nonexistent workspace test and makes external files truly external) --- manage/tests/unittest_clean.py | 185 +++++++++++++++++-------------- util/tests/unittest_workspace.py | 2 +- 2 files changed, 104 insertions(+), 83 deletions(-) diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 78d38494..0ace0d38 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -22,6 +22,7 @@ class CleanTests(unittest.TestCase): scratchspace_path: Path = Path() + workspace_path: Path = Path() @staticmethod def create_structure(root_path: Path, structure: FilesystemStructure) -> None: @@ -126,14 +127,17 @@ def make_workspace_structure( """ return FilesystemStructure( { - "symlinks": symlinks_structure, - "task_runs": task_runs_structure, + "dbgym_workspace": { + "symlinks": symlinks_structure, + "task_runs": task_runs_structure, + } } ) @classmethod def setUpClass(cls) -> None: cls.scratchspace_path = Path.cwd() / "manage/tests/test_clean_scratchspace/" + cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" def setUp(self) -> None: if self.scratchspace_path.exists(): @@ -226,33 +230,35 @@ def test_structure_helpers(self) -> None: ) def test_nonexistent_workspace(self) -> None: - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + # This just ensures that it doesn't raise an exception. + clean_workspace(MockDBGymWorkspace(self.workspace_path)) def test_no_symlinks_dir_and_no_task_runs_dir(self) -> None: - starting_structure = FilesystemStructure({}) - ending_structure = FilesystemStructure({}) + starting_structure = FilesystemStructure({"dbgym_workspace": {}}) + ending_structure = FilesystemStructure({"dbgym_workspace": {}}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: starting_structure = FilesystemStructure( - {"task_runs": {"file1.txt": ("file",)}} + {"dbgym_workspace": {"task_runs": {"file1.txt": ("file",)}}} ) - ending_structure = FilesystemStructure({"task_runs": {}}) + ending_structure = FilesystemStructure({"dbgym_workspace": {"task_runs": {}}}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: - starting_structure = FilesystemStructure({"symlinks": {}}) - ending_structure = FilesystemStructure({"symlinks": {}}) + # If there are no task runs there can't be any symlinks. + starting_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) + ending_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -270,7 +276,7 @@ def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -288,14 +294,14 @@ def test_no_links_in_symlinks(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_link_to_file_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( {"file1.txt": ("file",), "file2.txt": ("file",)} @@ -304,7 +310,7 @@ def test_link_to_file_directly_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} ) ending_task_runs_structure = FilesystemStructure({"file1.txt": ("file",)}) ending_structure = CleanTests.make_workspace_structure( @@ -312,14 +318,14 @@ def test_link_to_file_directly_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_link_to_dir_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { @@ -331,7 +337,7 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} @@ -341,14 +347,14 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { @@ -360,7 +366,7 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} @@ -370,14 +376,14 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_link_to_dir_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/dir2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/dir2")} ) starting_task_runs_structure = FilesystemStructure( { @@ -389,7 +395,7 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/dir2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/dir2")} ) ending_task_runs_structure = FilesystemStructure( { @@ -401,18 +407,18 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_link_to_link_crashes(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/symlink2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "symlink2": ("symlink", "task_runs/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt"), "file1.txt": ("file",), } ) @@ -422,15 +428,17 @@ def test_link_to_link_crashes(self) -> None: CleanTests.create_structure(self.scratchspace_path, starting_structure) with self.assertRaises(AssertionError): - clean_workspace(MockDBGymWorkspace(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + }, "file1.txt": ("file",), "file2.txt": ("file",), } @@ -439,11 +447,13 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + }, "file1.txt": ("file",), } ) @@ -452,20 +462,20 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt"), }, "file2.txt": ("file",), "file3.txt": ("file",), @@ -475,13 +485,13 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt"), }, "file2.txt": ("file",), } @@ -491,18 +501,20 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/dir2/file2.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt") + }, "dir2": { "file2.txt": ("file",), }, @@ -513,11 +525,13 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/dir2/file2.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt") + }, "dir2": { "file2.txt": ("file",), }, @@ -528,18 +542,20 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_aggressive_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + }, "file1.txt": ("file",), "file2.txt": ("file",), } @@ -548,7 +564,7 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { @@ -560,18 +576,20 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="aggressive") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="aggressive") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_link_to_link_to_file_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/symlink2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file2.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt") + }, "file2.txt": ("file",), } ) @@ -583,15 +601,15 @@ def test_link_to_link_to_file_gives_error(self) -> None: # We disallow links to links so it's an AssertionError with self.assertRaises(AssertionError): - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") def test_multi_link_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/symlink2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "symlinks/symlink1")}, + "dir1": {"symlink2": ("symlink", "dbgym_workspace/symlinks/symlink1")}, } ) starting_structure = CleanTests.make_workspace_structure( @@ -602,11 +620,11 @@ def test_multi_link_loop_gives_error(self) -> None: # pathlib disallows multi-link loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") def test_link_self_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "symlinks/symlink1")} + {"symlink1": ("symlink", "dbgym_workspace/symlinks/symlink1")} ) starting_task_runs_structure = FilesystemStructure({}) starting_structure = CleanTests.make_workspace_structure( @@ -617,23 +635,23 @@ def test_link_self_loop_gives_error(self) -> None: # pathlib disallows link self-loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs( self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir2/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) @@ -641,17 +659,17 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir2/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) @@ -660,7 +678,7 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -669,13 +687,13 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) @@ -683,13 +701,13 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) @@ -698,20 +716,20 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) @@ -719,13 +737,13 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) @@ -734,20 +752,23 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) def test_broken_symlink_has_no_effect(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/non_existent_file.txt"), + "symlink2": ( + "symlink", + "dbgym_workspace/task_runs/dir1/non_existent_file.txt", + ), }, "dir2": {"file2.txt": ("file",)}, } @@ -756,7 +777,7 @@ def test_broken_symlink_has_no_effect(self) -> None: starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",), "symlink2": ("symlink", None)}} @@ -766,7 +787,7 @@ def test_broken_symlink_has_no_effect(self) -> None: ) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -776,7 +797,7 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { @@ -794,12 +815,12 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save { "dir3": { "file3.txt": ("file",), - "symlink3": ("symlink", "task_runs/dir2/file2.txt"), + "symlink3": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), } } ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( { @@ -817,7 +838,7 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save } CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) @@ -837,7 +858,7 @@ def test_outside_task_runs_doesnt_get_deleted(self) -> None: ending_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymWorkspace(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 99edac02..ae5e5ea1 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -1,8 +1,8 @@ # TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. -from pathlib import Path import shutil import unittest +from pathlib import Path from manage.tests.unittest_clean import FilesystemStructure From 3d93e563fbf1bfb373c1d6d88db5c183e9ee923a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 14:59:40 -0500 Subject: [PATCH 18/43] test_workspace_init --- manage/tests/unittest_clean.py | 1 + util/tests/unittest_workspace.py | 25 +++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 0ace0d38..63b629b9 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -130,6 +130,7 @@ def make_workspace_structure( "dbgym_workspace": { "symlinks": symlinks_structure, "task_runs": task_runs_structure, + "tmp": {}, } } ) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index ae5e5ea1..ee88154b 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -4,7 +4,8 @@ import unittest from pathlib import Path -from manage.tests.unittest_clean import FilesystemStructure +from manage.tests.unittest_clean import CleanTests, FilesystemStructure +from util.workspace import DBGymWorkspace class WorkspaceTests(unittest.TestCase): @@ -25,4 +26,24 @@ def tearDown(self) -> None: shutil.rmtree(self.scratchspace_path) def test_workspace_init(self) -> None: - pass + starting_structure = FilesystemStructure({}) + CleanTests.create_structure(self.scratchspace_path, starting_structure) + + workspace = DBGymWorkspace(self.workspace_path) + ending_symlinks_structure = FilesystemStructure({}) + ending_task_runs_structure = FilesystemStructure( + { + "latest_run.link": ( + "symlink", + f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", + ), + workspace.dbgym_this_run_path.name: {}, + } + ) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) + + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) From dc1bfba63fd7949c3762c7e2449972fa6f68bdea Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 15:01:46 -0500 Subject: [PATCH 19/43] get_workspace_init_structure --- util/tests/unittest_workspace.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index ee88154b..044dea35 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -25,13 +25,10 @@ def tearDown(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) - def test_workspace_init(self) -> None: - starting_structure = FilesystemStructure({}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - - workspace = DBGymWorkspace(self.workspace_path) - ending_symlinks_structure = FilesystemStructure({}) - ending_task_runs_structure = FilesystemStructure( + @staticmethod + def get_workspace_init_structure(workspace: DBGymWorkspace) -> FilesystemStructure: + symlinks_structure = FilesystemStructure({}) + task_runs_structure = FilesystemStructure( { "latest_run.link": ( "symlink", @@ -40,10 +37,17 @@ def test_workspace_init(self) -> None: workspace.dbgym_this_run_path.name: {}, } ) - ending_structure = CleanTests.make_workspace_structure( - ending_symlinks_structure, ending_task_runs_structure + return CleanTests.make_workspace_structure( + symlinks_structure, task_runs_structure ) + def test_init_from_nonexistent_workspace(self) -> None: + starting_structure = FilesystemStructure({}) + CleanTests.create_structure(self.scratchspace_path, starting_structure) + + workspace = DBGymWorkspace(self.workspace_path) + ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) + self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) From c0772aa99cb2a63b7ba583011a2d7cec53fe858b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 15:08:48 -0500 Subject: [PATCH 20/43] added more workspace init tests --- manage/tests/unittest_clean.py | 2 +- util/tests/unittest_workspace.py | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 63b629b9..863a0d32 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -234,7 +234,7 @@ def test_nonexistent_workspace(self) -> None: # This just ensures that it doesn't raise an exception. clean_workspace(MockDBGymWorkspace(self.workspace_path)) - def test_no_symlinks_dir_and_no_task_runs_dir(self) -> None: + def test_empty_workspace(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {}}) ending_structure = FilesystemStructure({"dbgym_workspace": {}}) CleanTests.create_structure(self.scratchspace_path, starting_structure) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 044dea35..2114b860 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -21,6 +21,12 @@ def setUp(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) + # Reset this to avoid the error of it being created twice. + # The assertion checking that it's only been been created once is meant to avoid creating multiple + # objects for the same workspace. However, it's fine to create it multiple times in this case because + # we're deleting the scratchspace before each test. + DBGymWorkspace.num_times_created_this_run = 0 + def tearDown(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) @@ -51,3 +57,40 @@ def test_init_from_nonexistent_workspace(self) -> None: self.assertTrue( CleanTests.verify_structure(self.scratchspace_path, ending_structure) ) + + def test_init_from_empty_workspace(self) -> None: + starting_structure = FilesystemStructure({"dbgym_workspace": {}}) + CleanTests.create_structure(self.scratchspace_path, starting_structure) + + workspace = DBGymWorkspace(self.workspace_path) + ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) + + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) + + def test_init_from_already_initialized_workspace(self) -> None: + # This first initialization will create a task run. + workspace = DBGymWorkspace(self.workspace_path) + ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) + + # The second initialization will create a second task run. + # Make sure to reset this. In real usage, the second run would be a different Python process + # so DBGymWorkspace.num_times_created_this_run would be 0. + DBGymWorkspace.num_times_created_this_run = 0 + workspace = DBGymWorkspace(self.workspace_path) + ending_structure["dbgym_workspace"]["task_runs"][ + workspace.dbgym_this_run_path.name + ] = {} + ending_structure["dbgym_workspace"]["task_runs"]["latest_run.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", + ) + + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) + + +if __name__ == "__main__": + unittest.main() From d3514f8454309b18dacc5e9978b7311861bc3ebe Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 15:10:08 -0500 Subject: [PATCH 21/43] renamed integtest_util to gymlib_integtest_util --- env/tests/{integtest_util.py => gymlib_integtest_util.py} | 0 env/tests/integtest_pg_conn.py | 2 +- env/tests/integtest_replay.py | 2 +- env/tests/integtest_tuning_artifacts.py | 2 +- env/tests/integtest_workload.py | 2 +- util/tests/filesystem_unittest_util.py | 0 6 files changed, 4 insertions(+), 4 deletions(-) rename env/tests/{integtest_util.py => gymlib_integtest_util.py} (100%) create mode 100644 util/tests/filesystem_unittest_util.py diff --git a/env/tests/integtest_util.py b/env/tests/gymlib_integtest_util.py similarity index 100% rename from env/tests/integtest_util.py rename to env/tests/gymlib_integtest_util.py diff --git a/env/tests/integtest_pg_conn.py b/env/tests/integtest_pg_conn.py index b3d4b212..a5e11f4d 100644 --- a/env/tests/integtest_pg_conn.py +++ b/env/tests/integtest_pg_conn.py @@ -4,7 +4,7 @@ import psycopg from env.pg_conn import PostgresConn -from env.tests.integtest_util import GymlibIntegtestManager +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from util.pg import ( DEFAULT_POSTGRES_PORT, get_is_postgres_running, diff --git a/env/tests/integtest_replay.py b/env/tests/integtest_replay.py index 8d6eb183..9523fac7 100644 --- a/env/tests/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -2,7 +2,7 @@ from benchmark.tpch.constants import DEFAULT_TPCH_SEED from env.replay import replay -from env.tests.integtest_util import GymlibIntegtestManager +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, diff --git a/env/tests/integtest_tuning_artifacts.py b/env/tests/integtest_tuning_artifacts.py index 2138e0f7..dfdc77c0 100644 --- a/env/tests/integtest_tuning_artifacts.py +++ b/env/tests/integtest_tuning_artifacts.py @@ -1,6 +1,6 @@ import unittest -from env.tests.integtest_util import GymlibIntegtestManager +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, diff --git a/env/tests/integtest_workload.py b/env/tests/integtest_workload.py index fbfb91a1..ee6fc068 100644 --- a/env/tests/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -1,7 +1,7 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from env.tests.integtest_util import GymlibIntegtestManager +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.workload import Workload from util.workspace import ( fully_resolve_path, diff --git a/util/tests/filesystem_unittest_util.py b/util/tests/filesystem_unittest_util.py new file mode 100644 index 00000000..e69de29b From bf6e8f7bee3aa995c7f6fcd38db1322d8c309552 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 15:17:20 -0500 Subject: [PATCH 22/43] moved structure util to its own file (to avoid running cleantests twice by importing it) --- .gitignore | 2 +- manage/tests/unittest_clean.py | 400 ++++-------------- util/tests/filesystem_unittest_util.py | 130 ++++++ .../unittest_filesystem_unittest_util.py | 96 +++++ util/tests/unittest_workspace.py | 27 +- util/workspace.py | 13 - 6 files changed, 323 insertions(+), 345 deletions(-) create mode 100644 util/tests/unittest_filesystem_unittest_util.py diff --git a/.gitignore b/.gitignore index 601b999d..2185a9c6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ __pycache__/ .conda/ .idea/ build/ -test_clean_scratchspace/ +*_scratchspace/ workspace/ default_*_benchbase_config_*.xml *.egg-info/ \ No newline at end of file diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 863a0d32..2e82d4a9 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -1,13 +1,15 @@ -import copy import logging -import os import shutil import unittest from pathlib import Path -from typing import Any, NewType, cast from manage.cli import MockDBGymWorkspace, clean_workspace -from util.workspace import path_exists_dont_follow_symlinks +from util.tests.filesystem_unittest_util import ( + FilesystemStructure, + create_structure, + make_workspace_structure, + verify_structure, +) # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which # does not go through the `if __name__ == "__main__"` codepath. @@ -17,124 +19,10 @@ logging.basicConfig(level=logging.CRITICAL) -FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) - - class CleanTests(unittest.TestCase): scratchspace_path: Path = Path() workspace_path: Path = Path() - @staticmethod - def create_structure(root_path: Path, structure: FilesystemStructure) -> None: - """ - Create files and directories according to the structure. - """ - - def create_structure_internal( - root_path: Path, cur_path: Path, structure: FilesystemStructure - ) -> None: - for path, content in structure.items(): - full_path: Path = cur_path / path - - if isinstance(content, dict): # Directory - full_path.mkdir(parents=True, exist_ok=True) - create_structure_internal( - root_path, - full_path, - FilesystemStructure(cast(dict[str, Any], content)), - ) - elif isinstance(content, tuple) and content[0] == "file": - assert len(content) == 1 - full_path.touch() - elif isinstance(content, tuple) and content[0] == "symlink": - assert len(content) == 2 - target_path = root_path / content[1] - os.symlink(target_path, full_path) - else: - raise ValueError(f"Unsupported type for path ({path}): {content}") - - root_path.mkdir(parents=True, exist_ok=True) - create_structure_internal(root_path, root_path, structure) - - @staticmethod - def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: - """ - Verify that the files and directories match the expected structure. - """ - - def verify_structure_internal( - root_path: Path, cur_path: Path, structure: FilesystemStructure - ) -> bool: - # Check for the presence of each item specified in the structure - for name, item in structure.items(): - new_cur_path = cur_path / name - if not path_exists_dont_follow_symlinks(new_cur_path): - logging.debug(f"{new_cur_path} does not exist") - return False - elif isinstance(item, dict): - if not new_cur_path.is_dir(): - logging.debug(f"expected {new_cur_path} to be a directory") - return False - if not verify_structure_internal( - root_path, - new_cur_path, - FilesystemStructure(cast(dict[str, Any], item)), - ): - return False - elif isinstance(item, tuple) and item[0] == "file": - if not new_cur_path.is_file(): - logging.debug(f"expected {new_cur_path} to be a regular file") - return False - elif isinstance(item, tuple) and item[0] == "symlink": - if not new_cur_path.is_symlink(): - logging.debug(f"expected {new_cur_path} to be a symlink") - return False - # If item[1] is None, this indicates that we expect the symlink to be broken - if item[1] != None: - expected_target = root_path / item[1] - if not new_cur_path.resolve().samefile(expected_target): - logging.debug( - f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" - ) - return False - else: - assert False, "structure misconfigured" - - # Check for any extra files or directories not described by the structure - expected_names = set(structure.keys()) - actual_names = {entry.name for entry in cur_path.iterdir()} - if not expected_names.issuperset(actual_names): - logging.debug( - f"expected_names={expected_names}, actual_names={actual_names}" - ) - return False - - return True - - if not root_path.exists(): - logging.debug(f"{root_path} does not exist") - return False - return verify_structure_internal(root_path, root_path, structure) - - @staticmethod - def make_workspace_structure( - symlinks_structure: FilesystemStructure, - task_runs_structure: FilesystemStructure, - ) -> FilesystemStructure: - """ - This function exists so that it's easier to refactor the tests in case we ever change - how the workspace is organized. - """ - return FilesystemStructure( - { - "dbgym_workspace": { - "symlinks": symlinks_structure, - "task_runs": task_runs_structure, - "tmp": {}, - } - } - ) - @classmethod def setUpClass(cls) -> None: cls.scratchspace_path = Path.cwd() / "manage/tests/test_clean_scratchspace/" @@ -148,88 +36,6 @@ def tearDown(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) - def test_structure_helpers(self) -> None: - structure = FilesystemStructure( - { - "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, - "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, - "link_to_dir1": ("symlink", "dir1"), - "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), - } - ) - CleanTests.create_structure(self.scratchspace_path, structure) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, structure)) - - extra_dir_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure) - ) - extra_dir_structure["dir4"] = {} - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure) - ) - - missing_dir_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure) - ) - del missing_dir_structure["dir1"] - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure) - ) - - extra_file_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, extra_file_structure) - ) - extra_file_structure["file3.txt"] = ("file",) - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, extra_file_structure) - ) - - missing_file_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, missing_file_structure) - ) - del missing_file_structure["dir1"]["file1.txt"] - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, missing_file_structure) - ) - - extra_link_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, extra_link_structure) - ) - extra_link_structure["link_to_dir3"] = ("symlink", "dir3") - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, extra_link_structure) - ) - - missing_link_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, missing_link_structure) - ) - del missing_link_structure["link_to_dir1"] - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, missing_link_structure) - ) - - wrong_link_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure) - ) - wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure) - ) - def test_nonexistent_workspace(self) -> None: # This just ensures that it doesn't raise an exception. clean_workspace(MockDBGymWorkspace(self.workspace_path)) @@ -237,68 +43,58 @@ def test_nonexistent_workspace(self) -> None: def test_empty_workspace(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {}}) ending_structure = FilesystemStructure({"dbgym_workspace": {}}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: starting_structure = FilesystemStructure( {"dbgym_workspace": {"task_runs": {"file1.txt": ("file",)}}} ) ending_structure = FilesystemStructure({"dbgym_workspace": {"task_runs": {}}}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: # If there are no task runs there can't be any symlinks. starting_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) ending_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self) -> None: starting_symlinks_structure = FilesystemStructure({}) starting_task_runs_structure = FilesystemStructure({}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure({}) ending_task_runs_structure = FilesystemStructure({}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_links_in_symlinks(self) -> None: starting_symlinks_structure = FilesystemStructure({}) starting_task_runs_structure = FilesystemStructure({"run_0": {}}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure({}) ending_task_runs_structure = FilesystemStructure({}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_file_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -307,22 +103,20 @@ def test_link_to_file_directly_in_task_runs(self) -> None: starting_task_runs_structure = FilesystemStructure( {"file1.txt": ("file",), "file2.txt": ("file",)} ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} ) ending_task_runs_structure = FilesystemStructure({"file1.txt": ("file",)}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_dir_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -334,7 +128,7 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -343,15 +137,13 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -363,7 +155,7 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -372,15 +164,13 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_dir_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -392,7 +182,7 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: "dir3": {"file3.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -403,15 +193,13 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: "dir1": {"dir2": {"file1.txt": ("file",)}, "file2.txt": ("file",)}, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_link_crashes(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -423,11 +211,11 @@ def test_link_to_link_crashes(self) -> None: "file1.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) with self.assertRaises(AssertionError): clean_workspace(MockDBGymWorkspace(self.workspace_path)) @@ -444,7 +232,7 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: "file2.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -458,15 +246,13 @@ def test_safe_mode_link_to_dir_with_link(self) -> None: "file1.txt": ("file",), } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -482,7 +268,7 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: "file3.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -497,15 +283,13 @@ def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: "file2.txt": ("file",), } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -522,7 +306,7 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No "file3.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -538,15 +322,13 @@ def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> No }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_aggressive_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -561,7 +343,7 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: "file2.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -572,15 +354,13 @@ def test_aggressive_mode_link_to_dir_with_link(self) -> None: "dir1": {"symlink2": ("symlink", None)}, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="aggressive") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_link_to_file_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -594,11 +374,11 @@ def test_link_to_link_to_file_gives_error(self) -> None: "file2.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) # We disallow links to links so it's an AssertionError with self.assertRaises(AssertionError): @@ -613,11 +393,11 @@ def test_multi_link_loop_gives_error(self) -> None: "dir1": {"symlink2": ("symlink", "dbgym_workspace/symlinks/symlink1")}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) # pathlib disallows multi-link loops so it's a RuntimeError with self.assertRaises(RuntimeError): @@ -628,11 +408,11 @@ def test_link_self_loop_gives_error(self) -> None: {"symlink1": ("symlink", "dbgym_workspace/symlinks/symlink1")} ) starting_task_runs_structure = FilesystemStructure({}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) # pathlib disallows link self-loops so it's a RuntimeError with self.assertRaises(RuntimeError): @@ -656,7 +436,7 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs }, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -674,15 +454,13 @@ def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_itself( self, @@ -698,7 +476,7 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ }, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -712,15 +490,13 @@ def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_ }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -734,7 +510,7 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: }, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -748,15 +524,13 @@ def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_broken_symlink_has_no_effect(self) -> None: starting_symlinks_structure = FilesystemStructure( @@ -774,7 +548,7 @@ def test_broken_symlink_has_no_effect(self) -> None: "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( @@ -783,15 +557,13 @@ def test_broken_symlink_has_no_effect(self) -> None: ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",), "symlink2": ("symlink", None)}} ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) # The idea behind this test is that we shouldn't be following links outside of task_runs, even on safe mode def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save_other_run( @@ -809,7 +581,7 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) starting_structure["external"] = FilesystemStructure( @@ -831,38 +603,34 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save } } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) ending_structure["external"] = { "dir3": {"file3.txt": ("file",), "symlink3": ("symlink", None)} } - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_outside_task_runs_doesnt_get_deleted(self) -> None: starting_symlinks_structure = FilesystemStructure({}) starting_task_runs_structure = FilesystemStructure({"dir1": {}}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) starting_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) ending_symlinks_structure = FilesystemStructure({}) ending_task_runs_structure = FilesystemStructure({}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) ending_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) if __name__ == "__main__": diff --git a/util/tests/filesystem_unittest_util.py b/util/tests/filesystem_unittest_util.py index e69de29b..667bd24e 100644 --- a/util/tests/filesystem_unittest_util.py +++ b/util/tests/filesystem_unittest_util.py @@ -0,0 +1,130 @@ +import logging +import os +from pathlib import Path +from typing import Any, NewType, cast + +FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) + + +def create_structure(root_path: Path, structure: FilesystemStructure) -> None: + """ + Create files and directories according to the structure. + """ + + def create_structure_internal( + root_path: Path, cur_path: Path, structure: FilesystemStructure + ) -> None: + for path, content in structure.items(): + full_path: Path = cur_path / path + + if isinstance(content, dict): # Directory + full_path.mkdir(parents=True, exist_ok=True) + create_structure_internal( + root_path, + full_path, + FilesystemStructure(cast(dict[str, Any], content)), + ) + elif isinstance(content, tuple) and content[0] == "file": + assert len(content) == 1 + full_path.touch() + elif isinstance(content, tuple) and content[0] == "symlink": + assert len(content) == 2 + target_path = root_path / content[1] + os.symlink(target_path, full_path) + else: + raise ValueError(f"Unsupported type for path ({path}): {content}") + + root_path.mkdir(parents=True, exist_ok=True) + create_structure_internal(root_path, root_path, structure) + + +def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: + """ + Verify that the files and directories match the expected structure. + """ + + def verify_structure_internal( + root_path: Path, cur_path: Path, structure: FilesystemStructure + ) -> bool: + # Check for the presence of each item specified in the structure + for name, item in structure.items(): + new_cur_path = cur_path / name + if not path_exists_dont_follow_symlinks(new_cur_path): + logging.debug(f"{new_cur_path} does not exist") + return False + elif isinstance(item, dict): + if not new_cur_path.is_dir(): + logging.debug(f"expected {new_cur_path} to be a directory") + return False + if not verify_structure_internal( + root_path, + new_cur_path, + FilesystemStructure(cast(dict[str, Any], item)), + ): + return False + elif isinstance(item, tuple) and item[0] == "file": + if not new_cur_path.is_file(): + logging.debug(f"expected {new_cur_path} to be a regular file") + return False + elif isinstance(item, tuple) and item[0] == "symlink": + if not new_cur_path.is_symlink(): + logging.debug(f"expected {new_cur_path} to be a symlink") + return False + # If item[1] is None, this indicates that we expect the symlink to be broken + if item[1] != None: + expected_target = root_path / item[1] + if not new_cur_path.resolve().samefile(expected_target): + logging.debug( + f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" + ) + return False + else: + assert False, "structure misconfigured" + + # Check for any extra files or directories not described by the structure + expected_names = set(structure.keys()) + actual_names = {entry.name for entry in cur_path.iterdir()} + if not expected_names.issuperset(actual_names): + logging.debug( + f"expected_names={expected_names}, actual_names={actual_names}" + ) + return False + + return True + + if not root_path.exists(): + logging.debug(f"{root_path} does not exist") + return False + return verify_structure_internal(root_path, root_path, structure) + + +def make_workspace_structure( + symlinks_structure: FilesystemStructure, + task_runs_structure: FilesystemStructure, +) -> FilesystemStructure: + """ + This function exists so that it's easier to refactor the tests in case we ever change + how the workspace is organized. + """ + return FilesystemStructure( + { + "dbgym_workspace": { + "symlinks": symlinks_structure, + "task_runs": task_runs_structure, + "tmp": {}, + } + } + ) + + +def path_exists_dont_follow_symlinks(path: Path) -> bool: + """ + As of writing this comment, ray is currently constraining us to python <3.12. However, the "follow_symlinks" option in + Path.exists() only comes up in python 3.12. Thus, this is the only way to check if a path exists without following symlinks. + """ + # If the path exists and is a symlink, os.path.islink() will be true (even if the symlink is broken) + if os.path.islink(path): + return True + # Otherwise, we know it's either non-existent or not a symlink, so path.exists() works fine + else: + return path.exists() diff --git a/util/tests/unittest_filesystem_unittest_util.py b/util/tests/unittest_filesystem_unittest_util.py new file mode 100644 index 00000000..04f3fe7d --- /dev/null +++ b/util/tests/unittest_filesystem_unittest_util.py @@ -0,0 +1,96 @@ +import copy +import shutil +import unittest +from pathlib import Path + +from util.tests.filesystem_unittest_util import ( + FilesystemStructure, + create_structure, + verify_structure, +) + + +class FilesystemUnittestUtilTests(unittest.TestCase): + scratchspace_path: Path = Path() + + @classmethod + def setUpClass(cls) -> None: + cls.scratchspace_path = ( + Path.cwd() / "util/tests/test_filesystem_unittest_util_scratchspace/" + ) + + def setUp(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + def tearDown(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + def test_filesystem_unittest_util(self) -> None: + structure = FilesystemStructure( + { + "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, + "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, + "link_to_dir1": ("symlink", "dir1"), + "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), + } + ) + create_structure(self.scratchspace_path, structure) + self.assertTrue(verify_structure(self.scratchspace_path, structure)) + + extra_dir_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, extra_dir_structure)) + extra_dir_structure["dir4"] = {} + self.assertFalse(verify_structure(self.scratchspace_path, extra_dir_structure)) + + missing_dir_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, missing_dir_structure)) + del missing_dir_structure["dir1"] + self.assertFalse( + verify_structure(self.scratchspace_path, missing_dir_structure) + ) + + extra_file_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, extra_file_structure)) + extra_file_structure["file3.txt"] = ("file",) + self.assertFalse(verify_structure(self.scratchspace_path, extra_file_structure)) + + missing_file_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue( + verify_structure(self.scratchspace_path, missing_file_structure) + ) + del missing_file_structure["dir1"]["file1.txt"] + self.assertFalse( + verify_structure(self.scratchspace_path, missing_file_structure) + ) + + extra_link_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, extra_link_structure)) + extra_link_structure["link_to_dir3"] = ("symlink", "dir3") + self.assertFalse(verify_structure(self.scratchspace_path, extra_link_structure)) + + missing_link_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue( + verify_structure(self.scratchspace_path, missing_link_structure) + ) + del missing_link_structure["link_to_dir1"] + self.assertFalse( + verify_structure(self.scratchspace_path, missing_link_structure) + ) + + wrong_link_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, wrong_link_structure)) + wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") + self.assertFalse(verify_structure(self.scratchspace_path, wrong_link_structure)) + + +if __name__ == "__main__": + unittest.main() diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 2114b860..a4d8fa2c 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -4,7 +4,12 @@ import unittest from pathlib import Path -from manage.tests.unittest_clean import CleanTests, FilesystemStructure +from util.tests.filesystem_unittest_util import ( + FilesystemStructure, + create_structure, + make_workspace_structure, + verify_structure, +) from util.workspace import DBGymWorkspace @@ -43,31 +48,25 @@ def get_workspace_init_structure(workspace: DBGymWorkspace) -> FilesystemStructu workspace.dbgym_this_run_path.name: {}, } ) - return CleanTests.make_workspace_structure( - symlinks_structure, task_runs_structure - ) + return make_workspace_structure(symlinks_structure, task_runs_structure) def test_init_from_nonexistent_workspace(self) -> None: starting_structure = FilesystemStructure({}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) workspace = DBGymWorkspace(self.workspace_path) ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_init_from_empty_workspace(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {}}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) workspace = DBGymWorkspace(self.workspace_path) ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_init_from_already_initialized_workspace(self) -> None: # This first initialization will create a task run. @@ -87,9 +86,7 @@ def test_init_from_already_initialized_workspace(self) -> None: f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", ) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) if __name__ == "__main__": diff --git a/util/workspace.py b/util/workspace.py index 474a7e70..120f7f26 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -382,19 +382,6 @@ def is_fully_resolved(path: Path) -> bool: return str(resolved_path) == str(path) -def path_exists_dont_follow_symlinks(path: Path) -> bool: - """ - As of writing this comment, ray is currently constraining us to python <3.12. However, the "follow_symlinks" option in - Path.exists() only comes up in python 3.12. Thus, this is the only way to check if a path exists without following symlinks. - """ - # If the path exists and is a symlink, os.path.islink() will be true (even if the symlink is broken) - if os.path.islink(path): - return True - # Otherwise, we know it's either non-existent or not a symlink, so path.exists() works fine - else: - return path.exists() - - def parent_dpath_of_path(dpath: Path) -> Path: """ This function only calls Path.parent, but in a safer way. From 0e774517a60727e752d27649175baa925115ff02 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 15:40:19 -0500 Subject: [PATCH 23/43] gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2185a9c6..eb20bf15 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ build/ *_scratchspace/ workspace/ default_*_benchbase_config_*.xml -*.egg-info/ \ No newline at end of file +*.egg-info/ +*.code-workspace \ No newline at end of file From 316ca02d08f279225b2224624456a8cd966b956b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 15:45:26 -0500 Subject: [PATCH 24/43] get_updated_structure_from_workspace_init --- util/tests/unittest_workspace.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index a4d8fa2c..72a5d9af 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -10,7 +10,7 @@ make_workspace_structure, verify_structure, ) -from util.workspace import DBGymWorkspace +from util.workspace import DBGymWorkspace, save_file class WorkspaceTests(unittest.TestCase): @@ -50,6 +50,19 @@ def get_workspace_init_structure(workspace: DBGymWorkspace) -> FilesystemStructu ) return make_workspace_structure(symlinks_structure, task_runs_structure) + @staticmethod + def get_updated_structure_from_workspace_init( + structure: FilesystemStructure, workspace: DBGymWorkspace + ) -> FilesystemStructure: + structure["dbgym_workspace"]["task_runs"][ + workspace.dbgym_this_run_path.name + ] = {} + structure["dbgym_workspace"]["task_runs"]["latest_run.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", + ) + return structure + def test_init_from_nonexistent_workspace(self) -> None: starting_structure = FilesystemStructure({}) create_structure(self.scratchspace_path, starting_structure) @@ -78,12 +91,8 @@ def test_init_from_already_initialized_workspace(self) -> None: # so DBGymWorkspace.num_times_created_this_run would be 0. DBGymWorkspace.num_times_created_this_run = 0 workspace = DBGymWorkspace(self.workspace_path) - ending_structure["dbgym_workspace"]["task_runs"][ - workspace.dbgym_this_run_path.name - ] = {} - ending_structure["dbgym_workspace"]["task_runs"]["latest_run.link"] = ( - "symlink", - f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", + ending_structure = WorkspaceTests.get_updated_structure_from_workspace_init( + ending_structure, workspace ) self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) From 88f31b3bc8ccc64599c0431e2bf41bc81db83fdb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 16:04:40 -0500 Subject: [PATCH 25/43] test_link_result_basic_functionality --- util/tests/unittest_workspace.py | 36 ++++++++++++++++++++- util/workspace.py | 55 +++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 72a5d9af..a6454487 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -10,7 +10,7 @@ make_workspace_structure, verify_structure, ) -from util.workspace import DBGymWorkspace, save_file +from util.workspace import DBGymWorkspace, link_result, save_file class WorkspaceTests(unittest.TestCase): @@ -33,6 +33,7 @@ def setUp(self) -> None: DBGymWorkspace.num_times_created_this_run = 0 def tearDown(self) -> None: + # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) @@ -63,6 +64,10 @@ def get_updated_structure_from_workspace_init( ) return structure + def test_init_fields(self) -> None: + workspace = DBGymWorkspace(self.workspace_path) + self.assertEqual(workspace.app_name, "dbgym") + def test_init_from_nonexistent_workspace(self) -> None: starting_structure = FilesystemStructure({}) create_structure(self.scratchspace_path, starting_structure) @@ -97,6 +102,35 @@ def test_init_from_already_initialized_workspace(self) -> None: self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) + def test_link_result_basic_functionality(self) -> None: + # Setup. + workspace = DBGymWorkspace(self.workspace_path) + ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) + + # Make a result file. + result_path = workspace.dbgym_this_run_path / "result.txt" + result_path.touch() + ending_structure["dbgym_workspace"]["task_runs"][ + workspace.dbgym_this_run_path.name + ]["result.txt"] = ("file",) + + # Link the result file. + workspace.link_result(result_path) + ending_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + ending_structure["dbgym_workspace"]["symlinks"]["dbgym"]["result.txt.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}/result.txt", + ) + + # Verify structure. + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) + + # TODO: test overriding existing symlink + # TODO: test linking result from another run should raise + # TODO: test that it should link in the agent dir in the links + # TODO: test that it will ignore the directory structure (unlike save which keeps it) + # TODO: test linking a symlink or a non-fully-resolved path + if __name__ == "__main__": unittest.main() diff --git a/util/workspace.py b/util/workspace.py index 120f7f26..7dc3278d 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -168,6 +168,9 @@ def __init__(self, dbgym_workspace_path: Path): self.base_dbgym_repo_dpath = get_base_dbgym_repo_dpath() self.cur_path_list: list[str] = ["dbgym"] + self.app_name = ( + "dbgym" # TODO: discover this dynamically. app means dbgym or an agent + ) # Set and create paths. self.dbgym_workspace_path = dbgym_workspace_path @@ -220,6 +223,56 @@ def __init__(self, dbgym_workspace_path: Path): try_remove_file(self.dbgym_latest_run_path) try_create_symlink(self.dbgym_this_run_path, self.dbgym_latest_run_path) + # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead + def link_result( + self, + result_fordpath: Path, + custom_result_name: Optional[str] = None, + ) -> Path: + """ + result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. + Further, result_fordpath must have been generated by this invocation to task.py. This also means that + result_fordpath itself can be a file or a dir but not a symlink. + Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside + symlinks/[codebase]/[org]/. + Will override the old symlink if there is one, so that symlinks/ always contains the latest generated + version of a file. + This function will return the path to the symlink that was created. + """ + assert isinstance(result_fordpath, Path) + assert is_fully_resolved( + result_fordpath + ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" + assert is_child_path(result_fordpath, self.dbgym_this_run_path) + assert not os.path.islink(result_fordpath) + + if type(custom_result_name) is str: + result_name = custom_result_name + else: + if os.path.isfile(result_fordpath): + result_name = basename_of_path(result_fordpath) + ".link" + elif os.path.isdir(result_fordpath): + result_name = basename_of_path(result_fordpath) + ".link" + else: + raise AssertionError("result_fordpath must be either a file or dir") + + # TODO: check that it was generated in this run_*/. + symlink_parent_dpath = self.dbgym_symlinks_path / self.app_name + symlink_parent_dpath.mkdir(parents=True, exist_ok=True) + + # Remove the old symlink ("old" meaning created in an earlier run) if there is one + # Note that in a multi-threaded setting, this might remove one created by a process in the same run, + # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink + # file of the current run regardless of the order of threads. + assert result_name.endswith(".link") and not result_name.endswith( + ".link.link" + ), f'result_name ({result_name}) should end with ".link"' + symlink_path = symlink_parent_dpath / result_name + try_remove_file(symlink_path) + try_create_symlink(result_fordpath, symlink_path) + + return symlink_path + # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is # explained further in the documentation. def append_group(self, name: str) -> None: @@ -564,7 +617,7 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: shutil.copy(fpath, copy_fpath) -# TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead +# TODO(phw2): deprecate this once I'm done with unittest_workspace.py def link_result( dbgym_workspace: DBGymWorkspace, result_fordpath: Path, From 3f956329a7cbbbe934300ee369c4f656dbc40a07 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 16:37:30 -0500 Subject: [PATCH 26/43] refactored workspace tests to use helpers --- util/tests/unittest_workspace.py | 130 ++++++++++++++----------------- 1 file changed, 57 insertions(+), 73 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index a6454487..9a9d4d11 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -26,104 +26,88 @@ def setUp(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) - # Reset this to avoid the error of it being created twice. - # The assertion checking that it's only been been created once is meant to avoid creating multiple - # objects for the same workspace. However, it's fine to create it multiple times in this case because - # we're deleting the scratchspace before each test. - DBGymWorkspace.num_times_created_this_run = 0 + self.workspace = None + self.expected_structure = None def tearDown(self) -> None: # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) - @staticmethod - def get_workspace_init_structure(workspace: DBGymWorkspace) -> FilesystemStructure: - symlinks_structure = FilesystemStructure({}) - task_runs_structure = FilesystemStructure( - { - "latest_run.link": ( - "symlink", - f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", + # All these helper functions will perform an action, update the expected structure, and then verify the structure. + def init_workspace_helper(self) -> None: + # Reset this to avoid the error of it being created twice. + # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. + DBGymWorkspace.num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(self.workspace_path) + + if self.expected_structure is None: + self.expected_structure = make_workspace_structure( + FilesystemStructure({}), + FilesystemStructure( + { + "latest_run.link": ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", + ), + self.workspace.dbgym_this_run_path.name: {}, + } ), - workspace.dbgym_this_run_path.name: {}, - } + ) + else: + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ] = {} + self.expected_structure["dbgym_workspace"]["task_runs"]["latest_run.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", + ) + + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) ) - return make_workspace_structure(symlinks_structure, task_runs_structure) - - @staticmethod - def get_updated_structure_from_workspace_init( - structure: FilesystemStructure, workspace: DBGymWorkspace - ) -> FilesystemStructure: - structure["dbgym_workspace"]["task_runs"][ - workspace.dbgym_this_run_path.name - ] = {} - structure["dbgym_workspace"]["task_runs"]["latest_run.link"] = ( + + def make_result_helper(self, result_name: str="result.txt") -> Path: + result_path = self.workspace.dbgym_this_run_path / result_name + result_path.touch() + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][result_name] = ("file",) + self.assertTrue(verify_structure(self.scratchspace_path, self.expected_structure)) + return result_path + + def link_result_helper(self, result_path: Path) -> None: + self.workspace.link_result(result_path) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( "symlink", - f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) - return structure + self.assertTrue(verify_structure(self.scratchspace_path, self.expected_structure)) def test_init_fields(self) -> None: workspace = DBGymWorkspace(self.workspace_path) self.assertEqual(workspace.app_name, "dbgym") def test_init_from_nonexistent_workspace(self) -> None: - starting_structure = FilesystemStructure({}) - create_structure(self.scratchspace_path, starting_structure) - - workspace = DBGymWorkspace(self.workspace_path) - ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) - - self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) + self.init_workspace_helper() def test_init_from_empty_workspace(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {}}) create_structure(self.scratchspace_path, starting_structure) - workspace = DBGymWorkspace(self.workspace_path) - ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) - - self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) + self.init_workspace_helper() def test_init_from_already_initialized_workspace(self) -> None: - # This first initialization will create a task run. - workspace = DBGymWorkspace(self.workspace_path) - ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) - - # The second initialization will create a second task run. - # Make sure to reset this. In real usage, the second run would be a different Python process - # so DBGymWorkspace.num_times_created_this_run would be 0. - DBGymWorkspace.num_times_created_this_run = 0 - workspace = DBGymWorkspace(self.workspace_path) - ending_structure = WorkspaceTests.get_updated_structure_from_workspace_init( - ending_structure, workspace - ) - - self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) + self.init_workspace_helper() + self.init_workspace_helper() def test_link_result_basic_functionality(self) -> None: - # Setup. - workspace = DBGymWorkspace(self.workspace_path) - ending_structure = WorkspaceTests.get_workspace_init_structure(workspace) - - # Make a result file. - result_path = workspace.dbgym_this_run_path / "result.txt" - result_path.touch() - ending_structure["dbgym_workspace"]["task_runs"][ - workspace.dbgym_this_run_path.name - ]["result.txt"] = ("file",) - - # Link the result file. - workspace.link_result(result_path) - ending_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - ending_structure["dbgym_workspace"]["symlinks"]["dbgym"]["result.txt.link"] = ( - "symlink", - f"dbgym_workspace/task_runs/{workspace.dbgym_this_run_path.name}/result.txt", - ) - - # Verify structure. - self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) + self.init_workspace_helper() + result_path = self.make_result_helper() + self.link_result_helper(result_path) # TODO: test overriding existing symlink # TODO: test linking result from another run should raise From e65b0d29701e62ef7ca7f1ae0213e00f799bb10b Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 16:44:44 -0500 Subject: [PATCH 27/43] test_link_result_invalid_custom_link_name and test_link_result_valid_custom_link_name --- util/tests/unittest_workspace.py | 26 ++++++++++++++++++++++---- util/workspace.py | 16 ++++++++-------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 9a9d4d11..4c0f4dbc 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -1,6 +1,7 @@ # TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. import shutil +from typing import Optional import unittest from pathlib import Path @@ -76,11 +77,12 @@ def make_result_helper(self, result_name: str="result.txt") -> Path: self.assertTrue(verify_structure(self.scratchspace_path, self.expected_structure)) return result_path - def link_result_helper(self, result_path: Path) -> None: - self.workspace.link_result(result_path) + def link_result_helper(self, result_path: Path, custom_link_name: Optional[str]=None) -> None: + self.workspace.link_result(result_path, custom_link_name=custom_link_name) + link_name = f"{result_path.name}.link" if custom_link_name is None else custom_link_name self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - f"{result_path.name}.link" + link_name ] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", @@ -97,7 +99,6 @@ def test_init_from_nonexistent_workspace(self) -> None: def test_init_from_empty_workspace(self) -> None: starting_structure = FilesystemStructure({"dbgym_workspace": {}}) create_structure(self.scratchspace_path, starting_structure) - self.init_workspace_helper() def test_init_from_already_initialized_workspace(self) -> None: @@ -109,6 +110,23 @@ def test_link_result_basic_functionality(self) -> None: result_path = self.make_result_helper() self.link_result_helper(result_path) + def test_link_result_invalid_custom_link_name(self) -> None: + self.init_workspace_helper() + result_path = self.make_result_helper() + with self.assertRaises(AssertionError): + self.link_result_helper(result_path, custom_link_name="custom_link") + + def test_link_result_valid_custom_link_name(self) -> None: + self.init_workspace_helper() + result_path = self.make_result_helper() + self.link_result_helper(result_path, custom_link_name="custom_link.link") + + def test_link_same_result_twice(self) -> None: + self.init_workspace_helper() + result_path = self.make_result_helper() + self.link_result_helper(result_path) + self.link_result_helper(result_path) + # TODO: test overriding existing symlink # TODO: test linking result from another run should raise # TODO: test that it should link in the agent dir in the links diff --git a/util/workspace.py b/util/workspace.py index 7dc3278d..060cae05 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -227,7 +227,7 @@ def __init__(self, dbgym_workspace_path: Path): def link_result( self, result_fordpath: Path, - custom_result_name: Optional[str] = None, + custom_link_name: Optional[str] = None, ) -> Path: """ result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. @@ -246,13 +246,13 @@ def link_result( assert is_child_path(result_fordpath, self.dbgym_this_run_path) assert not os.path.islink(result_fordpath) - if type(custom_result_name) is str: - result_name = custom_result_name + if type(custom_link_name) is str: + link_name = custom_link_name else: if os.path.isfile(result_fordpath): - result_name = basename_of_path(result_fordpath) + ".link" + link_name = basename_of_path(result_fordpath) + ".link" elif os.path.isdir(result_fordpath): - result_name = basename_of_path(result_fordpath) + ".link" + link_name = basename_of_path(result_fordpath) + ".link" else: raise AssertionError("result_fordpath must be either a file or dir") @@ -264,10 +264,10 @@ def link_result( # Note that in a multi-threaded setting, this might remove one created by a process in the same run, # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink # file of the current run regardless of the order of threads. - assert result_name.endswith(".link") and not result_name.endswith( + assert link_name.endswith(".link") and not link_name.endswith( ".link.link" - ), f'result_name ({result_name}) should end with ".link"' - symlink_path = symlink_parent_dpath / result_name + ), f'link_name ({link_name}) should end with ".link"' + symlink_path = symlink_parent_dpath / link_name try_remove_file(symlink_path) try_create_symlink(result_fordpath, symlink_path) From c3e17f400ecad9451a2a2d4cd57788d06f639e78 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 16:45:38 -0500 Subject: [PATCH 28/43] fmt and mypy --- util/tests/unittest_workspace.py | 34 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 4c0f4dbc..63572ef4 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -1,9 +1,9 @@ # TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. import shutil -from typing import Optional import unittest from pathlib import Path +from typing import Optional from util.tests.filesystem_unittest_util import ( FilesystemStructure, @@ -27,8 +27,8 @@ def setUp(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) - self.workspace = None - self.expected_structure = None + self.workspace: Optional[DBGymWorkspace] = None + self.expected_structure: Optional[FilesystemStructure] = None def tearDown(self) -> None: # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). @@ -59,7 +59,9 @@ def init_workspace_helper(self) -> None: self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name ] = {} - self.expected_structure["dbgym_workspace"]["task_runs"]["latest_run.link"] = ( + self.expected_structure["dbgym_workspace"]["task_runs"][ + "latest_run.link" + ] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", ) @@ -68,26 +70,34 @@ def init_workspace_helper(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - def make_result_helper(self, result_name: str="result.txt") -> Path: + def make_result_helper(self, result_name: str = "result.txt") -> Path: + assert self.workspace is not None and self.expected_structure is not None result_path = self.workspace.dbgym_this_run_path / result_name result_path.touch() self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name ][result_name] = ("file",) - self.assertTrue(verify_structure(self.scratchspace_path, self.expected_structure)) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) return result_path - def link_result_helper(self, result_path: Path, custom_link_name: Optional[str]=None) -> None: + def link_result_helper( + self, result_path: Path, custom_link_name: Optional[str] = None + ) -> None: + assert self.workspace is not None and self.expected_structure is not None self.workspace.link_result(result_path, custom_link_name=custom_link_name) - link_name = f"{result_path.name}.link" if custom_link_name is None else custom_link_name + link_name = ( + f"{result_path.name}.link" if custom_link_name is None else custom_link_name + ) self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ - link_name - ] = ( + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][link_name] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) - self.assertTrue(verify_structure(self.scratchspace_path, self.expected_structure)) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) def test_init_fields(self) -> None: workspace = DBGymWorkspace(self.workspace_path) From fc2f424773017c061ff7d6947b0b2bca039b2374 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 16:48:43 -0500 Subject: [PATCH 29/43] test_link_same_result_with_different_name --- util/tests/unittest_workspace.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 63572ef4..011340ad 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -11,7 +11,7 @@ make_workspace_structure, verify_structure, ) -from util.workspace import DBGymWorkspace, link_result, save_file +from util.workspace import DBGymWorkspace class WorkspaceTests(unittest.TestCase): @@ -85,12 +85,14 @@ def make_result_helper(self, result_name: str = "result.txt") -> Path: def link_result_helper( self, result_path: Path, custom_link_name: Optional[str] = None ) -> None: + assert custom_link_name != f"{result_path.name}.link", "You probably made a mistake in the test" assert self.workspace is not None and self.expected_structure is not None self.workspace.link_result(result_path, custom_link_name=custom_link_name) link_name = ( f"{result_path.name}.link" if custom_link_name is None else custom_link_name ) - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + if "dbgym" not in self.expected_structure["dbgym_workspace"]["symlinks"]: + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][link_name] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", @@ -124,12 +126,12 @@ def test_link_result_invalid_custom_link_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() with self.assertRaises(AssertionError): - self.link_result_helper(result_path, custom_link_name="custom_link") + self.link_result_helper(result_path, custom_link_name="custom") def test_link_result_valid_custom_link_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() - self.link_result_helper(result_path, custom_link_name="custom_link.link") + self.link_result_helper(result_path, custom_link_name="custom.link") def test_link_same_result_twice(self) -> None: self.init_workspace_helper() @@ -137,6 +139,12 @@ def test_link_same_result_twice(self) -> None: self.link_result_helper(result_path) self.link_result_helper(result_path) + def test_link_same_result_with_different_name(self) -> None: + self.init_workspace_helper() + result_path = self.make_result_helper() + self.link_result_helper(result_path) + self.link_result_helper(result_path, custom_link_name="custom.link") + # TODO: test overriding existing symlink # TODO: test linking result from another run should raise # TODO: test that it should link in the agent dir in the links From 905a65c19f9d6cefe2beee3434e05a9b27d46b54 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 16:56:43 -0500 Subject: [PATCH 30/43] got rid of link_result_helper --- util/tests/unittest_workspace.py | 78 +++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 011340ad..d2c73b5a 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -36,6 +36,7 @@ def tearDown(self) -> None: shutil.rmtree(self.scratchspace_path) # All these helper functions will perform an action, update the expected structure, and then verify the structure. + # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). def init_workspace_helper(self) -> None: # Reset this to avoid the error of it being created twice. # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. @@ -82,25 +83,6 @@ def make_result_helper(self, result_name: str = "result.txt") -> Path: ) return result_path - def link_result_helper( - self, result_path: Path, custom_link_name: Optional[str] = None - ) -> None: - assert custom_link_name != f"{result_path.name}.link", "You probably made a mistake in the test" - assert self.workspace is not None and self.expected_structure is not None - self.workspace.link_result(result_path, custom_link_name=custom_link_name) - link_name = ( - f"{result_path.name}.link" if custom_link_name is None else custom_link_name - ) - if "dbgym" not in self.expected_structure["dbgym_workspace"]["symlinks"]: - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][link_name] = ( - "symlink", - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", - ) - self.assertTrue( - verify_structure(self.scratchspace_path, self.expected_structure) - ) - def test_init_fields(self) -> None: workspace = DBGymWorkspace(self.workspace_path) self.assertEqual(workspace.app_name, "dbgym") @@ -120,30 +102,72 @@ def test_init_from_already_initialized_workspace(self) -> None: def test_link_result_basic_functionality(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() - self.link_result_helper(result_path) + self.workspace.link_result(result_path) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][f"{result_path.name}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) def test_link_result_invalid_custom_link_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() with self.assertRaises(AssertionError): - self.link_result_helper(result_path, custom_link_name="custom") + self.workspace.link_result(result_path, custom_link_name=f"custom") def test_link_result_valid_custom_link_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() - self.link_result_helper(result_path, custom_link_name="custom.link") + self.workspace.link_result(result_path, custom_link_name="custom.link") + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"]["custom.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) - def test_link_same_result_twice(self) -> None: + def test_link_same_result_twice_with_same_link_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() - self.link_result_helper(result_path) - self.link_result_helper(result_path) + self.workspace.link_result(result_path) + self.workspace.link_result(result_path) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) def test_link_same_result_with_different_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() - self.link_result_helper(result_path) - self.link_result_helper(result_path, custom_link_name="custom.link") + self.workspace.link_result(result_path) + self.workspace.link_result(result_path, custom_link_name="custom.link") + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"custom.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) # TODO: test overriding existing symlink # TODO: test linking result from another run should raise From 1484715aa90c1b27a5aa401c7f5c33393b704373 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 17:00:57 -0500 Subject: [PATCH 31/43] test_link_result_does_not_copy_directory_structure_to_symlinks_dir --- util/tests/unittest_workspace.py | 36 +++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index d2c73b5a..501e26e0 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -71,13 +71,24 @@ def init_workspace_helper(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - def make_result_helper(self, result_name: str = "result.txt") -> Path: + def make_result_helper(self, relative_path: str = "result.txt") -> Path: assert self.workspace is not None and self.expected_structure is not None - result_path = self.workspace.dbgym_this_run_path / result_name + result_path = self.workspace.dbgym_this_run_path / relative_path + # Create parent directories if needed + result_path.parent.mkdir(parents=True, exist_ok=True) result_path.touch() - self.expected_structure["dbgym_workspace"]["task_runs"][ + + # Build up the nested dict structure for the expected path + current_dict = self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name - ][result_name] = ("file",) + ] + path_parts = relative_path.split("/") + for part in path_parts[:-1]: + if part not in current_dict: + current_dict[part] = {} + current_dict = current_dict[part] + current_dict[path_parts[-1]] = ("file",) + self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) ) @@ -112,6 +123,22 @@ def test_link_result_basic_functionality(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) + def test_link_result_does_not_copy_directory_structure_to_symlinks_dir(self) -> None: + """ + Unlike save_file, link_result does not copy the directory structure to the symlinks directory. + """ + self.init_workspace_helper() + result_path = self.make_result_helper(relative_path="dir1/dir2/dir3/result.txt") + self.workspace.link_result(result_path) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][f"{result_path.name}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + def test_link_result_invalid_custom_link_name(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() @@ -169,7 +196,6 @@ def test_link_same_result_with_different_name(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - # TODO: test overriding existing symlink # TODO: test linking result from another run should raise # TODO: test that it should link in the agent dir in the links # TODO: test that it will ignore the directory structure (unlike save which keeps it) From d9603ebf1b059a0c4b80c1f757ab44126a8b160a Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 17:02:48 -0500 Subject: [PATCH 32/43] fmt and mypy --- util/tests/unittest_workspace.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 501e26e0..1883f1cc 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -77,7 +77,7 @@ def make_result_helper(self, relative_path: str = "result.txt") -> Path: # Create parent directories if needed result_path.parent.mkdir(parents=True, exist_ok=True) result_path.touch() - + # Build up the nested dict structure for the expected path current_dict = self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name @@ -112,10 +112,13 @@ def test_init_from_already_initialized_workspace(self) -> None: def test_link_result_basic_functionality(self) -> None: self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() self.workspace.link_result(result_path) self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][f"{result_path.name}.link"] = ( + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) @@ -123,15 +126,20 @@ def test_link_result_basic_functionality(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - def test_link_result_does_not_copy_directory_structure_to_symlinks_dir(self) -> None: + def test_link_result_does_not_copy_directory_structure_to_symlinks_dir( + self, + ) -> None: """ Unlike save_file, link_result does not copy the directory structure to the symlinks directory. """ self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper(relative_path="dir1/dir2/dir3/result.txt") self.workspace.link_result(result_path) self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][f"{result_path.name}.link"] = ( + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", ) @@ -141,16 +149,20 @@ def test_link_result_does_not_copy_directory_structure_to_symlinks_dir(self) -> def test_link_result_invalid_custom_link_name(self) -> None: self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() with self.assertRaises(AssertionError): self.workspace.link_result(result_path, custom_link_name=f"custom") def test_link_result_valid_custom_link_name(self) -> None: self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() self.workspace.link_result(result_path, custom_link_name="custom.link") self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} - self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"]["custom.link"] = ( + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + "custom.link" + ] = ( "symlink", f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", ) @@ -160,6 +172,7 @@ def test_link_result_valid_custom_link_name(self) -> None: def test_link_same_result_twice_with_same_link_name(self) -> None: self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() self.workspace.link_result(result_path) self.workspace.link_result(result_path) @@ -176,6 +189,7 @@ def test_link_same_result_twice_with_same_link_name(self) -> None: def test_link_same_result_with_different_name(self) -> None: self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() self.workspace.link_result(result_path) self.workspace.link_result(result_path, custom_link_name="custom.link") @@ -198,7 +212,6 @@ def test_link_same_result_with_different_name(self) -> None: # TODO: test linking result from another run should raise # TODO: test that it should link in the agent dir in the links - # TODO: test that it will ignore the directory structure (unlike save which keeps it) # TODO: test linking a symlink or a non-fully-resolved path From 4dde8e683a0051d3bd7e016070a9c51e5987fffd Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 19:32:45 -0500 Subject: [PATCH 33/43] test_link_result_from_another_run_raises_error --- util/tests/unittest_workspace.py | 9 ++++++++- util/workspace.py | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 1883f1cc..b162a785 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -151,7 +151,7 @@ def test_link_result_invalid_custom_link_name(self) -> None: self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() - with self.assertRaises(AssertionError): + with self.assertRaisesRegex(AssertionError, "link_name \\(custom\\) should end with \"\\.link\""): self.workspace.link_result(result_path, custom_link_name=f"custom") def test_link_result_valid_custom_link_name(self) -> None: @@ -210,6 +210,13 @@ def test_link_same_result_with_different_name(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) + def test_link_result_from_another_run_raises_error(self) -> None: + self.init_workspace_helper() + result_path = self.make_result_helper() + self.init_workspace_helper() + with self.assertRaisesRegex(AssertionError, "The result must have been generated in \*this\* run\_\*/ dir"): + self.workspace.link_result(result_path) + # TODO: test linking result from another run should raise # TODO: test that it should link in the agent dir in the links # TODO: test linking a symlink or a non-fully-resolved path diff --git a/util/workspace.py b/util/workspace.py index 060cae05..2582940c 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -243,7 +243,7 @@ def link_result( assert is_fully_resolved( result_fordpath ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - assert is_child_path(result_fordpath, self.dbgym_this_run_path) + assert is_child_path(result_fordpath, self.dbgym_this_run_path), "The result must have been generated in *this* run_*/ dir" assert not os.path.islink(result_fordpath) if type(custom_link_name) is str: @@ -256,7 +256,6 @@ def link_result( else: raise AssertionError("result_fordpath must be either a file or dir") - # TODO: check that it was generated in this run_*/. symlink_parent_dpath = self.dbgym_symlinks_path / self.app_name symlink_parent_dpath.mkdir(parents=True, exist_ok=True) From 7381e07730720a72f7a6336f822ba1e80e08d561 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 19:44:15 -0500 Subject: [PATCH 34/43] test_link_result_from_external_dir_raises_error and make_file_helper --- util/tests/unittest_workspace.py | 60 +++++++++++++++++++++++--------- util/workspace.py | 4 ++- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index b162a785..c3b1a8b6 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -30,10 +30,10 @@ def setUp(self) -> None: self.workspace: Optional[DBGymWorkspace] = None self.expected_structure: Optional[FilesystemStructure] = None - def tearDown(self) -> None: - # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). - if self.scratchspace_path.exists(): - shutil.rmtree(self.scratchspace_path) + # def tearDown(self) -> None: + # # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). + # if self.scratchspace_path.exists(): + # shutil.rmtree(self.scratchspace_path) # All these helper functions will perform an action, update the expected structure, and then verify the structure. # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). @@ -71,17 +71,17 @@ def init_workspace_helper(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - def make_result_helper(self, relative_path: str = "result.txt") -> Path: + def make_file_helper(self, relative_path: str) -> Path: assert self.workspace is not None and self.expected_structure is not None - result_path = self.workspace.dbgym_this_run_path / relative_path - # Create parent directories if needed - result_path.parent.mkdir(parents=True, exist_ok=True) - result_path.touch() + assert ( + ".." not in relative_path + ), 'relative_path should not contain ".." (it should be inside the scratchspace dir)' + file_path = self.scratchspace_path / relative_path + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.touch() # Build up the nested dict structure for the expected path - current_dict = self.expected_structure["dbgym_workspace"]["task_runs"][ - self.workspace.dbgym_this_run_path.name - ] + current_dict = self.expected_structure path_parts = relative_path.split("/") for part in path_parts[:-1]: if part not in current_dict: @@ -92,7 +92,16 @@ def make_result_helper(self, relative_path: str = "result.txt") -> Path: self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) ) - return result_path + return file_path + + def make_result_helper(self, relative_path: str = "result.txt") -> Path: + assert self.workspace is not None and self.expected_structure is not None + assert ( + ".." not in relative_path + ), 'relative_path should not contain ".." (it should be inside the run_*/ dir)' + return self.make_file_helper( + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{relative_path}" + ) def test_init_fields(self) -> None: workspace = DBGymWorkspace(self.workspace_path) @@ -151,7 +160,9 @@ def test_link_result_invalid_custom_link_name(self) -> None: self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None result_path = self.make_result_helper() - with self.assertRaisesRegex(AssertionError, "link_name \\(custom\\) should end with \"\\.link\""): + with self.assertRaisesRegex( + AssertionError, 'link_name \\(custom\\) should end with "\\.link"' + ): self.workspace.link_result(result_path, custom_link_name=f"custom") def test_link_result_valid_custom_link_name(self) -> None: @@ -214,11 +225,26 @@ def test_link_result_from_another_run_raises_error(self) -> None: self.init_workspace_helper() result_path = self.make_result_helper() self.init_workspace_helper() - with self.assertRaisesRegex(AssertionError, "The result must have been generated in \*this\* run\_\*/ dir"): + assert self.workspace is not None and self.expected_structure is not None + with self.assertRaisesRegex( + AssertionError, + "The result must have been generated in \*this\* run\_\*/ dir", + ): + self.workspace.link_result(result_path) + + def test_link_result_from_external_dir_raises_error(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_file_helper("external/result.txt") + with self.assertRaisesRegex( + AssertionError, + "The result must have been generated in \*this\* run\_\*/ dir", + ): self.workspace.link_result(result_path) - # TODO: test linking result from another run should raise - # TODO: test that it should link in the agent dir in the links + def test_link_result_cannot_link_symlink(self) -> None: + pass + # TODO: test linking a symlink or a non-fully-resolved path diff --git a/util/workspace.py b/util/workspace.py index 2582940c..c999f1f3 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -243,7 +243,9 @@ def link_result( assert is_fully_resolved( result_fordpath ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" - assert is_child_path(result_fordpath, self.dbgym_this_run_path), "The result must have been generated in *this* run_*/ dir" + assert is_child_path( + result_fordpath, self.dbgym_this_run_path + ), "The result must have been generated in *this* run_*/ dir" assert not os.path.islink(result_fordpath) if type(custom_link_name) is str: From f1e56fc51fdbaa694ae43ea29dc072767aa3063f Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 19:53:34 -0500 Subject: [PATCH 35/43] test_link_result_cannot_link_symlink --- util/tests/unittest_workspace.py | 52 ++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index c3b1a8b6..41cd0136 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -1,5 +1,6 @@ # TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. +import os import shutil import unittest from pathlib import Path @@ -30,10 +31,10 @@ def setUp(self) -> None: self.workspace: Optional[DBGymWorkspace] = None self.expected_structure: Optional[FilesystemStructure] = None - # def tearDown(self) -> None: - # # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). - # if self.scratchspace_path.exists(): - # shutil.rmtree(self.scratchspace_path) + def tearDown(self) -> None: + # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) # All these helper functions will perform an action, update the expected structure, and then verify the structure. # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). @@ -71,14 +72,28 @@ def init_workspace_helper(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - def make_file_helper(self, relative_path: str) -> Path: + def make_file_helper( + self, relative_path: str, file_obj: tuple[str, ...] = ("file",) + ) -> Path: + """ + You can override file_obj to make it a symlink instead. + """ assert self.workspace is not None and self.expected_structure is not None assert ( ".." not in relative_path ), 'relative_path should not contain ".." (it should be inside the scratchspace dir)' file_path = self.scratchspace_path / relative_path file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.touch() + + if file_obj[0] == "file": + assert len(file_obj) == 1 + file_path.touch() + elif file_obj[0] == "symlink": + assert len(file_obj) == 2 + target_path = self.scratchspace_path / file_obj[1] + os.symlink(target_path, file_path) + else: + assert False, f"Unsupported file_obj: {file_obj}" # Build up the nested dict structure for the expected path current_dict = self.expected_structure @@ -87,20 +102,23 @@ def make_file_helper(self, relative_path: str) -> Path: if part not in current_dict: current_dict[part] = {} current_dict = current_dict[part] - current_dict[path_parts[-1]] = ("file",) + current_dict[path_parts[-1]] = file_obj self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) ) return file_path - def make_result_helper(self, relative_path: str = "result.txt") -> Path: + def make_result_helper( + self, relative_path: str = "result.txt", file_obj: tuple[str, ...] = ("file",) + ) -> Path: assert self.workspace is not None and self.expected_structure is not None assert ( ".." not in relative_path ), 'relative_path should not contain ".." (it should be inside the run_*/ dir)' return self.make_file_helper( - f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{relative_path}" + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{relative_path}", + file_obj=file_obj, ) def test_init_fields(self) -> None: @@ -243,7 +261,21 @@ def test_link_result_from_external_dir_raises_error(self) -> None: self.workspace.link_result(result_path) def test_link_result_cannot_link_symlink(self) -> None: - pass + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + symlink_path = self.make_result_helper( + "symlink.link", + file_obj=( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ), + ) + with self.assertRaisesRegex( + AssertionError, + "result_fordpath \(.*\) should be a fully resolved path", + ): + self.workspace.link_result(symlink_path) # TODO: test linking a symlink or a non-fully-resolved path From 8e5e5f23f9023ac72089c199d1f11d2227d070de Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:01:32 -0500 Subject: [PATCH 36/43] link result tests done --- util/tests/unittest_workspace.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 41cd0136..3f6fb004 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -277,8 +277,6 @@ def test_link_result_cannot_link_symlink(self) -> None: ): self.workspace.link_result(symlink_path) - # TODO: test linking a symlink or a non-fully-resolved path - if __name__ == "__main__": unittest.main() From 1a0d9474d93022dc70088d470310f39e7e8e4bb1 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:21:38 -0500 Subject: [PATCH 37/43] now passing test_save_file_dependency --- util/tests/unittest_workspace.py | 26 ++++++++++- util/workspace.py | 74 ++++++++++++++++++++++++++++---- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 3f6fb004..46834e15 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -157,7 +157,7 @@ def test_link_result_does_not_copy_directory_structure_to_symlinks_dir( self, ) -> None: """ - Unlike save_file, link_result does not copy the directory structure to the symlinks directory. + We always just want link_result to link to the base symlinks dir. """ self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None @@ -277,6 +277,30 @@ def test_link_result_cannot_link_symlink(self) -> None: ): self.workspace.link_result(symlink_path) + def test_save_file_dependency(self) -> None: + """ + See the comments in save_file() for what a "dependency" is. + """ + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result_path = self.make_result_helper() + self.init_workspace_helper() + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{result_path.name}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + # TODO: test save_file on a config + # TODO: test save_file on something generated this run + # TODO: test that save_file saves the whole directory + if __name__ == "__main__": unittest.main() diff --git a/util/workspace.py b/util/workspace.py index c999f1f3..073f650f 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -274,8 +274,67 @@ def link_result( return symlink_path - # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is - # explained further in the documentation. + def get_run_dpath_from_fpath(self, fpath: Path) -> Path: + run_dpath = fpath + while not parent_dpath_of_path(run_dpath).samefile(self.dbgym_runs_path): + run_dpath = parent_dpath_of_path(run_dpath) + return run_dpath + + # TODO(phw2): really look at the clean PR to see what it changed + # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save() + def save_file(self, fpath: Path) -> None: + """ + If an external function takes in a file/directory as input, you will not be able to call open_and_save(). + In these situations, just call save_file(). + Like open_and_save(), this function only works with real absolute paths. + "Saving" can mean either copying the file or creating a symlink to it + We copy the file if it is a "config", meaning it just exists without having been generated + We create a symlink if it is a "dependency", meaning a task.py command was run to generate it + In these cases we create a symlink so we have full provenance for how the dependency was created + """ + # validate fpath + assert isinstance(fpath, Path) + assert not os.path.islink(fpath), f"fpath ({fpath}) should not be a symlink" + assert os.path.exists(fpath), f"fpath ({fpath}) does not exist" + assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file" + assert not is_child_path( + fpath, self.dbgym_this_run_path + ), f"fpath ({fpath}) was generated in this task run ({self.dbgym_this_run_path}). You do not need to save it" + + # Save _something_ to dbgym_this_run_path. + # Save a symlink if the opened file was generated by a run. This is for two reasons: + # 1. Files or dirs generated by a run are supposed to be immutable so saving a symlink is safe. + # 2. Files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them. + if is_child_path(fpath, self.dbgym_runs_path): + # If the fpath file is directly in run_dpath, we symlink the file directly. + run_dpath = self.get_run_dpath_from_fpath(fpath) + parent_dpath = parent_dpath_of_path(fpath) + if parent_dpath.samefile(run_dpath): + fname = basename_of_path(fpath) + symlink_fpath = self.dbgym_this_run_path / (fname + ".link") + try_create_symlink(fpath, symlink_fpath) + # Otherwise, we know the fpath file is _not_ directly inside run_dpath dir. + # We go as far back as we can while still staying in run_dpath and symlink that "base" dir. + # This is because lots of runs create dirs within run_dpath and it creates too much clutter to symlink every individual file. + # Further, this avoids an edge case where you both save a file and the dir it's in. + else: + # Set base_dpath such that its parent is run_dpath. + base_dpath = parent_dpath + while not parent_dpath_of_path(base_dpath).samefile(run_dpath): + base_dpath = parent_dpath_of_path(base_dpath) + + # Create symlink + open_base_dname = basename_of_path(base_dpath) + symlink_dpath = self.dbgym_this_run_path / (open_base_dname + ".link") + try_create_symlink(base_dpath, symlink_dpath) + # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable. + else: + fname = basename_of_path(fpath) + # In this case, we want to copy instead of symlinking since it might disappear in the future. + copy_fpath = self.dbgym_this_run_path / fname + shutil.copy(fpath, copy_fpath) + + # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is explained further in the documentation. def append_group(self, name: str) -> None: self.cur_path_list.append(name) @@ -489,6 +548,7 @@ def open_and_save( This file does not work if open_fpath is a symlink, to make its interface identical to that of open(). Make sure to resolve all symlinks with fully_resolve_path(). To avoid confusion, I'm enforcing this function to only work with absolute paths. + # TODO: maybe make it work on non-fully-resolved paths to better match open() See the comment of save_file() for what "saving" means If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). This shouldn't be too hard to remember because this function crashes if open_fpath doesn't exist, @@ -553,8 +613,7 @@ def extract_from_task_run_fordpath( return codebase_dpath, codebase_dname, org_dpath, org_dname -# TODO(phw2): really look at the clean PR to see what it changed -# TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save() +# TODO(phw2): deprecate this once I'm done with unittest_workspace.py def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: """ If an external function takes in a file/directory as input, you will not be able to call open_and_save(). @@ -566,10 +625,8 @@ def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None: In these cases we create a symlink so we have full provenance for how the dependency was created """ # validate fpath - assert isinstance(fpath, Path) - assert not os.path.islink(fpath), f"fpath ({fpath}) should not be a symlink" - assert os.path.exists(fpath), f"fpath ({fpath}) does not exist" - assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file" + assert is_fully_resolved(fpath), f"fpath ({fpath}) should be a fully resolved path" + assert os.path.isfile(fpath), f"fpath ({fpath}) should be a file" assert not is_child_path( fpath, dbgym_workspace.dbgym_this_run_path ), f"fpath ({fpath}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it" @@ -703,6 +760,7 @@ def try_remove_file(path: Path) -> None: pass +# TODO: move this stuff to shell.py def restart_ray(redis_port: int) -> None: """ Stop and start Ray. From ddefc2b3943d6a91575553c4e82949a8b2bd539f Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:22:56 -0500 Subject: [PATCH 38/43] test_save_file_config --- util/tests/unittest_workspace.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 46834e15..1307f53d 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -297,6 +297,21 @@ def test_save_file_dependency(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) + def test_save_file_config(self) -> None: + """ + See the comments in save_file() for what a "config" is. + """ + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_file_helper("external/result.txt") + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{result_path.name}"] = ("file",) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + # TODO: test save_file on a config # TODO: test save_file on something generated this run # TODO: test that save_file saves the whole directory From f558a42607f143bcbcf756e0d88dfda7adef00a1 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:28:31 -0500 Subject: [PATCH 39/43] test_save_file_generated_this_run_raises_error --- util/tests/unittest_workspace.py | 34 +++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 1307f53d..1038727f 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -312,9 +312,37 @@ def test_save_file_config(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - # TODO: test save_file on a config - # TODO: test save_file on something generated this run - # TODO: test that save_file saves the whole directory + def test_save_file_dependency_inside_directory(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result_path = self.make_result_helper("dir1/dir2/result.txt") + self.make_result_helper("dir1/other1.txt") + self.make_result_helper("dir1/dir3/other2.txt") + self.init_workspace_helper() + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ]["dir1.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/dir1", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_save_file_generated_this_run_raises_error(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + with self.assertRaisesRegex( + AssertionError, + "fpath \(.*\) was generated in this task run \(.*\)\. You do not need to save it", + ): + self.workspace.save_file(result_path) + + # TODO: test saving the same config/dependency twice + # TODO: test saving different configs/dependencies with the same name if __name__ == "__main__": From f0e660f58cee3dbe0499b152d5df86750ca099a4 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:31:02 -0500 Subject: [PATCH 40/43] wrote save same file twice tests --- util/tests/unittest_workspace.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 1038727f..6d553ed6 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -297,6 +297,24 @@ def test_save_file_dependency(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) + def test_save_file_same_dependency_twice(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result_path = self.make_result_helper() + self.init_workspace_helper() + self.workspace.save_file(result_path) + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{result_path.name}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + def test_save_file_config(self) -> None: """ See the comments in save_file() for what a "config" is. @@ -312,6 +330,19 @@ def test_save_file_config(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) + def test_save_file_same_config_twice(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_file_helper("external/result.txt") + self.workspace.save_file(result_path) + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{result_path.name}"] = ("file",) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + def test_save_file_dependency_inside_directory(self) -> None: self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None From 7d9ee751e3928dd6e42d23ad35ed03727c496f99 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:33:15 -0500 Subject: [PATCH 41/43] can now write file contents when creating file --- util/tests/filesystem_unittest_util.py | 8 ++++++-- util/tests/unittest_workspace.py | 7 +++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/util/tests/filesystem_unittest_util.py b/util/tests/filesystem_unittest_util.py index 667bd24e..37997c71 100644 --- a/util/tests/filesystem_unittest_util.py +++ b/util/tests/filesystem_unittest_util.py @@ -25,8 +25,12 @@ def create_structure_internal( FilesystemStructure(cast(dict[str, Any], content)), ) elif isinstance(content, tuple) and content[0] == "file": - assert len(content) == 1 - full_path.touch() + full_path.parent.mkdir(parents=True, exist_ok=True) + if len(content) == 2: + full_path.write_text(content[1]) + else: + assert len(content) == 1 + full_path.touch() elif isinstance(content, tuple) and content[0] == "symlink": assert len(content) == 2 target_path = root_path / content[1] diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 6d553ed6..06cef5ec 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -86,7 +86,7 @@ def make_file_helper( file_path.parent.mkdir(parents=True, exist_ok=True) if file_obj[0] == "file": - assert len(file_obj) == 1 + assert len(file_obj) in [1, 2] file_path.touch() elif file_obj[0] == "symlink": assert len(file_obj) == 2 @@ -321,11 +321,11 @@ def test_save_file_config(self) -> None: """ self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None - result_path = self.make_file_helper("external/result.txt") + result_path = self.make_file_helper("external/result.txt", file_obj=("file", "contents")) self.workspace.save_file(result_path) self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name - ][f"{result_path.name}"] = ("file",) + ][f"{result_path.name}"] = ("file", "contents") self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) ) @@ -372,7 +372,6 @@ def test_save_file_generated_this_run_raises_error(self) -> None: ): self.workspace.save_file(result_path) - # TODO: test saving the same config/dependency twice # TODO: test saving different configs/dependencies with the same name From 8fa8ac057353acdd446a4d483f058ccd60f44cef Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Fri, 27 Dec 2024 20:33:58 -0500 Subject: [PATCH 42/43] fmt and mypy --- util/tests/unittest_workspace.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index 06cef5ec..b255793c 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -315,13 +315,20 @@ def test_save_file_same_dependency_twice(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) + def test_save_file_two_different_dependencies_with_same_name(self) -> None: + # TODO + # TODO: also do the config version + pass + def test_save_file_config(self) -> None: """ See the comments in save_file() for what a "config" is. """ self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None - result_path = self.make_file_helper("external/result.txt", file_obj=("file", "contents")) + result_path = self.make_file_helper( + "external/result.txt", file_obj=("file", "contents") + ) self.workspace.save_file(result_path) self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name From 36d1c68c89d6f3cfab7df3a1dab88c99db6e3f67 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 28 Dec 2024 19:10:13 -0500 Subject: [PATCH 43/43] done with two file save tests and code --- util/tests/unittest_workspace.py | 93 ++++++++++++++++++++++++++++---- util/workspace.py | 16 +++--- 2 files changed, 94 insertions(+), 15 deletions(-) diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py index b255793c..89a1c6c1 100644 --- a/util/tests/unittest_workspace.py +++ b/util/tests/unittest_workspace.py @@ -301,7 +301,7 @@ def test_save_file_same_dependency_twice(self) -> None: self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None prev_run_name = self.workspace.dbgym_this_run_path.name - result_path = self.make_result_helper() + result_path = self.make_result_helper(file_obj=("file",)) self.init_workspace_helper() self.workspace.save_file(result_path) self.workspace.save_file(result_path) @@ -315,10 +315,64 @@ def test_save_file_same_dependency_twice(self) -> None: verify_structure(self.scratchspace_path, self.expected_structure) ) - def test_save_file_two_different_dependencies_with_same_name(self) -> None: - # TODO - # TODO: also do the config version - pass + def test_save_file_two_different_dependencies_with_same_filename_both_directly_inside_run( + self, + ) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_names = [] + prev_run_names.append(self.workspace.dbgym_this_run_path.name) + result1_path = self.make_result_helper(file_obj=("file",)) + self.init_workspace_helper() + prev_run_names.append(self.workspace.dbgym_this_run_path.name) + result2_path = self.make_result_helper(file_obj=("file",)) + filename = result1_path.name + assert filename == result2_path.name + + self.init_workspace_helper() + self.workspace.save_file(result1_path) + self.workspace.save_file(result2_path) + # The second save_file() should have overwritten the first one. + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{filename}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_names[-1]}/{filename}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_save_file_two_different_dependencies_with_same_filename_but_different_outermost_dirs( + self, + ) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result1_path = self.make_result_helper("dir1/result.txt", file_obj=("file",)) + result2_path = self.make_result_helper("result.txt", file_obj=("file",)) + filename = result1_path.name + assert filename == result2_path.name + + self.init_workspace_helper() + self.workspace.save_file(result1_path) + self.workspace.save_file(result2_path) + # The second save_file() should not overwrite the first one because the outermost dirs are different. + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{filename}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/{filename}", + ) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ]["dir1.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/dir1", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) def test_save_file_config(self) -> None: """ @@ -340,12 +394,35 @@ def test_save_file_config(self) -> None: def test_save_file_same_config_twice(self) -> None: self.init_workspace_helper() assert self.workspace is not None and self.expected_structure is not None - result_path = self.make_file_helper("external/result.txt") + result_path = self.make_file_helper( + "external/result.txt", file_obj=("file", "contents") + ) self.workspace.save_file(result_path) self.workspace.save_file(result_path) self.expected_structure["dbgym_workspace"]["task_runs"][ self.workspace.dbgym_this_run_path.name - ][f"{result_path.name}"] = ("file",) + ][f"{result_path.name}"] = ("file", "contents") + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_save_file_two_different_configs_with_same_filename(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result1_path = self.make_file_helper( + "external/result.txt", file_obj=("file", "contents1") + ) + result2_path = self.make_file_helper( + "external/dir1/result.txt", file_obj=("file", "contents2") + ) + filename = result1_path.name + assert filename == result2_path.name + + self.workspace.save_file(result1_path) + self.workspace.save_file(result2_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{filename}"] = ("file", "contents2") self.assertTrue( verify_structure(self.scratchspace_path, self.expected_structure) ) @@ -379,8 +456,6 @@ def test_save_file_generated_this_run_raises_error(self) -> None: ): self.workspace.save_file(result_path) - # TODO: test saving different configs/dependencies with the same name - if __name__ == "__main__": unittest.main() diff --git a/util/workspace.py b/util/workspace.py index 073f650f..11e0d620 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -291,6 +291,14 @@ def save_file(self, fpath: Path) -> None: We copy the file if it is a "config", meaning it just exists without having been generated We create a symlink if it is a "dependency", meaning a task.py command was run to generate it In these cases we create a symlink so we have full provenance for how the dependency was created + + **Notable Behavior** + - When you save a dependency, it actually creates a link to the outermost directory still inside run_*/. + - The second save will overwrite the first. + - If you save the same file twice in the same run, the second save will overwrite the first. + - If you save two configs with the same name, the second save will overwrite the first. + - If you save two dependencies with the same *outermost* directory, or two dependencies with the same filename + both directly inside run_*/, the second save will overwrite the first. """ # validate fpath assert isinstance(fpath, Path) @@ -312,6 +320,7 @@ def save_file(self, fpath: Path) -> None: if parent_dpath.samefile(run_dpath): fname = basename_of_path(fpath) symlink_fpath = self.dbgym_this_run_path / (fname + ".link") + try_remove_file(symlink_fpath) try_create_symlink(fpath, symlink_fpath) # Otherwise, we know the fpath file is _not_ directly inside run_dpath dir. # We go as far back as we can while still staying in run_dpath and symlink that "base" dir. @@ -326,6 +335,7 @@ def save_file(self, fpath: Path) -> None: # Create symlink open_base_dname = basename_of_path(base_dpath) symlink_dpath = self.dbgym_this_run_path / (open_base_dname + ".link") + try_remove_file(symlink_dpath) try_create_symlink(base_dpath, symlink_dpath) # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable. else: @@ -553,12 +563,6 @@ def open_and_save( If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). This shouldn't be too hard to remember because this function crashes if open_fpath doesn't exist, and when you write results you're usually opening open_fpaths which do not exist. - - **Notable Behavior** - - If you open the same "config" file twice in the same run, it'll only be saved the first time (even if the file has changed in between). - - "Dependency" files should be immutable so there's no problem here. - - If you open two "config" files of the same name but different paths, only the first open will be saved. - - Opening two "dependency" files of the same name but different paths will lead to two different "base dirs" being symlinked. """ # validate open_fpath assert isinstance(open_fpath, Path)