diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index ed0f0d3e..a8fec259 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -52,7 +52,7 @@ jobs: # Integration tests do require external systems to be running (most commonly a database instance). # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. env: - # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `./env/set_up_env_integtests.sh`. + # We set `INTENDED_DBDATA_HARDWARE` so that it's seen when `integtest_pg_conn.py` executes `_set_up_gymlib_integtest_workspace.sh`. INTENDED_DBDATA_HARDWARE: ssd run: | . "$HOME/.cargo/env" diff --git a/.gitignore b/.gitignore index 601b999d..eb20bf15 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,8 @@ __pycache__/ .conda/ .idea/ build/ -test_clean_scratchspace/ +*_scratchspace/ workspace/ default_*_benchbase_config_*.xml -*.egg-info/ \ No newline at end of file +*.egg-info/ +*.code-workspace \ No newline at end of file diff --git a/benchmark/cli.py b/benchmark/cli.py index 17ca442b..1cf63418 100644 --- a/benchmark/cli.py +++ b/benchmark/cli.py @@ -2,13 +2,13 @@ from benchmark.job.cli import job_group from benchmark.tpch.cli import tpch_group -from util.workspace import DBGymConfig +from util.workspace import DBGymWorkspace @click.group(name="benchmark") @click.pass_obj -def benchmark_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("benchmark") +def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("benchmark") benchmark_group.add_command(tpch_group) diff --git a/benchmark/job/cli.py b/benchmark/job/cli.py index d13b5ddf..84df02ef 100644 --- a/benchmark/job/cli.py +++ b/benchmark/job/cli.py @@ -7,7 +7,7 @@ from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import ( - DBGymConfig, + DBGymWorkspace, get_default_tables_dname, get_workload_name, is_fully_resolved, @@ -136,8 +136,8 @@ @click.group(name="job") @click.pass_obj -def job_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("job") +def job_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("job") @job_group.command(name="data") @@ -146,9 +146,9 @@ def job_group(dbgym_cfg: DBGymConfig) -> None: @click.pass_obj # The reason generate data is separate from create dbdata is because generate-data is generic # to all DBMSs while create dbdata is specific to a single DBMS. 
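[Reviewer note, not part of the diff] The dbgym_cfg → dbgym_workspace rename threads through every Click group in this PR because all commands share one object via Click's context. Below is a minimal sketch of that mechanism, under assumptions: the root `task` group and the stub workspace class are illustrative stand-ins, not the real `task.py` or `util.workspace.DBGymWorkspace`.

```python
# Sketch: how one shared object reaches nested Click groups/commands.
# The root group stores it on ctx.obj; @click.pass_obj hands that same
# object to each subcommand, which is why this PR only needs to change
# the type annotation and parameter name at each site.
import click


class DBGymWorkspace:  # hypothetical stub for util.workspace.DBGymWorkspace
    def __init__(self) -> None:
        self.groups: list[str] = []

    def append_group(self, name: str) -> None:
        self.groups.append(name)


@click.group()
@click.pass_context
def task(ctx: click.Context) -> None:
    ctx.obj = DBGymWorkspace()  # constructed once, shared by all subcommands


@task.group(name="benchmark")
@click.pass_obj
def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None:
    dbgym_workspace.append_group("benchmark")  # same pattern as in the diff


if __name__ == "__main__":
    task()
```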
-def job_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: +def job_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR - _download_job_data(dbgym_cfg) + _download_job_data(dbgym_workspace) @job_group.command(name="workload") @@ -160,25 +160,25 @@ def job_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) @click.pass_obj def job_workload( - dbgym_cfg: DBGymConfig, query_subset: str, scale_factor: float + dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float ) -> None: assert scale_factor == DEFAULT_SCALE_FACTOR - _download_job_queries(dbgym_cfg) - _generate_job_workload(dbgym_cfg, query_subset) + _download_job_queries(dbgym_workspace) + _generate_job_workload(dbgym_workspace, query_subset) -def _download_job_data(dbgym_cfg: DBGymConfig) -> None: +def _download_job_data(dbgym_workspace: DBGymWorkspace) -> None: _download_and_untar_dir( - dbgym_cfg, + dbgym_workspace, JOB_TABLES_URL, "imdb.tgz", get_default_tables_dname(DEFAULT_SCALE_FACTOR), ) -def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: +def _download_job_queries(dbgym_workspace: DBGymWorkspace) -> None: _download_and_untar_dir( - dbgym_cfg, + dbgym_workspace, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME, @@ -187,7 +187,7 @@ def _download_job_queries(dbgym_cfg: DBGymConfig) -> None: def _download_and_untar_dir( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, download_url: str, download_tarred_fname: str, untarred_dname: str, @@ -200,7 +200,7 @@ def _download_and_untar_dir( `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`. """ expected_symlink_dpath = ( - dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" + dbgym_workspace.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link" ) if expected_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -209,9 +209,9 @@ def _download_and_untar_dir( return logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}") - real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) + real_data_path = dbgym_workspace.cur_task_runs_data_path(mkdir=True) subprocess_run(f"curl -O {download_url}", cwd=real_data_path) - untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname) + untarred_data_dpath = dbgym_workspace.cur_task_runs_data_path(untarred_dname) if untarred_original_dname is not None: assert not untarred_data_dpath.exists() @@ -226,24 +226,24 @@ def _download_and_untar_dir( assert untarred_data_dpath.exists() subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path) - symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath) + symlink_dpath = link_result(dbgym_workspace, untarred_data_dpath) assert expected_symlink_dpath.samefile(symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloaded: {expected_symlink_dpath}") def _generate_job_workload( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, query_subset: str, ) -> None: workload_name = get_workload_name(DEFAULT_SCALE_FACTOR, query_subset) - expected_workload_symlink_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True) / ( - workload_name + ".link" - ) + expected_workload_symlink_dpath = dbgym_workspace.cur_symlinks_data_path( + mkdir=True + ) / (workload_name + ".link") logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_dpath}" ) - real_dpath = 
dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True) + real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) query_names = None if query_subset == "all": @@ -258,7 +258,7 @@ def _generate_job_workload( with open(real_dpath / "order.txt", "w") as f: for qname in query_names: sql_fpath = ( - dbgym_cfg.cur_symlinks_data_path(mkdir=True) + dbgym_workspace.cur_symlinks_data_path(mkdir=True) / (f"{JOB_QUERIES_DNAME}.link") ).resolve() / f"{qname}.sql" assert is_fully_resolved( @@ -266,7 +266,7 @@ def _generate_job_workload( ), "We should only write existent real absolute paths to a file" f.write(f"Q{qname},{sql_fpath}\n") - workload_symlink_dpath = link_result(dbgym_cfg, real_dpath) + workload_symlink_dpath = link_result(dbgym_workspace, real_dpath) assert workload_symlink_dpath == expected_workload_symlink_dpath logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_workload_symlink_dpath}" diff --git a/benchmark/job/load_info.py b/benchmark/job/load_info.py index 4df370b6..e205847d 100644 --- a/benchmark/job/load_info.py +++ b/benchmark/job/load_info.py @@ -3,7 +3,7 @@ from benchmark.constants import DEFAULT_SCALE_FACTOR from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymConfig, get_default_tables_dname, is_fully_resolved +from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved JOB_SCHEMA_FNAME = "job_schema.sql" @@ -35,9 +35,9 @@ class JobLoadInfo(LoadInfoBaseClass): "title", ] - def __init__(self, dbgym_cfg: DBGymConfig): + def __init__(self, dbgym_workspace: DBGymWorkspace): # schema and constraints - schema_root_dpath = dbgym_cfg.dbgym_repo_path + schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath for component in JobLoadInfo.CODEBASE_PATH_COMPONENTS[ 1: ]: # [1:] to skip "dbgym" @@ -49,7 +49,7 @@ def __init__(self, dbgym_cfg: DBGymConfig): # Tables data_root_dpath = ( - dbgym_cfg.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data" + dbgym_workspace.dbgym_symlinks_path / JobLoadInfo.CODEBASE_DNAME / "data" ) tables_symlink_dpath = ( data_root_dpath / f"{get_default_tables_dname(DEFAULT_SCALE_FACTOR)}.link" diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index 1a3314bb..eea9dc5c 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -8,7 +8,7 @@ from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run from util.workspace import ( - DBGymConfig, + DBGymWorkspace, get_default_tables_dname, get_scale_factor_string, get_workload_name, @@ -19,8 +19,8 @@ @click.group(name="tpch") @click.pass_obj -def tpch_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("tpch") +def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("tpch") @tpch_group.command(name="data") @@ -28,9 +28,9 @@ def tpch_group(dbgym_cfg: DBGymConfig) -> None: @click.pass_obj # The reason generate data is separate from create dbdata is because generate-data is generic # to all DBMSs while create dbdata is specific to a single DBMS. 
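[Reviewer note, not part of the diff] The download/generate helpers above all follow the same produce-then-link convention: do all the work under a real task_runs/ directory first, and create the ".link" symlink only once the artifact is complete, so a link never points at a half-built result. A condensed sketch, assuming a simplified `link_result` (the real one in util.workspace takes the workspace object instead of a directory):

```python
# Sketch of the produce-then-link convention used throughout this diff.
from pathlib import Path


def link_result(symlinks_dir: Path, real_path: Path) -> Path:
    """Simplified stand-in for util.workspace.link_result."""
    symlinks_dir.mkdir(parents=True, exist_ok=True)
    link = symlinks_dir / f"{real_path.name}.link"  # ".link" suffix, as above
    if link.is_symlink():
        link.unlink()  # replace any stale link from a previous run
    link.symlink_to(real_path.resolve())
    return link


def generate_workload(task_runs_dir: Path, symlinks_dir: Path) -> Path:
    real_dpath = task_runs_dir / "workload_demo"  # hypothetical artifact name
    real_dpath.mkdir(parents=True, exist_ok=True)
    (real_dpath / "order.txt").write_text("Q1a,/abs/path/to/1a.sql\n")  # do the work...
    return link_result(symlinks_dir, real_dpath)  # ...and link only at the end
```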
-def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: - _clone_tpch_kit(dbgym_cfg) - _generate_data(dbgym_cfg, scale_factor) +def tpch_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + _clone_tpch_kit(dbgym_workspace) + _generate_data(dbgym_workspace, scale_factor) @tpch_group.command(name="workload") @@ -54,7 +54,7 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) @click.pass_obj def tpch_workload( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, query_subset: str, @@ -63,18 +63,20 @@ def tpch_workload( assert ( seed_start <= seed_end ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" - _clone_tpch_kit(dbgym_cfg) - _generate_tpch_queries(dbgym_cfg, seed_start, seed_end, scale_factor) - _generate_tpch_workload(dbgym_cfg, seed_start, seed_end, query_subset, scale_factor) + _clone_tpch_kit(dbgym_workspace) + _generate_tpch_queries(dbgym_workspace, seed_start, seed_end, scale_factor) + _generate_tpch_workload( + dbgym_workspace, seed_start, seed_end, query_subset, scale_factor + ) def _get_queries_dname(seed: int, scale_factor: float) -> str: return f"queries_{seed}_sf{get_scale_factor_string(scale_factor)}" -def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None: +def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: expected_symlink_dpath = ( - dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" + dbgym_workspace.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" ) if expected_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( @@ -83,26 +85,28 @@ def _clone_tpch_kit(dbgym_cfg: DBGymConfig) -> None: return logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloning: {expected_symlink_dpath}") - real_build_path = dbgym_cfg.cur_task_runs_build_path() + real_build_path = dbgym_workspace.cur_task_runs_build_path() subprocess_run( - f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_cfg.cur_source_path() + f"./clone_tpch_kit.sh {real_build_path}", cwd=dbgym_workspace.cur_source_path() ) - symlink_dpath = link_result(dbgym_cfg, real_build_path / "tpch-kit") + symlink_dpath = link_result(dbgym_workspace, real_build_path / "tpch-kit") assert expected_symlink_dpath.samefile(symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info(f"Cloned: {expected_symlink_dpath}") -def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path: - tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve() +def _get_tpch_kit_dpath(dbgym_workspace: DBGymWorkspace) -> Path: + tpch_kit_dpath = ( + dbgym_workspace.cur_symlinks_build_path() / "tpch-kit.link" + ).resolve() assert is_fully_resolved(tpch_kit_dpath) return tpch_kit_dpath def _generate_tpch_queries( - dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float + dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float ) -> None: - tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg) - data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True) + tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace) + data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating queries: {data_path} [{seed_start}, {seed_end}]" ) @@ -113,7 +117,7 @@ def _generate_tpch_queries( if expected_queries_symlink_dpath.exists(): continue - real_dir = dbgym_cfg.cur_task_runs_data_path( + real_dir = dbgym_workspace.cur_task_runs_data_path( _get_queries_dname(seed, 
scale_factor), mkdir=True ) for i in range(1, NUM_TPCH_QUERIES + 1): @@ -123,16 +127,16 @@ def _generate_tpch_queries( cwd=tpch_kit_dpath / "dbgen", verbose=False, ) - queries_symlink_dpath = link_result(dbgym_cfg, real_dir) + queries_symlink_dpath = link_result(dbgym_workspace, real_dir) assert queries_symlink_dpath.samefile(expected_queries_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated queries: {data_path} [{seed_start}, {seed_end}]" ) -def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: - tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg) - data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True) +def _generate_data(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: + tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_workspace) + data_path = dbgym_workspace.cur_symlinks_data_path(mkdir=True) expected_tables_symlink_dpath = ( data_path / f"{get_default_tables_dname(scale_factor)}.link" ) @@ -146,12 +150,12 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: f"Generating: {expected_tables_symlink_dpath}" ) subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen") - real_dir = dbgym_cfg.cur_task_runs_data_path( + real_dir = dbgym_workspace.cur_task_runs_data_path( get_default_tables_dname(scale_factor), mkdir=True ) subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen") - tables_symlink_dpath = link_result(dbgym_cfg, real_dir) + tables_symlink_dpath = link_result(dbgym_workspace, real_dir) assert tables_symlink_dpath.samefile(expected_tables_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_tables_symlink_dpath}" @@ -159,13 +163,13 @@ def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None: def _generate_tpch_workload( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, query_subset: str, scale_factor: float, ) -> None: - symlink_data_dpath = dbgym_cfg.cur_symlinks_data_path(mkdir=True) + symlink_data_dpath = dbgym_workspace.cur_symlinks_data_path(mkdir=True) workload_name = get_workload_name( scale_factor, f"{seed_start}_{seed_end}_{query_subset}" ) @@ -174,7 +178,7 @@ def _generate_tpch_workload( logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generating: {expected_workload_symlink_dpath}" ) - real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True) + real_dpath = dbgym_workspace.cur_task_runs_data_path(workload_name, mkdir=True) query_names = None if query_subset == "all": @@ -199,7 +203,7 @@ def _generate_tpch_workload( f.write(f"S{seed}-Q{qname},{sql_fpath}\n") # TODO(WAN): add option to deep-copy the workload. 
- workload_symlink_dpath = link_result(dbgym_cfg, real_dpath) + workload_symlink_dpath = link_result(dbgym_workspace, real_dpath) assert workload_symlink_dpath == expected_workload_symlink_dpath logging.getLogger(DBGYM_LOGGER_NAME).info( f"Generated: {expected_workload_symlink_dpath}" diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index fb660906..f9710b04 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -2,7 +2,7 @@ from typing import Optional from dbms.load_info_base_class import LoadInfoBaseClass -from util.workspace import DBGymConfig, get_default_tables_dname, is_fully_resolved +from util.workspace import DBGymWorkspace, get_default_tables_dname, is_fully_resolved TPCH_SCHEMA_FNAME = "tpch_schema.sql" TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" @@ -24,9 +24,9 @@ class TpchLoadInfo(LoadInfoBaseClass): "lineitem", ] - def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float): + def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): # schema and constraints - schema_root_dpath = dbgym_cfg.dbgym_repo_path + schema_root_dpath = dbgym_workspace.base_dbgym_repo_dpath for component in TpchLoadInfo.CODEBASE_PATH_COMPONENTS[ 1: ]: # [1:] to skip "dbgym" @@ -42,7 +42,7 @@ def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float): # tables data_root_dpath = ( - dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" + dbgym_workspace.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" ) tables_symlink_dpath = ( data_root_dpath / f"{get_default_tables_dname(scale_factor)}.link" diff --git a/dbms/cli.py b/dbms/cli.py index b3984637..6d97c5e2 100644 --- a/dbms/cli.py +++ b/dbms/cli.py @@ -1,13 +1,13 @@ import click from dbms.postgres.cli import postgres_group -from util.workspace import DBGymConfig +from util.workspace import DBGymWorkspace @click.group(name="dbms") @click.pass_obj -def dbms_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("dbms") +def dbms_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("dbms") dbms_group.add_command(postgres_group) diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 8d34e366..393689a1 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -31,7 +31,7 @@ from util.shell import subprocess_run from util.workspace import ( WORKSPACE_PATH_PLACEHOLDER, - DBGymConfig, + DBGymWorkspace, fully_resolve_path, get_dbdata_tgz_name, get_default_dbdata_parent_dpath, @@ -46,8 +46,8 @@ @click.group(name="postgres") @click.pass_obj -def postgres_group(dbgym_cfg: DBGymConfig) -> None: - dbgym_cfg.append_group("postgres") +def postgres_group(dbgym_workspace: DBGymWorkspace) -> None: + dbgym_workspace.append_group("postgres") @postgres_group.command( @@ -60,8 +60,8 @@ def postgres_group(dbgym_cfg: DBGymConfig) -> None: is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.", ) -def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: - _build_repo(dbgym_cfg, rebuild) +def postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: + _build_repo(dbgym_workspace, rebuild) @postgres_group.command( @@ -90,7 +90,7 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: help=f"The path to the parent directory of the dbdata which will be actively tuned. 
The default is {get_default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", ) def postgres_dbdata( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float, pgbin_path: Optional[Path], @@ -99,15 +99,15 @@ def postgres_dbdata( ) -> None: # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path is None: - pgbin_path = get_default_pgbin_path(dbgym_cfg.dbgym_workspace_path) + pgbin_path = get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path) if dbdata_parent_dpath is None: dbdata_parent_dpath = get_default_dbdata_parent_dpath( - dbgym_cfg.dbgym_workspace_path + dbgym_workspace.dbgym_workspace_path ) # Fully resolve all input paths. - pgbin_path = fully_resolve_path(dbgym_cfg, pgbin_path) - dbdata_parent_dpath = fully_resolve_path(dbgym_cfg, dbdata_parent_dpath) + pgbin_path = fully_resolve_path(pgbin_path) + dbdata_parent_dpath = fully_resolve_path(dbdata_parent_dpath) # Check assertions on args if intended_dbdata_hardware == "hdd": @@ -123,22 +123,22 @@ def postgres_dbdata( # Create dbdata _create_dbdata( - dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath + dbgym_workspace, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath ) -def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path: - return dbgym_cfg.cur_symlinks_build_path( +def _get_pgbin_symlink_path(dbgym_workspace: DBGymWorkspace) -> Path: + return dbgym_workspace.cur_symlinks_build_path( "repo.link", "boot", "build", "postgres", "bin" ) -def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path: - return dbgym_cfg.cur_symlinks_build_path("repo.link") +def _get_repo_symlink_path(dbgym_workspace: DBGymWorkspace) -> Path: + return dbgym_workspace.cur_symlinks_build_path("repo.link") -def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: - expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg) +def _build_repo(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: + expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_workspace) if not rebuild and expected_repo_symlink_dpath.exists(): logging.getLogger(DBGYM_LOGGER_NAME).info( f"Skipping _build_repo: {expected_repo_symlink_dpath}" @@ -148,13 +148,13 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: logging.getLogger(DBGYM_LOGGER_NAME).info( f"Setting up repo in {expected_repo_symlink_dpath}" ) - repo_real_dpath = dbgym_cfg.cur_task_runs_build_path("repo", mkdir=True) + repo_real_dpath = dbgym_workspace.cur_task_runs_build_path("repo", mkdir=True) subprocess_run( - f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_cfg.cur_source_path() + f"./build_repo.sh {repo_real_dpath}", cwd=dbgym_workspace.cur_source_path() ) # only link at the end so that the link only ever points to a complete repo - repo_symlink_dpath = link_result(dbgym_cfg, repo_real_dpath) + repo_symlink_dpath = link_result(dbgym_workspace, repo_real_dpath) assert expected_repo_symlink_dpath.samefile(repo_symlink_dpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Set up repo in {expected_repo_symlink_dpath}" @@ -162,7 +162,7 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild: bool) -> None: def _create_dbdata( - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float, pgbin_path: Path, @@ -184,23 +184,23 @@ def _create_dbdata( # Call initdb. # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run. 
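[Reviewer note, not part of the diff] `fully_resolve_path()` drops its config/workspace parameter in this PR, since resolving a path needs no workspace state. A hedged sketch of the contract, inferred from the assertion message used with `is_fully_resolved()` elsewhere in this diff ("We should only write existent real absolute paths to a file"); the real implementations live in util.workspace and may differ:

```python
# Assumed contract: a "fully resolved" path is absolute, exists, and
# contains no symlink components.
from pathlib import Path


def fully_resolve_path(path: Path) -> Path:
    resolved = path.expanduser().resolve()  # absolute, symlinks resolved
    assert resolved.exists(), f"{resolved} does not exist"
    return resolved


def is_fully_resolved(path: Path) -> bool:
    return path.is_absolute() and path.exists() and path == path.resolve()
```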
- save_file(dbgym_cfg, pgbin_path / "initdb") + save_file(dbgym_workspace, pgbin_path / "initdb") subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path) # Start Postgres (all other dbdata setup requires postgres to be started). # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead. - start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) + start_postgres(dbgym_workspace, pgbin_path, dbdata_dpath) # Set up Postgres. - _generic_dbdata_setup(dbgym_cfg) - _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor) + _generic_dbdata_setup(dbgym_workspace) + _load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor) # Stop Postgres so that we don't "leak" processes. - stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) + stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath) # Create .tgz file. # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir. - dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path( + dbdata_tgz_real_fpath = dbgym_workspace.cur_task_runs_data_path( mkdir=True ) / get_dbdata_tgz_name(benchmark_name, scale_factor) # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath. @@ -208,22 +208,22 @@ def _create_dbdata( # Create symlink. # Only link at the end so that the link only ever points to a complete dbdata. - dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath) + dbdata_tgz_symlink_path = link_result(dbgym_workspace, dbdata_tgz_real_fpath) logging.getLogger(DBGYM_LOGGER_NAME).info( f"Created dbdata in {dbdata_tgz_symlink_path}" ) -def _generic_dbdata_setup(dbgym_cfg: DBGymConfig) -> None: +def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: # get necessary vars - pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve() + pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_workspace).resolve() assert pgbin_real_dpath.exists() dbgym_pguser = DBGYM_POSTGRES_USER dbgym_pgpass = DBGYM_POSTGRES_PASS pgport = DEFAULT_POSTGRES_PORT # Create user - save_file(dbgym_cfg, pgbin_real_dpath / "psql") + save_file(dbgym_workspace, pgbin_real_dpath / "psql") subprocess_run( f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", cwd=pgbin_real_dpath, @@ -251,34 +251,36 @@ def _generic_dbdata_setup(dbgym_cfg: DBGymConfig) -> None: def _load_benchmark_into_dbdata( - dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float + dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float ) -> None: load_info: LoadInfoBaseClass with create_sqlalchemy_conn() as conn: if benchmark_name == "tpch": - load_info = TpchLoadInfo(dbgym_cfg, scale_factor) + load_info = TpchLoadInfo(dbgym_workspace, scale_factor) elif benchmark_name == "job": - load_info = JobLoadInfo(dbgym_cfg) + load_info = JobLoadInfo(dbgym_workspace) else: raise AssertionError( f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented" ) - _load_into_dbdata(dbgym_cfg, conn, load_info) + _load_into_dbdata(dbgym_workspace, conn, load_info) def _load_into_dbdata( - dbgym_cfg: DBGymConfig, conn: sqlalchemy.Connection, load_info: LoadInfoBaseClass + dbgym_workspace: DBGymWorkspace, + conn: sqlalchemy.Connection, + load_info: LoadInfoBaseClass, ) -> None: - sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath()) + 
sql_file_execute(dbgym_workspace, conn, load_info.get_schema_fpath()) # Truncate all tables first before even loading a single one. for table, _ in load_info.get_tables_and_fpaths(): sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") # Then, load the tables. for table, table_fpath in load_info.get_tables_and_fpaths(): - with open_and_save(dbgym_cfg, table_fpath, "r") as table_csv: + with open_and_save(dbgym_workspace, table_fpath, "r") as table_csv: assert conn.connection.dbapi_connection is not None cur = conn.connection.dbapi_connection.cursor() try: @@ -292,7 +294,7 @@ def _load_into_dbdata( constraints_fpath = load_info.get_constraints_fpath() if constraints_fpath is not None: - sql_file_execute(dbgym_cfg, conn, constraints_fpath) + sql_file_execute(dbgym_workspace, conn, constraints_fpath) # The start and stop functions slightly duplicate functionality from pg_conn.py. However, I chose to do it this way @@ -301,23 +303,28 @@ def _load_into_dbdata( # even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the # tuning environment. def start_postgres( - dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path ) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True) + _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, True) -def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False) +def stop_postgres( + dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path +) -> None: + _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, False) def _start_or_stop_postgres( - dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool + dbgym_workspace: DBGymWorkspace, + pgbin_path: Path, + dbdata_dpath: Path, + is_start: bool, ) -> None: # They should be absolute paths and should exist assert is_fully_resolved(pgbin_path) assert is_fully_resolved(dbdata_dpath) pgport = DEFAULT_POSTGRES_PORT - save_file(dbgym_cfg, pgbin_path / "pg_ctl") + save_file(dbgym_workspace, pgbin_path / "pg_ctl") if is_start: # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". diff --git a/env/env_integtests_dbgym_config.yaml b/env/env_integtests_dbgym_config.yaml deleted file mode 100644 index 0de54f97..00000000 --- a/env/env_integtests_dbgym_config.yaml +++ /dev/null @@ -1 +0,0 @@ -dbgym_workspace_path: ../dbgym_env_integtest_workspace/ diff --git a/env/integtest_util.py b/env/integtest_util.py deleted file mode 100644 index c996d83f..00000000 --- a/env/integtest_util.py +++ /dev/null @@ -1,89 +0,0 @@ -import subprocess -from pathlib import Path -from typing import Any, Optional - -import yaml - -from env.tuning_artifacts import DBMSConfigDelta, TuningMetadata -from util.workspace import ( - DBGymConfig, - fully_resolve_path, - get_default_dbdata_parent_dpath, - get_default_pgbin_path, - get_default_pristine_dbdata_snapshot_path, - get_default_workload_name_suffix, - get_default_workload_path, - get_workload_name, -) - -# These are the values used by set_up_env_integtests.sh. -# TODO: make set_up_env_integtests.sh take in these values directly as envvars. -INTEGTEST_BENCHMARK = "tpch" -INTEGTEST_SCALE_FACTOR = 0.01 - - -class IntegtestWorkspace: - """ - This is essentially a singleton class. 
This avoids multiple integtest_*.py files creating - the workspace and/or the DBGymConfig redundantly. - """ - - ENV_INTEGTESTS_DBGYM_CONFIG_FPATH = Path("env/env_integtests_dbgym_config.yaml") - INTEGTEST_DBGYM_CFG: Optional[DBGymConfig] = None - - @staticmethod - def set_up_workspace() -> None: - # This if statement prevents us from setting up the workspace twice, which saves time. - if not IntegtestWorkspace.get_workspace_path().exists(): - subprocess.run(["./env/set_up_env_integtests.sh"], check=True) - - # Once we get here, we have an invariant that the workspace exists. We need this - # invariant to be true in order to create the DBGymConfig. - # - # However, it also can't be created more than once so we need to check `is None`. - if IntegtestWorkspace.INTEGTEST_DBGYM_CFG is None: - IntegtestWorkspace.INTEGTEST_DBGYM_CFG = DBGymConfig( - IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH - ) - - @staticmethod - def get_dbgym_cfg() -> DBGymConfig: - assert IntegtestWorkspace.INTEGTEST_DBGYM_CFG is not None - return IntegtestWorkspace.INTEGTEST_DBGYM_CFG - - @staticmethod - def get_workspace_path() -> Path: - with open(IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH) as f: - return Path(yaml.safe_load(f)["dbgym_workspace_path"]) - - @staticmethod - def get_default_metadata() -> TuningMetadata: - dbgym_cfg = IntegtestWorkspace.get_dbgym_cfg() - workspace_path = fully_resolve_path( - dbgym_cfg, IntegtestWorkspace.get_workspace_path() - ) - return TuningMetadata( - workload_path=fully_resolve_path( - dbgym_cfg, - get_default_workload_path( - workspace_path, - INTEGTEST_BENCHMARK, - get_workload_name( - INTEGTEST_SCALE_FACTOR, - get_default_workload_name_suffix(INTEGTEST_BENCHMARK), - ), - ), - ), - pristine_dbdata_snapshot_path=fully_resolve_path( - dbgym_cfg, - get_default_pristine_dbdata_snapshot_path( - workspace_path, INTEGTEST_BENCHMARK, INTEGTEST_SCALE_FACTOR - ), - ), - dbdata_parent_path=fully_resolve_path( - dbgym_cfg, get_default_dbdata_parent_dpath(workspace_path) - ), - pgbin_path=fully_resolve_path( - dbgym_cfg, get_default_pgbin_path(workspace_path) - ), - ) diff --git a/env/pg_conn.py b/env/pg_conn.py index d28cf1d4..17b36499 100644 --- a/env/pg_conn.py +++ b/env/pg_conn.py @@ -24,7 +24,7 @@ from util.log import DBGYM_LOGGER_NAME from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr -from util.workspace import DBGymConfig, open_and_save, parent_dpath_of_path +from util.workspace import DBGymWorkspace, open_and_save, parent_dpath_of_path CONNECT_TIMEOUT = 300 @@ -35,7 +35,7 @@ class PostgresConn: # are organized in the workspace. def __init__( self, - dbgym_cfg: DBGymConfig, + dbgym_workspace: DBGymWorkspace, pgport: int, pristine_dbdata_snapshot_fpath: Path, dbdata_parent_dpath: Path, @@ -44,7 +44,7 @@ def __init__( boot_config_fpath: Optional[Path], ) -> None: - self.dbgym_cfg = dbgym_cfg + self.dbgym_workspace = dbgym_workspace self.pgport = pgport self.pgbin_path = pgbin_path self.boot_config_fpath = boot_config_fpath @@ -59,7 +59,7 @@ def __init__( # state of the database as it is being tuned. It is generated while tuning and is # discarded once tuning is completed. self.checkpoint_dbdata_snapshot_fpath = ( - dbgym_cfg.dbgym_tmp_path / "checkpoint_dbdata.tgz" + dbgym_workspace.dbgym_tmp_path / "checkpoint_dbdata.tgz" ) # dbdata_parent_dpath is the parent directory of the dbdata that is *actively being tuned*. # It is *not* the parent directory of pristine_dbdata_snapshot_fpath. 
@@ -103,11 +103,11 @@ def disconnect(self) -> None: def move_log(self) -> None: pglog_fpath = ( - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log" ) pglog_this_step_fpath = ( - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log.{self.log_step}" ) if pglog_fpath.exists(): @@ -299,7 +299,7 @@ def restart_with_changes( "-l", # We log to pg{self.pgport}.log instead of pg.log so that different PostgresConn objects # don't all try to write to the same file. - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + self.dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log", "start", ].run(retcode=None) @@ -346,7 +346,7 @@ def restart_with_changes( # Set up Boot if we're told to do so if self.boot_config_fpath is not None: - with open_and_save(self.dbgym_cfg, self.boot_config_fpath) as f: + with open_and_save(self.dbgym_workspace, self.boot_config_fpath) as f: boot_config = yaml.safe_load(f) self._set_up_boot( diff --git a/env/replay.py b/env/replay.py index ea364c31..8dcfe8e4 100644 --- a/env/replay.py +++ b/env/replay.py @@ -5,11 +5,11 @@ from env.tuning_artifacts import TuningArtifactsReader from env.workload import Workload from util.pg import DEFAULT_POSTGRES_PORT -from util.workspace import DBGymConfig +from util.workspace import DBGymWorkspace def replay( - dbgym_cfg: DBGymConfig, tuning_artifacts_dpath: Path + dbgym_workspace: DBGymWorkspace, tuning_artifacts_dpath: Path ) -> list[tuple[float, int]]: """ Returns the total runtime and the number of timed out queries for each step. @@ -20,7 +20,7 @@ def replay( reader = TuningArtifactsReader(tuning_artifacts_dpath) pg_conn = PostgresConn( - dbgym_cfg, + dbgym_workspace, DEFAULT_POSTGRES_PORT, reader.get_metadata().pristine_dbdata_snapshot_path, reader.get_metadata().dbdata_parent_path, @@ -28,7 +28,7 @@ def replay( None, ) workload = Workload( - dbgym_cfg, + dbgym_workspace, reader.get_metadata().workload_path, ) diff --git a/env/tests/__init__.py b/env/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/env/set_up_env_integtests.sh b/env/tests/_set_up_gymlib_integtest_workspace.sh similarity index 65% rename from env/set_up_env_integtests.sh rename to env/tests/_set_up_gymlib_integtest_workspace.sh index a536a2ac..bd2130a7 100755 --- a/env/set_up_env_integtests.sh +++ b/env/tests/_set_up_gymlib_integtest_workspace.sh @@ -1,19 +1,20 @@ #!/bin/bash -# Environment tests relies on Postgres being built and workloads/dbdata being generated. This script does this. +# DO NOT RUN THIS SCRIPT DIRECTLY. +# This script only runs correctly when run by GymlibIntegtestManager.set_up_workspace() because that function sets the necessary envvars. +# By allowing GymlibIntegtestManager.set_up_workspace() to set the envvars, we ensure that the envvars are only defined +# in a single location (inside GymlibIntegtestManager). + +# Gymlib integration tests rely on Postgres being built and workloads/dbdata being generated. # Generating these things is not considered a part of the test which is why it's in its own shell script. # The reason there's a shell script generating them instead of them just being in the repo is because (a) # the Postgres repo is very large and (b) the built binary will be different for different machines. # This script should be run from the base dbgym/ directory.

-set -euxo pipefail +set -euo pipefail # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" -BENCHMARK=tpch -SCALE_FACTOR=0.01 -export DBGYM_CONFIG_PATH=env/env_integtests_dbgym_config.yaml # Note that this envvar needs to be exported. -WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //') python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR diff --git a/env/tests/gymlib_integtest_dbgym_config.yaml b/env/tests/gymlib_integtest_dbgym_config.yaml new file mode 100644 index 00000000..8b64e125 --- /dev/null +++ b/env/tests/gymlib_integtest_dbgym_config.yaml @@ -0,0 +1 @@ +dbgym_workspace_path: ../dbgym_gymlib_integtest_workspace/ diff --git a/env/tests/gymlib_integtest_util.py b/env/tests/gymlib_integtest_util.py new file mode 100644 index 00000000..86b3b829 --- /dev/null +++ b/env/tests/gymlib_integtest_util.py @@ -0,0 +1,100 @@ +import os +import subprocess +from pathlib import Path +from typing import Optional + +from env.tuning_artifacts import TuningMetadata +from util.workspace import ( + DBGymWorkspace, + fully_resolve_path, + get_default_dbdata_parent_dpath, + get_default_pgbin_path, + get_default_pristine_dbdata_snapshot_path, + get_default_workload_name_suffix, + get_default_workload_path, + get_workload_name, + get_workspace_path_from_config, +) + + +class GymlibIntegtestManager: + """ + This is essentially a singleton class. This avoids multiple integtest_*.py files creating + the workspace and/or the DBGymWorkspace object redundantly. + + The reason I put all these static methods in a class instead of directly in the module is + that the functions have very generic names (e.g. set_up_workspace()) but having them + inside a class makes it clear that they are related to the gymlib integration tests. + """ + + # These constants are also used by _set_up_gymlib_integtest_workspace.sh. + BENCHMARK = "tpch" + SCALE_FACTOR = 0.01 + DBGYM_CONFIG_PATH = Path("env/tests/gymlib_integtest_dbgym_config.yaml") + + # This is set at most once by set_up_workspace(). + DBGYM_WORKSPACE: Optional[DBGymWorkspace] = None + + @staticmethod + def set_up_workspace() -> None: + workspace_path = get_workspace_path_from_config( + GymlibIntegtestManager.DBGYM_CONFIG_PATH + ) + + # This if statement prevents us from setting up the workspace twice, which saves time. + if not workspace_path.exists(): + subprocess.run( + ["./env/tests/_set_up_gymlib_integtest_workspace.sh"], + env={ + "BENCHMARK": GymlibIntegtestManager.BENCHMARK, + "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), + # By setting this envvar, we ensure that when running _set_up_gymlib_integtest_workspace.sh, + # make_standard_dbgym_workspace() will use the correct DBGYM_CONFIG_PATH. + "DBGYM_CONFIG_PATH": str(GymlibIntegtestManager.DBGYM_CONFIG_PATH), + **os.environ, + }, + check=True, + ) + + # Once we get here, we have an invariant that the workspace exists. We need this + # invariant to be true in order to create the DBGymWorkspace. + # + # However, it also can't be created more than once so we need to check `is None`. 
+ if GymlibIntegtestManager.DBGYM_WORKSPACE is None: + GymlibIntegtestManager.DBGYM_WORKSPACE = DBGymWorkspace(workspace_path) + + @staticmethod + def get_dbgym_workspace() -> DBGymWorkspace: + assert GymlibIntegtestManager.DBGYM_WORKSPACE is not None + return GymlibIntegtestManager.DBGYM_WORKSPACE + + @staticmethod + def get_default_metadata() -> TuningMetadata: + dbgym_workspace = GymlibIntegtestManager.get_dbgym_workspace() + return TuningMetadata( + workload_path=fully_resolve_path( + get_default_workload_path( + dbgym_workspace.dbgym_workspace_path, + GymlibIntegtestManager.BENCHMARK, + get_workload_name( + GymlibIntegtestManager.SCALE_FACTOR, + get_default_workload_name_suffix( + GymlibIntegtestManager.BENCHMARK + ), + ), + ), + ), + pristine_dbdata_snapshot_path=fully_resolve_path( + get_default_pristine_dbdata_snapshot_path( + dbgym_workspace.dbgym_workspace_path, + GymlibIntegtestManager.BENCHMARK, + GymlibIntegtestManager.SCALE_FACTOR, + ), + ), + dbdata_parent_path=fully_resolve_path( + get_default_dbdata_parent_dpath(dbgym_workspace.dbgym_workspace_path), + ), + pgbin_path=fully_resolve_path( + get_default_pgbin_path(dbgym_workspace.dbgym_workspace_path), + ), + ) diff --git a/env/integtest_pg_conn.py b/env/tests/integtest_pg_conn.py similarity index 96% rename from env/integtest_pg_conn.py rename to env/tests/integtest_pg_conn.py index 3e6655de..a5e11f4d 100644 --- a/env/integtest_pg_conn.py +++ b/env/tests/integtest_pg_conn.py @@ -3,8 +3,8 @@ import psycopg -from env.integtest_util import IntegtestWorkspace from env.pg_conn import PostgresConn +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from util.pg import ( DEFAULT_POSTGRES_PORT, get_is_postgres_running, @@ -15,7 +15,7 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() def setUp(self) -> None: self.assertFalse( @@ -23,7 +23,7 @@ def setUp(self) -> None: "Make sure Postgres isn't running before starting the integration test. `pkill postgres` is one way " + "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.", ) - self.metadata = IntegtestWorkspace.get_default_metadata() + self.metadata = GymlibIntegtestManager.get_default_metadata() # The reason we restart Postgres every time is to ensure a "clean" starting point # so that all tests are independent of each other. 
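[Reviewer note, not part of the diff] One subtlety in the `env=` dict that `set_up_workspace()` builds above: with dict unpacking, later entries win on key collisions, so spreading `**os.environ` last preserves the caller's environment (e.g. the INTENDED_DBDATA_HARDWARE exported by CI in tests.yaml) and lets it take precedence over the defaults listed first. A self-contained illustration of that merge order:

```python
# Dict-unpacking precedence: os.environ is spread last, so any variable
# already exported by the caller overrides the defaults on conflict.
import os

defaults = {"BENCHMARK": "tpch", "SCALE_FACTOR": "0.01"}
merged = {**defaults, **os.environ}  # os.environ wins on overlapping keys
assert all(merged[k] == v for k, v in os.environ.items())
```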
@@ -38,7 +38,7 @@ def tearDown(self) -> None: def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: return PostgresConn( - IntegtestWorkspace.get_dbgym_cfg(), + GymlibIntegtestManager.get_dbgym_workspace(), pgport, self.metadata.pristine_dbdata_snapshot_path, self.metadata.dbdata_parent_path, diff --git a/env/integtest_replay.py b/env/tests/integtest_replay.py similarity index 83% rename from env/integtest_replay.py rename to env/tests/integtest_replay.py index 49af2909..9523fac7 100644 --- a/env/integtest_replay.py +++ b/env/tests/integtest_replay.py @@ -1,8 +1,8 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED -from env.integtest_util import IntegtestWorkspace from env.replay import replay +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, @@ -15,12 +15,12 @@ class ReplayTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() def test_replay(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) writer.write_step( DBMSConfigDelta( @@ -41,7 +41,8 @@ def test_replay(self) -> None: ) ) replay_data = replay( - IntegtestWorkspace.get_dbgym_cfg(), writer.tuning_artifacts_dpath + GymlibIntegtestManager.get_dbgym_workspace(), + writer.tuning_artifacts_dpath, ) # We do some very simple sanity checks here due to the inherent randomness of executing a workload. diff --git a/env/integtest_tuning_artifacts.py b/env/tests/integtest_tuning_artifacts.py similarity index 80% rename from env/integtest_tuning_artifacts.py rename to env/tests/integtest_tuning_artifacts.py index 1e8760f1..dfdc77c0 100644 --- a/env/integtest_tuning_artifacts.py +++ b/env/tests/integtest_tuning_artifacts.py @@ -1,6 +1,6 @@ import unittest -from env.integtest_util import IntegtestWorkspace +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.tuning_artifacts import ( DBMSConfigDelta, IndexesDelta, @@ -14,7 +14,7 @@ class PostgresConnTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() @staticmethod def make_config(letter: str) -> DBMSConfigDelta: @@ -26,8 +26,8 @@ def make_config(letter: str) -> DBMSConfigDelta: def test_get_delta_at_step(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) writer.write_step(PostgresConnTests.make_config("a")) @@ -51,8 +51,8 @@ def test_get_delta_at_step(self) -> None: def test_get_all_deltas_in_order(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) writer.write_step(PostgresConnTests.make_config("a")) @@ -72,12 +72,12 @@ def test_get_all_deltas_in_order(self) -> None: def test_get_metadata(self) -> None: writer = TuningArtifactsWriter( - IntegtestWorkspace.get_dbgym_cfg(), - IntegtestWorkspace.get_default_metadata(), + GymlibIntegtestManager.get_dbgym_workspace(), + GymlibIntegtestManager.get_default_metadata(), ) reader = 
TuningArtifactsReader(writer.tuning_artifacts_dpath) metadata = reader.get_metadata() - expected_metadata = IntegtestWorkspace.get_default_metadata() + expected_metadata = GymlibIntegtestManager.get_default_metadata() self.assertEqual(metadata, expected_metadata) diff --git a/env/integtest_workload.py b/env/tests/integtest_workload.py similarity index 65% rename from env/integtest_workload.py rename to env/tests/integtest_workload.py index b1f7c35d..ee6fc068 100644 --- a/env/integtest_workload.py +++ b/env/tests/integtest_workload.py @@ -1,11 +1,7 @@ import unittest from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES -from env.integtest_util import ( - INTEGTEST_BENCHMARK, - INTEGTEST_SCALE_FACTOR, - IntegtestWorkspace, -) +from env.tests.gymlib_integtest_util import GymlibIntegtestManager from env.workload import Workload from util.workspace import ( fully_resolve_path, @@ -18,22 +14,23 @@ class WorkloadTests(unittest.TestCase): @staticmethod def setUpClass() -> None: - IntegtestWorkspace.set_up_workspace() + GymlibIntegtestManager.set_up_workspace() def test_workload(self) -> None: workload_dpath = fully_resolve_path( - IntegtestWorkspace.get_dbgym_cfg(), get_default_workload_path( - IntegtestWorkspace.get_workspace_path(), - INTEGTEST_BENCHMARK, + GymlibIntegtestManager.get_dbgym_workspace().dbgym_workspace_path, + GymlibIntegtestManager.BENCHMARK, get_workload_name( - INTEGTEST_SCALE_FACTOR, - get_default_workload_name_suffix(INTEGTEST_BENCHMARK), + GymlibIntegtestManager.SCALE_FACTOR, + get_default_workload_name_suffix(GymlibIntegtestManager.BENCHMARK), ), ), ) - workload = Workload(IntegtestWorkspace.get_dbgym_cfg(), workload_dpath) + workload = Workload( + GymlibIntegtestManager.get_dbgym_workspace(), workload_dpath + ) # Check the order of query IDs. self.assertEqual( diff --git a/env/tuning_artifacts.py b/env/tuning_artifacts.py index cac3c4b8..a60d6bbe 100644 --- a/env/tuning_artifacts.py +++ b/env/tuning_artifacts.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any, NewType, TypedDict -from util.workspace import DBGymConfig, is_fully_resolved +from util.workspace import DBGymWorkspace, is_fully_resolved # PostgresConn doesn't use these types because PostgresConn is used internally by tuning agents # while these types are only used in the interface between the orchestrator and the tuning agents. 
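[Reviewer note, not part of the diff] A usage sketch of the writer/reader pair after this rename, mirroring the integration tests further down. It assumes an already-set-up gymlib integtest workspace, and `delta` stands in for a real DBMSConfigDelta whose construction is elided here:

```python
# Round trip: write tuning steps plus metadata, then read them back.
from env.tests.gymlib_integtest_util import GymlibIntegtestManager
from env.tuning_artifacts import TuningArtifactsReader, TuningArtifactsWriter

GymlibIntegtestManager.set_up_workspace()
writer = TuningArtifactsWriter(
    GymlibIntegtestManager.get_dbgym_workspace(),
    GymlibIntegtestManager.get_default_metadata(),
)
delta = ...  # a DBMSConfigDelta describing one tuning step (construction elided)
writer.write_step(delta)  # one write_step() call per tuning step
reader = TuningArtifactsReader(writer.tuning_artifacts_dpath)
assert reader.get_metadata() == GymlibIntegtestManager.get_default_metadata()
```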
@@ -77,9 +77,11 @@ def get_metadata_fpath(tuning_artifacts_dpath: Path) -> Path: class TuningArtifactsWriter: - def __init__(self, dbgym_cfg: DBGymConfig, metadata: TuningMetadata) -> None: - self.dbgym_cfg = dbgym_cfg - self.tuning_artifacts_dpath = self.dbgym_cfg.cur_task_runs_artifacts_path( + def __init__( + self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata + ) -> None: + self.dbgym_workspace = dbgym_workspace + self.tuning_artifacts_dpath = self.dbgym_workspace.cur_task_runs_artifacts_path( "tuning_artifacts", mkdir=True ) assert is_fully_resolved(self.tuning_artifacts_dpath) diff --git a/env/workload.py b/env/workload.py index 669594ae..89dde426 100644 --- a/env/workload.py +++ b/env/workload.py @@ -1,11 +1,11 @@ from pathlib import Path -from util.workspace import DBGymConfig, is_fully_resolved, open_and_save +from util.workspace import DBGymWorkspace, is_fully_resolved, open_and_save class Workload: - def __init__(self, dbgym_cfg: DBGymConfig, workload_dpath: Path) -> None: - self.dbgym_cfg = dbgym_cfg + def __init__(self, dbgym_workspace: DBGymWorkspace, workload_dpath: Path) -> None: + self.dbgym_workspace = dbgym_workspace self.workload_dpath = workload_dpath assert is_fully_resolved(self.workload_dpath) @@ -15,13 +15,13 @@ def __init__(self, dbgym_cfg: DBGymConfig, workload_dpath: Path) -> None: assert order_fpath.exists() - with open_and_save(self.dbgym_cfg, order_fpath) as f: + with open_and_save(self.dbgym_workspace, order_fpath) as f: for line in f: qid, qpath = line.strip().split(",") qpath = Path(qpath) assert is_fully_resolved(qpath) - with open_and_save(self.dbgym_cfg, qpath) as qf: + with open_and_save(self.dbgym_workspace, qpath) as qf: self.queries[qid] = qf.read() self.query_order.append(qid) diff --git a/manage/cli.py b/manage/cli.py index 123f50d4..5d785d9f 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -8,7 +8,7 @@ from util.log import DBGYM_LOGGER_NAME, DBGYM_OUTPUT_LOGGER_NAME from util.workspace import ( - DBGymConfig, + DBGymWorkspace, get_runs_path_from_workspace_path, get_symlinks_path_from_workspace_path, is_child_path, @@ -17,7 +17,7 @@ # This is used in test_clean.py. It's defined here to avoid a circular import. -class MockDBGymConfig: +class MockDBGymWorkspace: def __init__(self, scratchspace_path: Path): self.dbgym_workspace_path = scratchspace_path self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path( @@ -39,16 +39,16 @@ def manage_group() -> None: default="safe", help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."', ) -def manage_clean(dbgym_cfg: DBGymConfig, mode: str) -> None: - clean_workspace(dbgym_cfg, mode=mode, verbose=True) +def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None: + clean_workspace(dbgym_workspace, mode=mode, verbose=True) @click.command("count") @click.pass_obj -def manage_count(dbgym_cfg: DBGymConfig) -> None: - num_files = _count_files_in_workspace(dbgym_cfg) +def manage_count(dbgym_workspace: DBGymWorkspace) -> None: + num_files = _count_files_in_workspace(dbgym_workspace) logging.getLogger(DBGYM_OUTPUT_LOGGER_NAME).info( - f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." + f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." 
) @@ -68,13 +68,15 @@ def add_symlinks_in_dpath( processed_symlinks.add(file_path) -def _count_files_in_workspace(dbgym_cfg: DBGymConfig | MockDBGymConfig) -> int: +def _count_files_in_workspace( + dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, +) -> int: """ Counts the number of files (regular file or dir or symlink) in the workspace. """ total_count = 0 for dirpath, dirnames, filenames in os.walk( - dbgym_cfg.dbgym_workspace_path, followlinks=False + dbgym_workspace.dbgym_workspace_path, followlinks=False ): # Check if any of the directories are symbolic links and remove them from dirnames dirnames[:] = [ @@ -88,7 +90,9 @@ def _count_files_in_workspace(dbgym_cfg: DBGymConfig | MockDBGymConfig) -> int: def clean_workspace( - dbgym_cfg: DBGymConfig | MockDBGymConfig, mode: str = "safe", verbose: bool = False + dbgym_workspace: DBGymWorkspace | MockDBGymWorkspace, + mode: str = "safe", + verbose: bool = False, ) -> None: """ Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks". @@ -102,9 +106,11 @@ def clean_workspace( processed_symlinks: set[Path] = set() # 1. Initialize paths to process - if dbgym_cfg.dbgym_symlinks_path.exists(): + if dbgym_workspace.dbgym_symlinks_path.exists(): add_symlinks_in_dpath( - symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks + symlink_fpaths_to_process, + dbgym_workspace.dbgym_symlinks_path, + processed_symlinks, ) # 2. Go through symlinks, figuring out which "children of task runs" to keep @@ -114,7 +120,7 @@ def clean_workspace( # instead of "run_dpaths". task_run_child_fordpaths_to_keep = set() - if dbgym_cfg.dbgym_runs_path.exists(): + if dbgym_workspace.dbgym_runs_path.exists(): while symlink_fpaths_to_process: symlink_fpath: Path = symlink_fpaths_to_process.pop() assert symlink_fpath.is_symlink() @@ -132,14 +138,16 @@ def clean_workspace( continue # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't # even a descendant, we don't care about it. - if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path): + if not is_child_path(real_fordpath, dbgym_workspace.dbgym_runs_path): continue - assert not real_fordpath.samefile(dbgym_cfg.dbgym_runs_path) + assert not real_fordpath.samefile(dbgym_workspace.dbgym_runs_path) # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep task_run_child_fordpath = None - if parent_dpath_of_path(real_fordpath).samefile(dbgym_cfg.dbgym_runs_path): + if parent_dpath_of_path(real_fordpath).samefile( + dbgym_workspace.dbgym_runs_path + ): # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, # we'll just not delete it if the user happens to have one like this. Even if the user messed up # the structure somehow, it's just a good idea not to delete it. @@ -150,15 +158,15 @@ def clean_workspace( # some reason. 
task_run_child_fordpath = real_fordpath while not parent_dpath_of_path(task_run_child_fordpath).samefile( - dbgym_cfg.dbgym_runs_path + dbgym_workspace.dbgym_runs_path ): task_run_child_fordpath = parent_dpath_of_path( task_run_child_fordpath ) assert task_run_child_fordpath != None assert parent_dpath_of_path(task_run_child_fordpath).samefile( - dbgym_cfg.dbgym_runs_path - ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path" + dbgym_workspace.dbgym_runs_path + ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_workspace.dbgym_runs_path" task_run_child_fordpaths_to_keep.add(task_run_child_fordpath) # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed @@ -171,15 +179,15 @@ def clean_workspace( # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep # It's true that symlinks might link outside of task_runs/*. We'll just not care about those - starting_num_files = _count_files_in_workspace(dbgym_cfg) - if dbgym_cfg.dbgym_runs_path.exists(): - for child_fordpath in dbgym_cfg.dbgym_runs_path.iterdir(): + starting_num_files = _count_files_in_workspace(dbgym_workspace) + if dbgym_workspace.dbgym_runs_path.exists(): + for child_fordpath in dbgym_workspace.dbgym_runs_path.iterdir(): if child_fordpath not in task_run_child_fordpaths_to_keep: if child_fordpath.is_dir(): shutil.rmtree(child_fordpath) else: os.remove(child_fordpath) - ending_num_files = _count_files_in_workspace(dbgym_cfg) + ending_num_files = _count_files_in_workspace(dbgym_workspace) if verbose: logging.getLogger(DBGYM_LOGGER_NAME).info( diff --git a/manage/tests/unittest_clean.py b/manage/tests/unittest_clean.py index 056d9eb7..2e82d4a9 100644 --- a/manage/tests/unittest_clean.py +++ b/manage/tests/unittest_clean.py @@ -1,13 +1,15 @@ -import copy import logging -import os import shutil import unittest from pathlib import Path -from typing import Any, NewType, cast -from manage.cli import MockDBGymConfig, clean_workspace -from util.workspace import path_exists_dont_follow_symlinks +from manage.cli import MockDBGymWorkspace, clean_workspace +from util.tests.filesystem_unittest_util import ( + FilesystemStructure, + create_structure, + make_workspace_structure, + verify_structure, +) # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which # does not go through the `if __name__ == "__main__"` codepath. 
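[Reviewer note, not part of the diff] The hunks below delete the in-file structure helpers in favor of the shared ones imported from util.tests.filesystem_unittest_util. A usage sketch of the relocated helpers; the tuple conventions match the removed implementations, while the scratch directory and structure contents are made up:

```python
# Declare a filesystem layout, materialize it, and verify it.
from pathlib import Path

from util.tests.filesystem_unittest_util import (
    FilesystemStructure,
    create_structure,
    verify_structure,
)

structure = FilesystemStructure(
    {
        "dir1": {"file1.txt": ("file",)},     # ("file",) -> empty regular file
        "link_to_dir1": ("symlink", "dir1"),  # ("symlink", target) -> link,
                                              # target relative to the root
    }
)
root = Path("/tmp/fs_helper_demo")  # hypothetical scratch directory
create_structure(root, structure)
assert verify_structure(root, structure)  # also rejects extra files/links
```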
@@ -17,115 +19,14 @@ logging.basicConfig(level=logging.CRITICAL) -FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) - - class CleanTests(unittest.TestCase): scratchspace_path: Path = Path() - - @staticmethod - def create_structure(root_path: Path, structure: FilesystemStructure) -> None: - def create_structure_internal( - root_path: Path, cur_path: Path, structure: FilesystemStructure - ) -> None: - for path, content in structure.items(): - full_path: Path = cur_path / path - - if isinstance(content, dict): # Directory - full_path.mkdir(parents=True, exist_ok=True) - create_structure_internal( - root_path, - full_path, - FilesystemStructure(cast(dict[str, Any], content)), - ) - elif isinstance(content, tuple) and content[0] == "file": - assert len(content) == 1 - full_path.touch() - elif isinstance(content, tuple) and content[0] == "symlink": - assert len(content) == 2 - target_path = root_path / content[1] - os.symlink(target_path, full_path) - else: - raise ValueError(f"Unsupported type for path ({path}): {content}") - - root_path.mkdir(parents=True, exist_ok=True) - create_structure_internal(root_path, root_path, structure) - - @staticmethod - def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: - def verify_structure_internal( - root_path: Path, cur_path: Path, structure: FilesystemStructure - ) -> bool: - # Check for the presence of each item specified in the structure - for name, item in structure.items(): - new_cur_path = cur_path / name - if not path_exists_dont_follow_symlinks(new_cur_path): - logging.debug(f"{new_cur_path} does not exist") - return False - elif isinstance(item, dict): - if not new_cur_path.is_dir(): - logging.debug(f"expected {new_cur_path} to be a directory") - return False - if not verify_structure_internal( - root_path, - new_cur_path, - FilesystemStructure(cast(dict[str, Any], item)), - ): - return False - elif isinstance(item, tuple) and item[0] == "file": - if not new_cur_path.is_file(): - logging.debug(f"expected {new_cur_path} to be a regular file") - return False - elif isinstance(item, tuple) and item[0] == "symlink": - if not new_cur_path.is_symlink(): - logging.debug(f"expected {new_cur_path} to be a symlink") - return False - # If item[1] is None, this indicates that we expect the symlink to be broken - if item[1] != None: - expected_target = root_path / item[1] - if not new_cur_path.resolve().samefile(expected_target): - logging.debug( - f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" - ) - return False - else: - assert False, "structure misconfigured" - - # Check for any extra files or directories not described by the structure - expected_names = set(structure.keys()) - actual_names = {entry.name for entry in cur_path.iterdir()} - if not expected_names.issuperset(actual_names): - logging.debug( - f"expected_names={expected_names}, actual_names={actual_names}" - ) - return False - - return True - - if not root_path.exists(): - logging.debug(f"{root_path} does not exist") - return False - return verify_structure_internal(root_path, root_path, structure) - - @staticmethod - def make_workspace_structure( - symlinks_structure: FilesystemStructure, - task_runs_structure: FilesystemStructure, - ) -> FilesystemStructure: - """ - This function exists so that it's easier to refactor the tests in case we ever change - how the workspace is organized. 
- """ - return FilesystemStructure( - { - "symlinks": symlinks_structure, - "task_runs": task_runs_structure, - } - ) + workspace_path: Path = Path() @classmethod def setUpClass(cls) -> None: cls.scratchspace_path = Path.cwd() / "manage/tests/test_clean_scratchspace/" + cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" def setUp(self) -> None: if self.scratchspace_path.exists(): @@ -135,183 +36,91 @@ def tearDown(self) -> None: if self.scratchspace_path.exists(): shutil.rmtree(self.scratchspace_path) - def test_structure_helpers(self) -> None: - structure = FilesystemStructure( - { - "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, - "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, - "link_to_dir1": ("symlink", "dir1"), - "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), - } - ) - CleanTests.create_structure(self.scratchspace_path, structure) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, structure)) - - extra_dir_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure) - ) - extra_dir_structure["dir4"] = {} - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure) - ) - - missing_dir_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure) - ) - del missing_dir_structure["dir1"] - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure) - ) - - extra_file_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, extra_file_structure) - ) - extra_file_structure["file3.txt"] = ("file",) - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, extra_file_structure) - ) - - missing_file_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, missing_file_structure) - ) - del missing_file_structure["dir1"]["file1.txt"] - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, missing_file_structure) - ) - - extra_link_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, extra_link_structure) - ) - extra_link_structure["link_to_dir3"] = ("symlink", "dir3") - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, extra_link_structure) - ) - - missing_link_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, missing_link_structure) - ) - del missing_link_structure["link_to_dir1"] - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, missing_link_structure) - ) - - wrong_link_structure = copy.deepcopy(structure) - # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue( - 
CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure) - ) - wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") - self.assertFalse( - CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure) - ) - def test_nonexistent_workspace(self) -> None: - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + # This just ensures that it doesn't raise an exception. + clean_workspace(MockDBGymWorkspace(self.workspace_path)) - def test_no_symlinks_dir_and_no_task_runs_dir(self) -> None: - starting_structure = FilesystemStructure({}) - ending_structure = FilesystemStructure({}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + def test_empty_workspace(self) -> None: + starting_structure = FilesystemStructure({"dbgym_workspace": {}}) + ending_structure = FilesystemStructure({"dbgym_workspace": {}}) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_symlinks_dir_and_yes_task_runs_dir(self) -> None: starting_structure = FilesystemStructure( - {"task_runs": {"file1.txt": ("file",)}} - ) - ending_structure = FilesystemStructure({"task_runs": {}}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) + {"dbgym_workspace": {"task_runs": {"file1.txt": ("file",)}}} ) + ending_structure = FilesystemStructure({"dbgym_workspace": {"task_runs": {}}}) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_yes_symlinks_dir_and_no_task_runs_dir(self) -> None: - starting_structure = FilesystemStructure({"symlinks": {}}) - ending_structure = FilesystemStructure({"symlinks": {}}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + # If there are no task runs there can't be any symlinks. 
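For readers skimming the rewritten tests: FilesystemStructure encodes directories as nested dicts, files as ("file",) tuples, and symlinks as ("symlink", target) tuples, where targets are resolved against the scratchspace root; that is why every target now carries the dbgym_workspace/ prefix. Verification is also exact, so entries on disk that the expected structure does not mention cause it to fail. A small illustrative structure:

    structure = FilesystemStructure(
        {
            "dbgym_workspace": {
                "symlinks": {
                    "result.link": ("symlink", "dbgym_workspace/task_runs/run_0/result.txt"),
                },
                "task_runs": {"run_0": {"result.txt": ("file",)}},
            }
        }
    )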
+ starting_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) + ending_structure = FilesystemStructure({"dbgym_workspace": {"symlinks": {}}}) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self) -> None: starting_symlinks_structure = FilesystemStructure({}) starting_task_runs_structure = FilesystemStructure({}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure({}) ending_task_runs_structure = FilesystemStructure({}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_no_links_in_symlinks(self) -> None: starting_symlinks_structure = FilesystemStructure({}) starting_task_runs_structure = FilesystemStructure({"run_0": {}}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure({}) ending_task_runs_structure = FilesystemStructure({}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_file_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( {"file1.txt": ("file",), "file2.txt": ("file",)} ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/file1.txt")} ) ending_task_runs_structure = FilesystemStructure({"file1.txt": ("file",)}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, 
ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_dir_directly_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { @@ -319,28 +128,26 @@ def test_link_to_dir_directly_in_task_runs(self) -> None: "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { @@ -348,28 +155,26 @@ def test_link_to_file_in_dir_in_task_runs(self) -> None: "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",)}} ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_dir_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/dir2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/dir2")} ) starting_task_runs_structure = FilesystemStructure( { @@ -377,398 +182,395 @@ def test_link_to_dir_in_dir_in_task_runs(self) -> None: "dir3": {"file3.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = 
make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/dir2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/dir2")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": {"dir2": {"file1.txt": ("file",)}, "file2.txt": ("file",)}, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_link_crashes(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/symlink2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "symlink2": ("symlink", "task_runs/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt"), "file1.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) with self.assertRaises(AssertionError): - clean_workspace(MockDBGymConfig(self.scratchspace_path)) + clean_workspace(MockDBGymWorkspace(self.workspace_path)) def test_safe_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + }, "file1.txt": ("file",), "file2.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + }, "file1.txt": ("file",), } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_safe_mode_link_to_file_in_dir_with_link(self) -> None: 
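+        # Note: symlink1 targets a file inside dir1, so dir1 (the direct
+        # task_runs/ child) is kept as a whole; safe mode then follows
+        # dir1/symlink2, so file2.txt survives while file3.txt is deleted.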
starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt"), }, "file2.txt": ("file",), "file3.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt"), }, "file2.txt": ("file",), } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/dir2/file2.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt") + }, "dir2": { "file2.txt": ("file",), }, "file3.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/dir2/file2.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt") + }, "dir2": { "file2.txt": ("file",), }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_aggressive_mode_link_to_dir_with_link(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", 
"dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file1.txt") + }, "file1.txt": ("file",), "file2.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": {"symlink2": ("symlink", None)}, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="aggressive") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="aggressive") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_link_to_link_to_file_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/symlink2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "task_runs/file2.txt")}, + "dir1": { + "symlink2": ("symlink", "dbgym_workspace/task_runs/file2.txt") + }, "file2.txt": ("file",), } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) # We disallow links to links so it's an AssertionError with self.assertRaises(AssertionError): - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") def test_multi_link_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/symlink2")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/symlink2")} ) starting_task_runs_structure = FilesystemStructure( { - "dir1": {"symlink2": ("symlink", "symlinks/symlink1")}, + "dir1": {"symlink2": ("symlink", "dbgym_workspace/symlinks/symlink1")}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) # pathlib disallows multi-link loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") def test_link_self_loop_gives_error(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "symlinks/symlink1")} + {"symlink1": ("symlink", "dbgym_workspace/symlinks/symlink1")} ) 
starting_task_runs_structure = FilesystemStructure({}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) + create_structure(self.scratchspace_path, starting_structure) # pathlib disallows link self-loops so it's a RuntimeError with self.assertRaises(RuntimeError): - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs( self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir2/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir2/file2.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_itself( self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", 
"dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt"), + "symlink2": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt"), }, } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_broken_symlink_has_no_effect(self) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) starting_task_runs_structure = FilesystemStructure( { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/non_existent_file.txt"), + "symlink2": ( + "symlink", + "dbgym_workspace/task_runs/dir1/non_existent_file.txt", + ), }, "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1")} ) ending_task_runs_structure = FilesystemStructure( {"dir1": {"file1.txt": ("file",), "symlink2": ("symlink", None)}} ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - 
self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) # The idea behind this test is that we shouldn't be following links outside of task_runs, even on safe mode def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save_other_run( self, ) -> None: starting_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) starting_task_runs_structure = FilesystemStructure( { @@ -779,19 +581,19 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save "dir2": {"file2.txt": ("file",)}, } ) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) starting_structure["external"] = FilesystemStructure( { "dir3": { "file3.txt": ("file",), - "symlink3": ("symlink", "task_runs/dir2/file2.txt"), + "symlink3": ("symlink", "dbgym_workspace/task_runs/dir2/file2.txt"), } } ) ending_symlinks_structure = FilesystemStructure( - {"symlink1": ("symlink", "task_runs/dir1/file1.txt")} + {"symlink1": ("symlink", "dbgym_workspace/task_runs/dir1/file1.txt")} ) ending_task_runs_structure = FilesystemStructure( { @@ -801,38 +603,34 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save } } ) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) ending_structure["external"] = { "dir3": {"file3.txt": ("file",), "symlink3": ("symlink", None)} } - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, ending_structure)) def test_outside_task_runs_doesnt_get_deleted(self) -> None: starting_symlinks_structure = FilesystemStructure({}) starting_task_runs_structure = FilesystemStructure({"dir1": {}}) - starting_structure = CleanTests.make_workspace_structure( + starting_structure = make_workspace_structure( starting_symlinks_structure, starting_task_runs_structure ) starting_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) ending_symlinks_structure = FilesystemStructure({}) ending_task_runs_structure = FilesystemStructure({}) - ending_structure = CleanTests.make_workspace_structure( + ending_structure = make_workspace_structure( ending_symlinks_structure, ending_task_runs_structure ) ending_structure["external"] = FilesystemStructure({"file1.txt": ("file",)}) - CleanTests.create_structure(self.scratchspace_path, starting_structure) - clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue( - CleanTests.verify_structure(self.scratchspace_path, ending_structure) - ) + create_structure(self.scratchspace_path, starting_structure) + clean_workspace(MockDBGymWorkspace(self.workspace_path), mode="safe") + self.assertTrue(verify_structure(self.scratchspace_path, 
ending_structure)) if __name__ == "__main__": diff --git a/task.py b/task.py index 0b59cae5..1205999b 100644 --- a/task.py +++ b/task.py @@ -4,7 +4,7 @@ from dbms.cli import dbms_group from manage.cli import manage_group from util.log import set_up_loggers, set_up_warnings -from util.workspace import make_standard_dbgym_cfg +from util.workspace import make_standard_dbgym_workspace # TODO(phw2): Save commit, git diff, and run command. # TODO(phw2): Remove write permissions on old run_*/ dirs to enforce that they are immutable. @@ -15,10 +15,10 @@ @click.pass_context def task(ctx: click.Context) -> None: """🛢️ CMU-DB Database Gym: github.com/cmu-db/dbgym 🏋️""" - dbgym_cfg = make_standard_dbgym_cfg() - ctx.obj = dbgym_cfg + dbgym_workspace = make_standard_dbgym_workspace() + ctx.obj = dbgym_workspace - log_dpath = dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + log_dpath = dbgym_workspace.cur_task_runs_artifacts_path(mkdir=True) set_up_loggers(log_dpath) set_up_warnings(log_dpath) diff --git a/util/pg.py b/util/pg.py index 43252727..32f2fdee 100644 --- a/util/pg.py +++ b/util/pg.py @@ -11,7 +11,7 @@ import sqlalchemy from sqlalchemy import create_engine, text -from util.workspace import DBGymConfig, open_and_save +from util.workspace import DBGymWorkspace, open_and_save DBGYM_POSTGRES_USER = "dbgym_user" DBGYM_POSTGRES_PASS = "dbgym_pass" @@ -27,8 +27,8 @@ def sqlalchemy_conn_execute( return conn.execute(text(sql)) -def sql_file_queries(dbgym_cfg: DBGymConfig, filepath: Path) -> list[str]: - with open_and_save(dbgym_cfg, filepath) as f: +def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: + with open_and_save(dbgym_workspace, filepath) as f: lines: list[str] = [] for line in f: if line.startswith("--"): @@ -42,9 +42,9 @@ def sql_file_queries(dbgym_cfg: DBGymConfig, filepath: Path) -> list[str]: def sql_file_execute( - dbgym_cfg: DBGymConfig, conn: sqlalchemy.Connection, filepath: Path + dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path ) -> None: - for sql in sql_file_queries(dbgym_cfg, filepath): + for sql in sql_file_queries(dbgym_workspace, filepath): sqlalchemy_conn_execute(conn, sql) diff --git a/util/tests/__init__.py b/util/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/util/tests/filesystem_unittest_util.py b/util/tests/filesystem_unittest_util.py new file mode 100644 index 00000000..37997c71 --- /dev/null +++ b/util/tests/filesystem_unittest_util.py @@ -0,0 +1,134 @@ +import logging +import os +from pathlib import Path +from typing import Any, NewType, cast + +FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) + + +def create_structure(root_path: Path, structure: FilesystemStructure) -> None: + """ + Create files and directories according to the structure. 
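+    Directories are dicts, files are ("file",) tuples (optionally ("file", contents)),
+    and symlinks are ("symlink", target) tuples whose targets are interpreted
+    relative to root_path.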
+ """ + + def create_structure_internal( + root_path: Path, cur_path: Path, structure: FilesystemStructure + ) -> None: + for path, content in structure.items(): + full_path: Path = cur_path / path + + if isinstance(content, dict): # Directory + full_path.mkdir(parents=True, exist_ok=True) + create_structure_internal( + root_path, + full_path, + FilesystemStructure(cast(dict[str, Any], content)), + ) + elif isinstance(content, tuple) and content[0] == "file": + full_path.parent.mkdir(parents=True, exist_ok=True) + if len(content) == 2: + full_path.write_text(content[1]) + else: + assert len(content) == 1 + full_path.touch() + elif isinstance(content, tuple) and content[0] == "symlink": + assert len(content) == 2 + target_path = root_path / content[1] + os.symlink(target_path, full_path) + else: + raise ValueError(f"Unsupported type for path ({path}): {content}") + + root_path.mkdir(parents=True, exist_ok=True) + create_structure_internal(root_path, root_path, structure) + + +def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: + """ + Verify that the files and directories match the expected structure. + """ + + def verify_structure_internal( + root_path: Path, cur_path: Path, structure: FilesystemStructure + ) -> bool: + # Check for the presence of each item specified in the structure + for name, item in structure.items(): + new_cur_path = cur_path / name + if not path_exists_dont_follow_symlinks(new_cur_path): + logging.debug(f"{new_cur_path} does not exist") + return False + elif isinstance(item, dict): + if not new_cur_path.is_dir(): + logging.debug(f"expected {new_cur_path} to be a directory") + return False + if not verify_structure_internal( + root_path, + new_cur_path, + FilesystemStructure(cast(dict[str, Any], item)), + ): + return False + elif isinstance(item, tuple) and item[0] == "file": + if not new_cur_path.is_file(): + logging.debug(f"expected {new_cur_path} to be a regular file") + return False + elif isinstance(item, tuple) and item[0] == "symlink": + if not new_cur_path.is_symlink(): + logging.debug(f"expected {new_cur_path} to be a symlink") + return False + # If item[1] is None, this indicates that we expect the symlink to be broken + if item[1] != None: + expected_target = root_path / item[1] + if not new_cur_path.resolve().samefile(expected_target): + logging.debug( + f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" + ) + return False + else: + assert False, "structure misconfigured" + + # Check for any extra files or directories not described by the structure + expected_names = set(structure.keys()) + actual_names = {entry.name for entry in cur_path.iterdir()} + if not expected_names.issuperset(actual_names): + logging.debug( + f"expected_names={expected_names}, actual_names={actual_names}" + ) + return False + + return True + + if not root_path.exists(): + logging.debug(f"{root_path} does not exist") + return False + return verify_structure_internal(root_path, root_path, structure) + + +def make_workspace_structure( + symlinks_structure: FilesystemStructure, + task_runs_structure: FilesystemStructure, +) -> FilesystemStructure: + """ + This function exists so that it's easier to refactor the tests in case we ever change + how the workspace is organized. 
+ """ + return FilesystemStructure( + { + "dbgym_workspace": { + "symlinks": symlinks_structure, + "task_runs": task_runs_structure, + "tmp": {}, + } + } + ) + + +def path_exists_dont_follow_symlinks(path: Path) -> bool: + """ + As of writing this comment, ray is currently constraining us to python <3.12. However, the "follow_symlinks" option in + Path.exists() only comes up in python 3.12. Thus, this is the only way to check if a path exists without following symlinks. + """ + # If the path exists and is a symlink, os.path.islink() will be true (even if the symlink is broken) + if os.path.islink(path): + return True + # Otherwise, we know it's either non-existent or not a symlink, so path.exists() works fine + else: + return path.exists() diff --git a/util/tests/unittest_filesystem_unittest_util.py b/util/tests/unittest_filesystem_unittest_util.py new file mode 100644 index 00000000..04f3fe7d --- /dev/null +++ b/util/tests/unittest_filesystem_unittest_util.py @@ -0,0 +1,96 @@ +import copy +import shutil +import unittest +from pathlib import Path + +from util.tests.filesystem_unittest_util import ( + FilesystemStructure, + create_structure, + verify_structure, +) + + +class FilesystemUnittestUtilTests(unittest.TestCase): + scratchspace_path: Path = Path() + + @classmethod + def setUpClass(cls) -> None: + cls.scratchspace_path = ( + Path.cwd() / "util/tests/test_filesystem_unittest_util_scratchspace/" + ) + + def setUp(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + def tearDown(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + def test_filesystem_unittest_util(self) -> None: + structure = FilesystemStructure( + { + "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, + "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, + "link_to_dir1": ("symlink", "dir1"), + "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), + } + ) + create_structure(self.scratchspace_path, structure) + self.assertTrue(verify_structure(self.scratchspace_path, structure)) + + extra_dir_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, extra_dir_structure)) + extra_dir_structure["dir4"] = {} + self.assertFalse(verify_structure(self.scratchspace_path, extra_dir_structure)) + + missing_dir_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, missing_dir_structure)) + del missing_dir_structure["dir1"] + self.assertFalse( + verify_structure(self.scratchspace_path, missing_dir_structure) + ) + + extra_file_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, extra_file_structure)) + extra_file_structure["file3.txt"] = ("file",) + self.assertFalse(verify_structure(self.scratchspace_path, extra_file_structure)) + + missing_file_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue( + verify_structure(self.scratchspace_path, missing_file_structure) + ) + del missing_file_structure["dir1"]["file1.txt"] + self.assertFalse( + verify_structure(self.scratchspace_path, missing_file_structure) + ) + + 
extra_link_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, extra_link_structure)) + extra_link_structure["link_to_dir3"] = ("symlink", "dir3") + self.assertFalse(verify_structure(self.scratchspace_path, extra_link_structure)) + + missing_link_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue( + verify_structure(self.scratchspace_path, missing_link_structure) + ) + del missing_link_structure["link_to_dir1"] + self.assertFalse( + verify_structure(self.scratchspace_path, missing_link_structure) + ) + + wrong_link_structure = copy.deepcopy(structure) + # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it + self.assertTrue(verify_structure(self.scratchspace_path, wrong_link_structure)) + wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") + self.assertFalse(verify_structure(self.scratchspace_path, wrong_link_structure)) + + +if __name__ == "__main__": + unittest.main() diff --git a/util/tests/unittest_workspace.py b/util/tests/unittest_workspace.py new file mode 100644 index 00000000..89a1c6c1 --- /dev/null +++ b/util/tests/unittest_workspace.py @@ -0,0 +1,461 @@ +# TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. + +import os +import shutil +import unittest +from pathlib import Path +from typing import Optional + +from util.tests.filesystem_unittest_util import ( + FilesystemStructure, + create_structure, + make_workspace_structure, + verify_structure, +) +from util.workspace import DBGymWorkspace + + +class WorkspaceTests(unittest.TestCase): + scratchspace_path: Path = Path() + workspace_path: Path = Path() + + @classmethod + def setUpClass(cls) -> None: + cls.scratchspace_path = Path.cwd() / "util/tests/test_workspace_scratchspace/" + cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" + + def setUp(self) -> None: + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + self.workspace: Optional[DBGymWorkspace] = None + self.expected_structure: Optional[FilesystemStructure] = None + + def tearDown(self) -> None: + # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). + if self.scratchspace_path.exists(): + shutil.rmtree(self.scratchspace_path) + + # All these helper functions will perform an action, update the expected structure, and then verify the structure. + # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). + def init_workspace_helper(self) -> None: + # Reset this to avoid the error of it being created twice. + # In real usage, the second run would be a different Python process so DBGymWorkspace.num_times_created_this_run would be 0. 
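+        # Note: DBGymWorkspace increments a class-level counter in __init__ and
+        # asserts that it equals 1 (see util/workspace.py at the end of this
+        # patch), so a test that simulates several "runs" in one process must
+        # zero the counter before each construction.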
+ DBGymWorkspace.num_times_created_this_run = 0 + self.workspace = DBGymWorkspace(self.workspace_path) + + if self.expected_structure is None: + self.expected_structure = make_workspace_structure( + FilesystemStructure({}), + FilesystemStructure( + { + "latest_run.link": ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", + ), + self.workspace.dbgym_this_run_path.name: {}, + } + ), + ) + else: + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ] = {} + self.expected_structure["dbgym_workspace"]["task_runs"][ + "latest_run.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", + ) + + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def make_file_helper( + self, relative_path: str, file_obj: tuple[str, ...] = ("file",) + ) -> Path: + """ + You can override file_obj to make it a symlink instead. + """ + assert self.workspace is not None and self.expected_structure is not None + assert ( + ".." not in relative_path + ), 'relative_path should not contain ".." (it should be inside the scratchspace dir)' + file_path = self.scratchspace_path / relative_path + file_path.parent.mkdir(parents=True, exist_ok=True) + + if file_obj[0] == "file": + assert len(file_obj) in [1, 2] + file_path.touch() + elif file_obj[0] == "symlink": + assert len(file_obj) == 2 + target_path = self.scratchspace_path / file_obj[1] + os.symlink(target_path, file_path) + else: + assert False, f"Unsupported file_obj: {file_obj}" + + # Build up the nested dict structure for the expected path + current_dict = self.expected_structure + path_parts = relative_path.split("/") + for part in path_parts[:-1]: + if part not in current_dict: + current_dict[part] = {} + current_dict = current_dict[part] + current_dict[path_parts[-1]] = file_obj + + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + return file_path + + def make_result_helper( + self, relative_path: str = "result.txt", file_obj: tuple[str, ...] = ("file",) + ) -> Path: + assert self.workspace is not None and self.expected_structure is not None + assert ( + ".." not in relative_path + ), 'relative_path should not contain ".." 
(it should be inside the run_*/ dir)' + return self.make_file_helper( + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{relative_path}", + file_obj=file_obj, + ) + + def test_init_fields(self) -> None: + workspace = DBGymWorkspace(self.workspace_path) + self.assertEqual(workspace.app_name, "dbgym") + + def test_init_from_nonexistent_workspace(self) -> None: + self.init_workspace_helper() + + def test_init_from_empty_workspace(self) -> None: + starting_structure = FilesystemStructure({"dbgym_workspace": {}}) + create_structure(self.scratchspace_path, starting_structure) + self.init_workspace_helper() + + def test_init_from_already_initialized_workspace(self) -> None: + self.init_workspace_helper() + self.init_workspace_helper() + + def test_link_result_basic_functionality(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + self.workspace.link_result(result_path) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_link_result_does_not_copy_directory_structure_to_symlinks_dir( + self, + ) -> None: + """ + We always just want link_result to link to the base symlinks dir. + """ + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper(relative_path="dir1/dir2/dir3/result.txt") + self.workspace.link_result(result_path) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_link_result_invalid_custom_link_name(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + with self.assertRaisesRegex( + AssertionError, 'link_name \\(custom\\) should end with "\\.link"' + ): + self.workspace.link_result(result_path, custom_link_name=f"custom") + + def test_link_result_valid_custom_link_name(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + self.workspace.link_result(result_path, custom_link_name="custom.link") + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + "custom.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_link_same_result_twice_with_same_link_name(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + self.workspace.link_result(result_path) + self.workspace.link_result(result_path) + 
self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_link_same_result_with_different_name(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + self.workspace.link_result(result_path) + self.workspace.link_result(result_path, custom_link_name="custom.link") + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"] = {} + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"{result_path.name}.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.expected_structure["dbgym_workspace"]["symlinks"]["dbgym"][ + f"custom.link" + ] = ( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_link_result_from_another_run_raises_error(self) -> None: + self.init_workspace_helper() + result_path = self.make_result_helper() + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + with self.assertRaisesRegex( + AssertionError, + "The result must have been generated in \*this\* run\_\*/ dir", + ): + self.workspace.link_result(result_path) + + def test_link_result_from_external_dir_raises_error(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_file_helper("external/result.txt") + with self.assertRaisesRegex( + AssertionError, + "The result must have been generated in \*this\* run\_\*/ dir", + ): + self.workspace.link_result(result_path) + + def test_link_result_cannot_link_symlink(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + result_path = self.make_result_helper() + symlink_path = self.make_result_helper( + "symlink.link", + file_obj=( + "symlink", + f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", + ), + ) + with self.assertRaisesRegex( + AssertionError, + "result_fordpath \(.*\) should be a fully resolved path", + ): + self.workspace.link_result(symlink_path) + + def test_save_file_dependency(self) -> None: + """ + See the comments in save_file() for what a "dependency" is. 
+ """ + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result_path = self.make_result_helper() + self.init_workspace_helper() + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{result_path.name}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_save_file_same_dependency_twice(self) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result_path = self.make_result_helper(file_obj=("file",)) + self.init_workspace_helper() + self.workspace.save_file(result_path) + self.workspace.save_file(result_path) + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{result_path.name}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_name}/{result_path.name}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_save_file_two_different_dependencies_with_same_filename_both_directly_inside_run( + self, + ) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_names = [] + prev_run_names.append(self.workspace.dbgym_this_run_path.name) + result1_path = self.make_result_helper(file_obj=("file",)) + self.init_workspace_helper() + prev_run_names.append(self.workspace.dbgym_this_run_path.name) + result2_path = self.make_result_helper(file_obj=("file",)) + filename = result1_path.name + assert filename == result2_path.name + + self.init_workspace_helper() + self.workspace.save_file(result1_path) + self.workspace.save_file(result2_path) + # The second save_file() should have overwritten the first one. + self.expected_structure["dbgym_workspace"]["task_runs"][ + self.workspace.dbgym_this_run_path.name + ][f"{filename}.link"] = ( + "symlink", + f"dbgym_workspace/task_runs/{prev_run_names[-1]}/{filename}", + ) + self.assertTrue( + verify_structure(self.scratchspace_path, self.expected_structure) + ) + + def test_save_file_two_different_dependencies_with_same_filename_but_different_outermost_dirs( + self, + ) -> None: + self.init_workspace_helper() + assert self.workspace is not None and self.expected_structure is not None + prev_run_name = self.workspace.dbgym_this_run_path.name + result1_path = self.make_result_helper("dir1/result.txt", file_obj=("file",)) + result2_path = self.make_result_helper("result.txt", file_obj=("file",)) + filename = result1_path.name + assert filename == result2_path.name + + self.init_workspace_helper() + self.workspace.save_file(result1_path) + self.workspace.save_file(result2_path) + # The second save_file() should not overwrite the first one because the outermost dirs are different. 
+        self.expected_structure["dbgym_workspace"]["task_runs"][
+            self.workspace.dbgym_this_run_path.name
+        ][f"{filename}.link"] = (
+            "symlink",
+            f"dbgym_workspace/task_runs/{prev_run_name}/{filename}",
+        )
+        self.expected_structure["dbgym_workspace"]["task_runs"][
+            self.workspace.dbgym_this_run_path.name
+        ]["dir1.link"] = (
+            "symlink",
+            f"dbgym_workspace/task_runs/{prev_run_name}/dir1",
+        )
+        self.assertTrue(
+            verify_structure(self.scratchspace_path, self.expected_structure)
+        )
+
+    def test_save_file_config(self) -> None:
+        """
+        See the comments in save_file() for what a "config" is.
+        """
+        self.init_workspace_helper()
+        assert self.workspace is not None and self.expected_structure is not None
+        result_path = self.make_file_helper(
+            "external/result.txt", file_obj=("file", "contents")
+        )
+        self.workspace.save_file(result_path)
+        self.expected_structure["dbgym_workspace"]["task_runs"][
+            self.workspace.dbgym_this_run_path.name
+        ][result_path.name] = ("file", "contents")
+        self.assertTrue(
+            verify_structure(self.scratchspace_path, self.expected_structure)
+        )
+
+    def test_save_file_same_config_twice(self) -> None:
+        self.init_workspace_helper()
+        assert self.workspace is not None and self.expected_structure is not None
+        result_path = self.make_file_helper(
+            "external/result.txt", file_obj=("file", "contents")
+        )
+        self.workspace.save_file(result_path)
+        self.workspace.save_file(result_path)
+        self.expected_structure["dbgym_workspace"]["task_runs"][
+            self.workspace.dbgym_this_run_path.name
+        ][result_path.name] = ("file", "contents")
+        self.assertTrue(
+            verify_structure(self.scratchspace_path, self.expected_structure)
+        )
+
+    def test_save_file_two_different_configs_with_same_filename(self) -> None:
+        self.init_workspace_helper()
+        assert self.workspace is not None and self.expected_structure is not None
+        result1_path = self.make_file_helper(
+            "external/result.txt", file_obj=("file", "contents1")
+        )
+        result2_path = self.make_file_helper(
+            "external/dir1/result.txt", file_obj=("file", "contents2")
+        )
+        filename = result1_path.name
+        assert filename == result2_path.name
+
+        self.workspace.save_file(result1_path)
+        self.workspace.save_file(result2_path)
+        # The second save_file() should have overwritten the first one.
+        self.expected_structure["dbgym_workspace"]["task_runs"][
+            self.workspace.dbgym_this_run_path.name
+        ][filename] = ("file", "contents2")
+        self.assertTrue(
+            verify_structure(self.scratchspace_path, self.expected_structure)
+        )
+
+    def test_save_file_dependency_inside_directory(self) -> None:
+        self.init_workspace_helper()
+        assert self.workspace is not None and self.expected_structure is not None
+        prev_run_name = self.workspace.dbgym_this_run_path.name
+        result_path = self.make_result_helper("dir1/dir2/result.txt")
+        self.make_result_helper("dir1/other1.txt")
+        self.make_result_helper("dir1/dir3/other2.txt")
+        self.init_workspace_helper()
+        self.workspace.save_file(result_path)
+        self.expected_structure["dbgym_workspace"]["task_runs"][
+            self.workspace.dbgym_this_run_path.name
+        ]["dir1.link"] = (
+            "symlink",
+            f"dbgym_workspace/task_runs/{prev_run_name}/dir1",
+        )
+        self.assertTrue(
+            verify_structure(self.scratchspace_path, self.expected_structure)
+        )
+
+    def test_save_file_generated_this_run_raises_error(self) -> None:
+        self.init_workspace_helper()
+        assert self.workspace is not None and self.expected_structure is not None
+        result_path = self.make_result_helper()
+        with self.assertRaisesRegex(
+            AssertionError,
+            r"fpath \(.*\) was generated in this task run \(.*\)\. You do not need to save it",
You do not need to save it", + ): + self.workspace.save_file(result_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/util/workspace.py b/util/workspace.py index 3c580a7f..11e0d620 100644 --- a/util/workspace.py +++ b/util/workspace.py @@ -8,7 +8,6 @@ import subprocess import time from datetime import datetime -from enum import Enum from pathlib import Path from typing import IO, Any, Optional @@ -19,12 +18,6 @@ from util.log import DBGYM_LOGGER_NAME from util.shell import subprocess_run -# Enums -TuningMode = Enum("TuningMode", ["HPO", "TUNE", "REPLAY"]) - -# Default values -DEFAULT_WORKLOAD_TIMEOUT = 600 - # Relative paths of different folders in the codebase DBMS_PATH = Path("dbms") POSTGRES_PATH = DBMS_PATH / "postgres" @@ -99,7 +92,7 @@ def get_default_tables_dname(scale_factor: float | str) -> str: # Paths of dependencies in the workspace. These are named "*_path" because they will be an absolute path # The reason these _cannot_ be relative paths is because relative paths are relative to the codebase root, not the workspace root # Note that it's okay to hardcode the codebase paths (like dbgym_dbms_postgres) here. In the worst case, we'll just break an -# integration test. The "source of truth" of codebase paths is based on DBGymConfig.cur_source_path(), which will always +# integration test. The "source of truth" of codebase paths is based on DBGymWorkspace.cur_source_path(), which will always # reflect the actual codebase structure. As long as we automatically enforce getting the right codebase paths when writing, it's # ok to have to hardcode them when reading. # Details @@ -159,43 +152,33 @@ def get_default_pgbin_path(workspace_path: Path) -> Path: return get_default_repo_path(workspace_path) / "boot" / "build" / "postgres" / "bin" -class DBGymConfig: +class DBGymWorkspace: """ Global configurations that apply to all parts of DB-Gym """ num_times_created_this_run: int = 0 - def __init__(self, dbgym_config_path: Path): - # The logic around dbgym_tmp_path assumes that DBGymConfig is only constructed once. - DBGymConfig.num_times_created_this_run += 1 + def __init__(self, dbgym_workspace_path: Path): + # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. + DBGymWorkspace.num_times_created_this_run += 1 assert ( - DBGymConfig.num_times_created_this_run == 1 - ), f"DBGymConfig has been created {DBGymConfig.num_times_created_this_run} times. It should only be created once per run." - - assert is_base_git_dir( - os.getcwd() - ), "This script should be invoked from the root of the dbgym repo." - - # Parse the YAML file. - contents: str = dbgym_config_path.read_text() - yaml_config: dict[str, Any] = yaml.safe_load(contents) + DBGymWorkspace.num_times_created_this_run == 1 + ), f"DBGymWorkspace has been created {DBGymWorkspace.num_times_created_this_run} times. It should only be created once per run." - # Require dbgym_workspace_path to be absolute. - # All future paths should be constructed from dbgym_workspace_path. - dbgym_workspace_path = ( - Path(yaml_config["dbgym_workspace_path"]).resolve().absolute() - ) - - self.path: Path = dbgym_config_path + self.base_dbgym_repo_dpath = get_base_dbgym_repo_dpath() self.cur_path_list: list[str] = ["dbgym"] - self.root_yaml: dict[str, Any] = yaml_config - self.cur_yaml: dict[str, Any] = self.root_yaml + self.app_name = ( + "dbgym" # TODO: discover this dynamically. app means dbgym or an agent + ) # Set and create paths. 
-        self.dbgym_repo_path = Path(os.getcwd())
         self.dbgym_workspace_path = dbgym_workspace_path
         self.dbgym_workspace_path.mkdir(parents=True, exist_ok=True)
+
+        # Now that the workspace is guaranteed to be created, we can check if it's fully resolved.
+        assert is_fully_resolved(self.dbgym_workspace_path)
+
         self.dbgym_runs_path = get_runs_path_from_workspace_path(
             self.dbgym_workspace_path
         )
@@ -211,8 +194,8 @@ def __init__(self, dbgym_config_path: Path):
         self.dbgym_tmp_path = get_tmp_path_from_workspace_path(
             self.dbgym_workspace_path
         )
-        # The best place to delete the old dbgym_tmp_path is in DBGymConfig.__init__().
-        # This is better than deleting the dbgym_tmp_path is in DBGymConfig.__del__() because DBGymConfig may get deleted before execution has completed.
+        # The best place to delete the old dbgym_tmp_path is in DBGymWorkspace.__init__().
+        # This is better than deleting it in DBGymWorkspace.__del__() because DBGymWorkspace may get deleted before execution has completed.
         # Also, by keeping the tmp directory around, you can look at it to debug issues.
         if self.dbgym_tmp_path.exists():
             shutil.rmtree(self.dbgym_tmp_path)
@@ -240,14 +223,133 @@ def __init__(self, dbgym_config_path: Path):
         try_remove_file(self.dbgym_latest_run_path)
         try_create_symlink(self.dbgym_this_run_path, self.dbgym_latest_run_path)

-    # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is
-    # explained further in the documentation.
+    # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead
+    def link_result(
+        self,
+        result_fordpath: Path,
+        custom_link_name: Optional[str] = None,
+    ) -> Path:
+        """
+        result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
+        Further, result_fordpath must have been generated by this invocation to task.py. This also means that
+        result_fordpath itself can be a file or a dir but not a symlink.
+        Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside
+        symlinks/[codebase]/[org]/.
+        Will overwrite the old symlink if there is one, so that symlinks/ always contains the latest generated
+        version of a file.
+        This function will return the path to the symlink that was created.
+        """
+        assert isinstance(result_fordpath, Path)
+        assert is_fully_resolved(
+            result_fordpath
+        ), f"result_fordpath ({result_fordpath}) should be a fully resolved path"
+        assert is_child_path(
+            result_fordpath, self.dbgym_this_run_path
+        ), "The result must have been generated in *this* run_*/ dir"
+        assert not os.path.islink(result_fordpath)
+
+        if isinstance(custom_link_name, str):
+            link_name = custom_link_name
+        else:
+            if os.path.isfile(result_fordpath):
+                link_name = basename_of_path(result_fordpath) + ".link"
+            elif os.path.isdir(result_fordpath):
+                link_name = basename_of_path(result_fordpath) + ".link"
+            else:
+                raise AssertionError("result_fordpath must be either a file or dir")
+
+        symlink_parent_dpath = self.dbgym_symlinks_path / self.app_name
+        symlink_parent_dpath.mkdir(parents=True, exist_ok=True)
+
+        # Remove the old symlink ("old" meaning created in an earlier run) if there is one
+        # Note that in a multi-threaded setting, this might remove one created by a process in the same run,
+        # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink
+        # file of the current run regardless of the order of threads.
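+        # Illustrative example (hypothetical file name): if run_1 produced out.csv and
+        # linked it as symlinks/dbgym/out.csv.link, a later run that links its own out.csv
+        # repoints out.csv.link at the new copy, while run_1's file stays under task_runs/run_1/.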
+        assert link_name.endswith(".link") and not link_name.endswith(
+            ".link.link"
+        ), f'link_name ({link_name}) should end with ".link"'
+        symlink_path = symlink_parent_dpath / link_name
+        try_remove_file(symlink_path)
+        try_create_symlink(result_fordpath, symlink_path)
+
+        return symlink_path
+
+    def get_run_dpath_from_fpath(self, fpath: Path) -> Path:
+        run_dpath = fpath
+        while not parent_dpath_of_path(run_dpath).samefile(self.dbgym_runs_path):
+            run_dpath = parent_dpath_of_path(run_dpath)
+        return run_dpath
+
+    # TODO(phw2): really look at the clean PR to see what it changed
+    # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save()
+    def save_file(self, fpath: Path) -> None:
+        """
+        If an external function takes in a file/directory as input, you will not be able to call open_and_save().
+        In these situations, just call save_file().
+        Like open_and_save(), this function only works with real absolute paths.
+        "Saving" can mean either copying the file or creating a symlink to it.
+        We copy the file if it is a "config", meaning it just exists without having been generated.
+        We create a symlink if it is a "dependency", meaning a task.py command was run to generate it.
+        In these cases we create a symlink so we have full provenance for how the dependency was created.
+
+        **Notable Behavior**
+        - When you save a dependency, it actually creates a link to the outermost directory still inside run_*/.
+        - If you save the same file twice in the same run, the second save will overwrite the first.
+        - If you save two configs with the same name, the second save will overwrite the first.
+        - If you save two dependencies with the same *outermost* directory, or two dependencies with the same filename
+          both directly inside run_*/, the second save will overwrite the first.
+        """
+        # validate fpath
+        assert isinstance(fpath, Path)
+        assert not os.path.islink(fpath), f"fpath ({fpath}) should not be a symlink"
+        assert os.path.exists(fpath), f"fpath ({fpath}) does not exist"
+        assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file"
+        assert not is_child_path(
+            fpath, self.dbgym_this_run_path
+        ), f"fpath ({fpath}) was generated in this task run ({self.dbgym_this_run_path}). You do not need to save it"
+
+        # Save _something_ to dbgym_this_run_path.
+        # Save a symlink if the opened file was generated by a run. This is for two reasons:
+        # 1. Files or dirs generated by a run are supposed to be immutable so saving a symlink is safe.
+        # 2. Files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them.
+        if is_child_path(fpath, self.dbgym_runs_path):
+            # If the fpath file is directly in run_dpath, we symlink the file directly.
+            run_dpath = self.get_run_dpath_from_fpath(fpath)
+            parent_dpath = parent_dpath_of_path(fpath)
+            if parent_dpath.samefile(run_dpath):
+                fname = basename_of_path(fpath)
+                symlink_fpath = self.dbgym_this_run_path / (fname + ".link")
+                try_remove_file(symlink_fpath)
+                try_create_symlink(fpath, symlink_fpath)
+            # Otherwise, we know the fpath file is _not_ directly inside run_dpath dir.
+            # We go as far back as we can while still staying in run_dpath and symlink that "base" dir.
+            # This is because lots of runs create dirs within run_dpath and it creates too much clutter to symlink every individual file.
+            # Further, this avoids an edge case where you both save a file and the dir it's in.
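+            # Illustrative example (hypothetical paths): saving
+            # task_runs/run_A/dir1/dir2/result.txt creates dbgym_this_run_path/dir1.link
+            # pointing at run_A/dir1, rather than a separate link for result.txt itself.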
+            else:
+                # Set base_dpath such that its parent is run_dpath.
+                base_dpath = parent_dpath
+                while not parent_dpath_of_path(base_dpath).samefile(run_dpath):
+                    base_dpath = parent_dpath_of_path(base_dpath)
+
+                # Create symlink
+                open_base_dname = basename_of_path(base_dpath)
+                symlink_dpath = self.dbgym_this_run_path / (open_base_dname + ".link")
+                try_remove_file(symlink_dpath)
+                try_create_symlink(base_dpath, symlink_dpath)
+        # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable.
+        else:
+            fname = basename_of_path(fpath)
+            # In this case, we want to copy instead of symlinking since it might disappear in the future.
+            copy_fpath = self.dbgym_this_run_path / fname
+            shutil.copy(fpath, copy_fpath)
+
+    # `append_group()` is used to mark the "codebase path" of an invocation of the CLI. The "codebase path" is explained further in the documentation.
     def append_group(self, name: str) -> None:
         self.cur_path_list.append(name)
-        self.cur_yaml = self.cur_yaml.get(name, {})

     def cur_source_path(self, *dirs: str) -> Path:
-        cur_path = self.dbgym_repo_path
+        cur_path = self.base_dbgym_repo_dpath
         assert self.cur_path_list[0] == "dbgym"
         for folder in self.cur_path_list[1:]:
             cur_path = cur_path / folder
@@ -292,17 +394,28 @@ def cur_task_runs_artifacts_path(self, *dirs: str, mkdir: bool = False) -> Path:
     return self.cur_task_runs_path("artifacts", *dirs, mkdir=mkdir)


-def make_standard_dbgym_cfg() -> DBGymConfig:
+def get_workspace_path_from_config(dbgym_config_path: Path) -> Path:
+    """
+    Returns the workspace path (as a fully resolved path) from the config file.
     """
-    The "standard" way to make a DBGymConfig using the DBGYM_CONFIG_PATH envvar and the
+    with open(dbgym_config_path) as f:
+        # We do *not* call fully_resolve_path() here because the workspace may not exist yet.
+        return Path(yaml.safe_load(f)["dbgym_workspace_path"]).resolve().absolute()
+
+
+def make_standard_dbgym_workspace() -> DBGymWorkspace:
+    """
+    The "standard" way to make a DBGymWorkspace using the DBGYM_CONFIG_PATH envvar and the
     default path of dbgym_config.yaml.
     """
     dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml"))
-    dbgym_cfg = DBGymConfig(dbgym_config_path)
-    return dbgym_cfg
+    assert dbgym_config_path == Path("env/tests/gymlib_integtest_dbgym_config.yaml")
+    dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path)
+    dbgym_workspace = DBGymWorkspace(dbgym_workspace_path)
+    return dbgym_workspace


-def fully_resolve_path(dbgym_cfg: DBGymConfig, inputpath: os.PathLike[str]) -> Path:
+def fully_resolve_path(inputpath: os.PathLike[str]) -> Path:
     """
     Fully resolve any path to a real, absolute path.
@@ -320,12 +433,12 @@ def fully_resolve_path(dbgym_cfg: DBGymConfig, inputpath: os.PathLike[str]) -> Path:
     realabspath = Path(inputpath)
     # `expanduser()` is always "ok" to call first.
     realabspath = realabspath.expanduser()
-    # The reason we don't call Path.absolute() is because the path should be relative to dbgym_cfg.dbgym_repo_path,
+    # The reason we don't call Path.absolute() is because the path should be relative to get_base_dbgym_repo_dpath(),
     # which is not necessarily where cwd() points at the time of calling this function.
     if not realabspath.is_absolute():
-        realabspath = dbgym_cfg.dbgym_repo_path / realabspath
+        realabspath = get_base_dbgym_repo_dpath() / realabspath
     # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks.
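+    # For instance (hypothetical paths): "~/ws/symlinks/repo.link/../data" becomes
+    # "/home/user/ws/task_runs/run_1/data", assuming repo.link points into that run;
+    # after resolve(), neither the ".." nor the symlink remains.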
-    # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does it this
+    # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does these together this
     # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs)
     realabspath = realabspath.resolve()
     assert is_fully_resolved(
@@ -334,7 +447,15 @@
     return realabspath


-def is_base_git_dir(cwd: str) -> bool:
+def get_base_dbgym_repo_dpath() -> Path:
+    path = Path(os.getcwd())
+    assert _is_base_dbgym_repo_dpath(
+        path
+    ), "This script should be invoked from the root of the dbgym repo."
+    return path
+
+
+def _is_base_dbgym_repo_dpath(path: Path) -> bool:
     """
     Returns whether we are in the base directory of some git repository
     """
@@ -342,16 +463,21 @@
         git_toplevel = subprocess.check_output(
             ["git", "rev-parse", "--show-toplevel"], encoding="utf-8"
         ).strip()
-        return git_toplevel == cwd
-    except subprocess.CalledProcessError as e:
-        # this means we are not in _any_ git repo
+        return Path(git_toplevel) == path
+    except subprocess.CalledProcessError:
+        # This means we are not in _any_ git repo
        return False
+    except Exception:
+        raise  # A bare re-raise preserves the original traceback.


 def is_fully_resolved(path: Path) -> bool:
     """
     Checks if a path is fully resolved (exists, is absolute, and contains no symlinks in its entire ancestry).
+    The reason we check for existence is because that's the only way we know that there are no symlinks in its entire ancestry.
+    If we didn't check for existence, we could later create a new symlink in the path's ancestry.
+    Even if a path exists, is absolute, and is not itself a symlink, it could still contain symlinks in its parent directories.
     For example:
         /home/user/          # Real directory
@@ -379,19 +505,6 @@
     return str(resolved_path) == str(path)


-def path_exists_dont_follow_symlinks(path: Path) -> bool:
-    """
-    As of writing this comment, ray is currently constraining us to python <3.12. However, the "follow_symlinks" option in
-    Path.exists() only comes up in python 3.12. Thus, this is the only way to check if a path exists without following symlinks.
-    """
-    # If the path exists and is a symlink, os.path.islink() will be true (even if the symlink is broken)
-    if os.path.islink(path):
-        return True
-    # Otherwise, we know it's either non-existent or not a symlink, so path.exists() works fine
-    else:
-        return path.exists()
-
-
 def parent_dpath_of_path(dpath: Path) -> Path:
     """
     This function only calls Path.parent, but in a safer way.
@@ -436,23 +549,20 @@ def is_child_path(child_path: os.PathLike[str], parent_dpath: os.PathLike[str]) -> bool:
     )


-def open_and_save(dbgym_cfg: DBGymConfig, open_fpath: Path, mode: str = "r") -> IO[Any]:
+def open_and_save(
+    dbgym_workspace: DBGymWorkspace, open_fpath: Path, mode: str = "r"
+) -> IO[Any]:
     """
     Open a file and "save" it to [workspace]/task_runs/run_*/.
     It takes in a str | Path to match the interface of open().
     This function does not work if open_fpath is a symlink, to make its interface identical to that of open().
     Make sure to resolve all symlinks with fully_resolve_path().
     To avoid confusion, I'm enforcing this function to only work with absolute paths.
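+    Example usage (a sketch; assumes a constructed DBGymWorkspace and an existing file):
+        cfg_fpath = fully_resolve_path("dbgym_config.yaml")
+        with open_and_save(dbgym_workspace, cfg_fpath) as f:
+            contents = f.read()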
+    # TODO: maybe make it work on non-fully-resolved paths to better match open()
     See the comment of save_file() for what "saving" means

     If you are generating a "result" for the run, _do not_ use this. Just use the normal open().
     This shouldn't be too hard to remember because this function crashes if open_fpath doesn't exist,
     and when you write results you're usually opening open_fpaths which do not exist.
-
-    **Notable Behavior**
-    - If you open the same "config" file twice in the same run, it'll only be saved the first time (even if the file has changed in between).
-    - "Dependency" files should be immutable so there's no problem here.
-    - If you open two "config" files of the same name but different paths, only the first open will be saved.
-    - Opening two "dependency" files of the same name but different paths will lead to two different "base dirs" being symlinked.
     """
     # validate open_fpath
     assert isinstance(open_fpath, Path)
@@ -468,14 +578,14 @@
     assert os.path.isfile(open_fpath), f"open_fpath ({open_fpath}) is not a file"

     # save
-    save_file(dbgym_cfg, open_fpath)
+    save_file(dbgym_workspace, open_fpath)

     # open
     return open(open_fpath, mode=mode)


 def extract_from_task_run_fordpath(
-    dbgym_cfg: DBGymConfig, task_run_fordpath: Path
+    dbgym_workspace: DBGymWorkspace, task_run_fordpath: Path
 ) -> tuple[Path, str, Path, str]:
     """
     The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want.
@@ -486,19 +596,19 @@ def extract_from_task_run_fordpath(
     parent_dpath = task_run_fordpath.parent
     # TODO(phw2): make this a common function
     assert not parent_dpath.samefile(
-        dbgym_cfg.dbgym_runs_path
-    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_cfg.dbgym_runs_path ({dbgym_cfg.dbgym_runs_path})"
+        dbgym_workspace.dbgym_runs_path
+    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})"
     assert not parent_dpath_of_path(parent_dpath).samefile(
-        dbgym_cfg.dbgym_runs_path
-    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})"
+        dbgym_workspace.dbgym_runs_path
+    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})"
     assert not parent_dpath_of_path(parent_dpath_of_path(parent_dpath)).samefile(
-        dbgym_cfg.dbgym_runs_path
-    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})"
+        dbgym_workspace.dbgym_runs_path
+    ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})"
     # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in
     org_dpath = parent_dpath
     while not parent_dpath_of_path(
         parent_dpath_of_path(parent_dpath_of_path(org_dpath))
-    ).samefile(dbgym_cfg.dbgym_runs_path):
+    ).samefile(dbgym_workspace.dbgym_runs_path):
         org_dpath = parent_dpath_of_path(org_dpath)
     org_dname = basename_of_path(org_dpath)
     codebase_dpath = parent_dpath_of_path(org_dpath)
@@ -507,9 +617,8 @@ def extract_from_task_run_fordpath(
     return codebase_dpath, codebase_dname, org_dpath, org_dname


-# TODO(phw2): really look at the clean PR to see what it changed
-# TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save()
-def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None:
+# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
+def save_file(dbgym_workspace: DBGymWorkspace, fpath: Path) -> None:
     """
     If an external function takes in a file/directory as input, you will not be able to call open_and_save().
     In these situations, just call save_file().
@@ -520,24 +629,24 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None:
     In these cases we create a symlink so we have full provenance for how the dependency was created
     """
     # validate fpath
-    assert isinstance(fpath, Path)
-    assert not os.path.islink(fpath), f"fpath ({fpath}) should not be a symlink"
-    assert os.path.exists(fpath), f"fpath ({fpath}) does not exist"
-    assert os.path.isfile(fpath), f"fpath ({fpath}) is not a file"
+    assert is_fully_resolved(fpath), f"fpath ({fpath}) should be a fully resolved path"
+    assert os.path.isfile(fpath), f"fpath ({fpath}) should be a file"
     assert not is_child_path(
-        fpath, dbgym_cfg.dbgym_this_run_path
-    ), f"fpath ({fpath}) was generated in this task run ({dbgym_cfg.dbgym_this_run_path}). You do not need to save it"
+        fpath, dbgym_workspace.dbgym_this_run_path
+    ), f"fpath ({fpath}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it"

     # save _something_ to dbgym_this_run_path
     # save a symlink if the opened file was generated by a run. this is for two reasons:
     # 1. files or dirs generated by a run are supposed to be immutable so saving a symlink is safe
     # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them
-    if is_child_path(fpath, dbgym_cfg.dbgym_runs_path):
+    if is_child_path(fpath, dbgym_workspace.dbgym_runs_path):
         # get paths we'll need later.
         _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath(
-            dbgym_cfg, fpath
+            dbgym_workspace, fpath
+        )
+        this_run_save_dpath = (
+            dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname
         )
-        this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / codebase_dname / org_dname
         os.makedirs(this_run_save_dpath, exist_ok=True)

         # if the fpath file is directly in org_dpath, we symlink the file directly
@@ -562,7 +671,7 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None:
     # if it wasn't generated by a run
     else:
         # since we don't know where the file is at all, the location is "unknown" and the org is "all"
-        this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / "unknown" / "all"
+        this_run_save_dpath = dbgym_workspace.dbgym_this_run_path / "unknown" / "all"
         os.makedirs(this_run_save_dpath, exist_ok=True)
         fname = basename_of_path(fpath)
         # in this case, we want to copy instead of symlinking since it might disappear in the future
@@ -570,14 +679,14 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> None:
         shutil.copy(fpath, copy_fpath)


-# TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead
+# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
 def link_result(
-    dbgym_cfg: DBGymConfig,
+    dbgym_workspace: DBGymWorkspace,
     result_fordpath: Path,
     custom_result_name: Optional[str] = None,
 ) -> Path:
     """
-    result_fordpath must be a "result", meaning it was generated inside dbgym_cfg.dbgym_this_run_path.
+    result_fordpath must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
     Further, result_fordpath must have been generated by this invocation to task.py. This also means that
     result_fordpath itself can be a file or a dir but not a symlink.
     Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside
@@ -590,8 +699,7 @@ def link_result(
     assert is_fully_resolved(
         result_fordpath
     ), f"result_fordpath ({result_fordpath}) should be a fully resolved path"
-    result_fordpath = fully_resolve_path(dbgym_cfg, result_fordpath)
-    assert is_child_path(result_fordpath, dbgym_cfg.dbgym_this_run_path)
+    assert is_child_path(result_fordpath, dbgym_workspace.dbgym_this_run_path)
     assert not os.path.islink(result_fordpath)

     if type(custom_result_name) is str:
@@ -606,13 +714,15 @@ def link_result(
     # Figure out the parent directory path of the symlink
     codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath(
-        dbgym_cfg, result_fordpath
+        dbgym_workspace, result_fordpath
     )
     # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path()
     assert codebase_dpath.samefile(
-        dbgym_cfg.cur_task_runs_path()
+        dbgym_workspace.cur_task_runs_path()
     ), f"link_result should only be called on files generated by this invocation to task.py"
-    symlink_parent_dpath = dbgym_cfg.dbgym_symlinks_path / codebase_dname / org_dname
+    symlink_parent_dpath = (
+        dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname
+    )
     symlink_parent_dpath.mkdir(parents=True, exist_ok=True)

     # Remove the old symlink ("old" meaning created in an earlier run) if there is one
@@ -654,6 +764,7 @@ def try_remove_file(path: Path) -> None:
         pass


+# TODO: move this stuff to shell.py
 def restart_ray(redis_port: int) -> None:
     """
     Stop and start Ray.
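+
+    A minimal sketch of the intended effect (assumed commands; `ray stop` and
+    `ray start --head --port=<port>` are standard Ray CLI invocations, not taken
+    from this diff):
+        subprocess_run("ray stop")
+        subprocess_run(f"ray start --head --port={redis_port}")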