diff --git a/.github/workflows/branch-docs.yml b/.github/workflows/branch-docs.yml index b197c02bd4..4b23a68b4d 100644 --- a/.github/workflows/branch-docs.yml +++ b/.github/workflows/branch-docs.yml @@ -1,13 +1,15 @@ name: ActivitySim Branch Docs # This workflow is provided as a service for forks to build branch-specific documentation. -on: push +on: + - push + - workflow_dispatch jobs: docbuild: if: "contains(github.event.head_commit.message, '[makedocs]') && (github.repository_owner != 'ActivitySim') && (github.ref_name != 'develop')" # develop branch docs are built at the end of the core test workflow, regardless of repository owner or commit message flags - name: ubuntu-latest py3.9 + name: ubuntu-latest py3.10 runs-on: ubuntu-latest defaults: run: @@ -16,24 +18,35 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 # get all tags, lets setuptools_scm do its thing - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install dependencies + + - name: Setup Mambaforge uses: conda-incubator/setup-miniconda@v2 with: miniforge-variant: Mambaforge miniforge-version: latest use-mamba: true - environment-file: conda-environments/docbuild.yml - python-version: 3.9 + python-version: "3.10" activate-environment: docbuild auto-activate-base: false auto-update-conda: false + + - name: Set cache date for year and month + run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV + + - uses: actions/cache@v3 + with: + path: /usr/share/miniconda3/envs/docbuild + key: linux-64-conda-${{ hashFiles('conda-environments/docbuild.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} + id: cache + + - name: Update environment + run: mamba env update -n docbuild -f conda-environments/docbuild.yml + if: steps.cache.outputs.cache-hit != 'true' + - name: Install activitysim run: | python -m pip install . 
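# Illustrative sketch (not part of the patch): the caching steps added above — and
# reused throughout core_tests.yml below — key the conda environment on a hash of the
# environment file, the current year+month (DATE), and a manually bumped CACHE_NUMBER,
# and only run "mamba env update" on a cache miss. Roughly, the key is composed like
# this (hypothetical helper, shown for illustration only):
import datetime
import hashlib
from pathlib import Path


def conda_cache_key(label: str, env_file: str, cache_number: int) -> str:
    # analogue of hashFiles(): any edit to the environment file changes the key
    env_hash = hashlib.sha256(Path(env_file).read_bytes()).hexdigest()
    # analogue of the DATE env var: the cache naturally rolls over each month
    date = datetime.date.today().strftime("%Y%m")
    # bumping CACHE_NUMBER in the workflow forces a rebuild without touching the file
    return f"{label}-conda-{env_hash}-{date}-{cache_number}"


# e.g. conda_cache_key("linux-64", "conda-environments/docbuild.yml", 0)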
+ - name: Conda checkup run: | conda info -a @@ -41,11 +54,13 @@ jobs: echo REPOSITORY ${{ github.repository }} echo REF ${{ github.ref }} echo REF_NAME ${{ github.ref_name }} + - name: Build the docs run: | cd docs make clean make html + - name: Push to GitHub Pages uses: peaceiris/actions-gh-pages@v3.8.0 with: diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml index 0cd77dff4d..a72406dd23 100644 --- a/.github/workflows/core_tests.yml +++ b/.github/workflows/core_tests.yml @@ -10,14 +10,14 @@ on: - '*' env: - CACHE_NUMBER: 1 # increase to reset cache manually + CACHE_NUMBER: 0 # increase to reset cache manually jobs: foundation: strategy: matrix: - python-version: [3.9] + python-version: ["3.10"] defaults: run: shell: bash -l {0} @@ -38,7 +38,7 @@ jobs: - name: Set cache date for year and month run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV - - uses: actions/cache@v2 + - uses: actions/cache@v3 with: path: /usr/share/miniconda3/envs/asim-test key: linux-64-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} @@ -81,6 +81,10 @@ jobs: run: | python -m pytest --pyargs activitysim.cli + - name: Test activitysim.examples.test + run: | + python -m pytest --pyargs activitysim.examples.test + cross-platform: # also test foundation cross platforms, but do not require a successful @@ -92,12 +96,12 @@ jobs: - os: macos-latest label: macOS prefix: /Users/runner/miniconda3/envs/asim-test - python-version: 3.9 + python-version: "3.10" - os: windows-latest label: win-64 prefix: C:\Miniconda3\envs\asim-test - python-version: 3.9 + python-version: "3.10" defaults: run: @@ -120,7 +124,7 @@ jobs: - name: Set cache date for year and month run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV - - uses: actions/cache@v2 + - uses: actions/cache@v3 with: path: ${{ matrix.prefix }} key: ${{ matrix.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} @@ -164,11 +168,11 @@ jobs: python -m pytest --pyargs activitysim.cli - regional_models: + builtin_regional_models: needs: foundation env: mamba-env-prefix: /usr/share/miniconda3/envs/asim-test - python-version: 3.9 + python-version: "3.10" label: linux-64 strategy: matrix: @@ -204,7 +208,7 @@ jobs: - name: Set cache date for year and month run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV - - uses: actions/cache@v2 + - uses: actions/cache@v3 with: path: ${{ env.mamba-env-prefix }} key: ${{ env.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} @@ -235,7 +239,7 @@ jobs: # - name: Get a random number # run: echo "RANDOM_SUFFIX=${RANDOM}${RANDOM}" >> $GITHUB_ENV # - # - uses: actions/cache@v2 + # - uses: actions/cache@v3 # # store the regional model's cache directory in github actions cache # # this will (almost) never hit on primary key due to the random number # # but will pull the most recent cache from restore-keys... 
and then @@ -253,11 +257,71 @@ jobs: run: | python -m pytest activitysim/examples/${{ matrix.region }}/test --durations=0 + external_regional_models: + needs: foundation + env: + mamba-env-prefix: /usr/share/miniconda3/envs/asim-test + python-version: "3.10" + label: linux-64 + strategy: + matrix: + region: + - prototype_mtc + - prototype_psrc_in_development + fail-fast: false + defaults: + run: + shell: bash -l {0} + name: ${{ matrix.region }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Setup Mambaforge + uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: asim-test + use-mamba: true + python-version: ${{ env.python-version }} + + - name: Set cache date for year and month + run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV + + - uses: actions/cache@v3 + with: + path: | + ${{ env.mamba-env-prefix }} + ~/.cache/ActivitySim + key: ${{ env.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} + id: cache + + - name: Update environment + run: mamba env update -n asim-test -f conda-environments/github-actions-tests.yml + if: steps.cache.outputs.cache-hit != 'true' + + - name: Install activitysim + # installing without dependencies is faster, we trust that all needed dependencies + # are in the conda environment defined above. Also, this avoids pip getting + # confused and reinstalling tables (pytables). + run: | + python -m pip install -e . --no-deps + + - name: Conda checkup + run: | + mamba info -a + mamba list + + - name: Test ${{ matrix.region }} + run: | + python -m activitysim test ${{ matrix.region }} + random_seed_generation: needs: foundation env: mamba-env-prefix: /usr/share/miniconda3/envs/asim-test - python-version: 3.9 + python-version: "3.10" label: linux-64 defaults: run: @@ -280,7 +344,7 @@ jobs: - name: Set cache date for year and month run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV - - uses: actions/cache@v2 + - uses: actions/cache@v3 with: path: ${{ env.mamba-env-prefix }} key: ${{ env.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} @@ -310,7 +374,7 @@ jobs: needs: foundation env: mamba-env-prefix: /usr/share/miniconda3/envs/asim-test - python-version: 3.9 + python-version: "3.10" label: linux-64 defaults: run: @@ -332,7 +396,7 @@ jobs: - name: Set cache date for year and month run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV - - uses: actions/cache@v2 + - uses: actions/cache@v3 with: path: ${{ env.mamba-env-prefix }} key: ${{ env.label }}-conda-${{ hashFiles('conda-environments/github-actions-tests.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }} @@ -343,7 +407,7 @@ jobs: if: steps.cache.outputs.cache-hit != 'true' - name: Install Larch - run: mamba install "larch>=5.5.3" + run: mamba install "larch>=5.7.1" - name: Install activitysim # installing without dependencies is faster, we trust that all needed dependencies @@ -373,10 +437,10 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 # get all tags, lets setuptools_scm do its thing - - name: Set up Python 3.9 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: "3.10" - name: Install dependencies uses: conda-incubator/setup-miniconda@v2 with: @@ -384,7 +448,7 @@ jobs: miniforge-version: latest use-mamba: true environment-file: conda-environments/docbuild.yml - python-version: 3.9 + python-version: "3.10" 
activate-environment: docbuild auto-activate-base: false auto-update-conda: false diff --git a/.gitignore b/.gitignore index b1c67af876..f92dc6b1d7 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,7 @@ _test_est **/output/ **/_generated_version.py docs/**/_generated +activitysim/examples/prototype_mtc_extended/test/*.ipynb +activitysim/examples/prototype_mtc/*.ipynb +Untitled.ipynb +docs/dev-guide/_generated2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3a59ed1529..41fecfab5a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,14 +1,14 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v4.4.0 hooks: - id: end-of-file-fixer exclude: .*\.ipynb - id: trailing-whitespace - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort args: ["--profile", "black", "--filter-files"] @@ -18,7 +18,7 @@ repos: hooks: - id: black -- repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 +#- repo: https://github.com/PyCQA/flake8 +# rev: 5.0.4 +# hooks: +# - id: flake8 diff --git a/AAA-BreakingChanges.md b/AAA-BreakingChanges.md new file mode 100644 index 0000000000..feae1ae508 --- /dev/null +++ b/AAA-BreakingChanges.md @@ -0,0 +1,3 @@ + + +- The 'run_list' key in settings.yaml is no longer supported. diff --git a/activitysim/abm/__init__.py b/activitysim/abm/__init__.py index eb5a299611..d877297e89 100644 --- a/activitysim/abm/__init__.py +++ b/activitysim/abm/__init__.py @@ -1,3 +1,4 @@ # ActivitySim # See full license in LICENSE.txt. -from . import misc, models, tables +from activitysim.abm import misc, models, tables +from activitysim.core.steps import output # also contains workflow.step # noqa: F401 diff --git a/activitysim/abm/misc.py b/activitysim/abm/misc.py index 528c8db937..858c7358b8 100644 --- a/activitysim/abm/misc.py +++ b/activitysim/abm/misc.py @@ -1,10 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations
+
 import logging
 
 import pandas as pd
 
-from activitysim.core import config, inject
+from activitysim.core import workflow
 
 # FIXME
 # warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning)
@@ -13,23 +15,27 @@
 logger = logging.getLogger(__name__)
 
 
-@inject.injectable(cache=True)
-def households_sample_size(settings, override_hh_ids):
+@workflow.cached_object
+def households_sample_size(state: workflow.State, override_hh_ids):
     if override_hh_ids is None:
-        return settings.get("households_sample_size", 0)
+        return state.settings.households_sample_size
     else:
         return 0 if override_hh_ids is None else len(override_hh_ids)
 
 
-@inject.injectable(cache=True)
-def override_hh_ids(settings):
+@workflow.cached_object
+def override_hh_ids(state: workflow.State):
 
-    hh_ids_filename = settings.get("hh_ids", None)
+    hh_ids_filename = state.settings.hh_ids
 
     if hh_ids_filename is None:
         return None
 
-    file_path = config.data_file_path(hh_ids_filename, mandatory=False)
+    file_path = state.filesystem.get_data_file_path(hh_ids_filename, mandatory=False)
+    if not file_path:
+        file_path = state.filesystem.get_config_file_path(
+            hh_ids_filename, mandatory=False
+        )
     if not file_path:
         logger.error(
             "hh_ids file name '%s' specified in settings not found" % hh_ids_filename
         )
@@ -56,41 +62,31 @@ def override_hh_ids(settings):
     return household_ids
 
 
-@inject.injectable(cache=True)
-def trace_hh_id(settings):
-
-    id = settings.get("trace_hh_id", None)
-
-    if id and not isinstance(id, int):
-        logger.warning(
-            "setting trace_hh_id is wrong type, should be an int, but was %s" % type(id)
-        )
-        id = None
-
-    return id
-
+@workflow.cached_object
+def trace_od(state: workflow.State):
 
-@inject.injectable(cache=True)
-def trace_od(settings):
-
-    od = settings.get("trace_od", None)
+    od = state.settings.trace_od
 
     if od and not (
-        isinstance(od, list) and len(od) == 2 and all(isinstance(x, int) for x in od)
+        isinstance(od, (list, tuple))
+        and len(od) == 2
+        and all(isinstance(x, int) for x in od)
     ):
-        logger.warning("setting trace_od should be a list of length 2, but was %s" % od)
+        logger.warning(
+            "setting trace_od should be a list or tuple of length 2, but was %s" % od
+        )
         od = None
 
     return od
 
 
-@inject.injectable(cache=True)
-def chunk_size(settings):
-    _chunk_size = int(settings.get("chunk_size", 0) or 0)
+@workflow.cached_object
+def chunk_size(state: workflow.State):
+    _chunk_size = int(state.settings.chunk_size or 0)
 
     return _chunk_size
 
 
-@inject.injectable(cache=True)
-def check_for_variability(settings):
-    return bool(settings.get("check_for_variability", False))
+@workflow.cached_object
+def check_for_variability(state: workflow.State):
+    return bool(state.settings.check_for_variability)
diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py
index edd928e5c6..dc5dc82832 100644
--- a/activitysim/abm/models/accessibility.py
+++ b/activitysim/abm/models/accessibility.py
@@ -1,26 +1,41 @@
 # ActivitySim
 # See full license in LICENSE.txt.
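# Illustrative sketch (not part of the patch): the misc.py hunks above show one of the
# recurring patterns in this refactor — orca-style cached injectables become plain
# functions decorated with @workflow.cached_object that receive the State explicitly
# and read typed settings from state.settings instead of calling settings.get().
# A minimal example in the new style (mirroring chunk_size above, only to make the
# pattern explicit):
from activitysim.core import workflow


@workflow.cached_object
def chunk_size(state: workflow.State) -> int:
    # replaces settings.get("chunk_size", 0) under @inject.injectable(cache=True);
    # the value is computed once and then cached on the State object
    return int(state.settings.chunk_size or 0)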
+from __future__ import annotations + import logging +import numba as nb import numpy as np import pandas as pd -from activitysim.core import assign, chunk, config, inject, los, mem, pipeline, tracing -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.core import assign, chunk, los, workflow logger = logging.getLogger(__name__) +@nb.njit +def _accumulate_accessibility(arr, orig_zone_count, dest_zone_count): + assert arr.size == orig_zone_count * dest_zone_count + arr2 = arr.reshape((orig_zone_count, dest_zone_count)) + result = np.empty((orig_zone_count,), dtype=arr.dtype) + for o in range(orig_zone_count): + x = 0 + for d in range(dest_zone_count): + x += arr2[o, d] + result[o] = np.log1p(x) + return result + + def compute_accessibilities_for_zones( + state, accessibility_df, land_use_df, assignment_spec, constants, network_los, - trace_od, trace_label, + chunk_sizer, ): - orig_zones = accessibility_df.index.values dest_zones = land_use_df.index.values @@ -33,23 +48,29 @@ def compute_accessibilities_for_zones( ) # create OD dataframe - od_df = pd.DataFrame( - data={ - "orig": np.repeat(orig_zones, dest_zone_count), - "dest": np.tile(dest_zones, orig_zone_count), - } - ) - + od_data = { + "orig": np.repeat(orig_zones, dest_zone_count), + "dest": np.tile(dest_zones, orig_zone_count), + } + # previously, the land use was added to the dataframe via pd.merge + # but the merge is expensive and unnecessary as we can just tile. + logger.debug(f"{trace_label}: tiling land_use_columns into od_data") + for c in land_use_df.columns: + od_data[c] = np.tile(land_use_df[c].to_numpy(), orig_zone_count) + logger.debug(f"{trace_label}: converting od_data to DataFrame") + od_df = pd.DataFrame(od_data) + logger.debug(f"{trace_label}: dropping od_data") + del od_data + logger.debug(f"{trace_label}: dropping od_data complete") + + trace_od = state.settings.trace_od if trace_od: trace_orig, trace_dest = trace_od trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest) else: trace_od_rows = None - # merge land_use_columns into od_df - logger.info(f"{trace_label}: merge land_use_columns into od_df") - od_df = pd.merge(od_df, land_use_df, left_on="dest", right_index=True).sort_index() - chunk.log_df(trace_label, "od_df", od_df) + chunk_sizer.log_df(trace_label, "od_df", od_df) locals_d = { "log": np.log, @@ -59,6 +80,7 @@ def compute_accessibilities_for_zones( locals_d.update(constants) skim_dict = network_los.get_default_skim_dict() + # FIXME: because od_df is so huge, next two lines use a fair bit of memory locals_d["skim_od"] = skim_dict.wrap("orig", "dest").set_df(od_df) locals_d["skim_do"] = skim_dict.wrap("dest", "orig").set_df(od_df) @@ -67,36 +89,40 @@ def compute_accessibilities_for_zones( logger.info(f"{trace_label}: assign.assign_variables") results, trace_results, trace_assigned_locals = assign.assign_variables( + state, assignment_spec, od_df, locals_d, trace_rows=trace_od_rows, trace_label=trace_label, - chunk_log=True, + chunk_log=chunk_sizer, ) - chunk.log_df(trace_label, "results", results) + chunk_sizer.log_df(trace_label, "results", results) logger.info(f"{trace_label}: have results") # accessibility_df = accessibility_df.copy() + accessibility_new_columns = {} for column in results.columns: - data = np.asanyarray(results[column]) - data.shape = (orig_zone_count, dest_zone_count) # (o,d) - accessibility_df[column] = np.log(np.sum(data, axis=1) + 1) + logger.info(f"{trace_label}: aggregating column {column}") + accessibility_new_columns[column] = 
_accumulate_accessibility( + results[column].to_numpy(), orig_zone_count, dest_zone_count + ) + logger.info(f"{trace_label}: completed aggregating") + accessibility_df = accessibility_df.assign(**accessibility_new_columns) + logger.info(f"{trace_label}: completed aggregating info df") if trace_od: - if not trace_od_rows.any(): logger.warning( f"trace_od not found origin = {trace_orig}, dest = {trace_dest}" ) else: - # add OD columns to trace results df = pd.concat([od_df[trace_od_rows], trace_results], axis=1) # dump the trace results table (with _temp variables) to aid debugging - tracing.trace_df( + state.tracing.trace_df( df, label="accessibility", index_label="skim_offset", @@ -105,16 +131,20 @@ def compute_accessibilities_for_zones( ) if trace_assigned_locals: - tracing.write_csv( + state.tracing.write_csv( trace_assigned_locals, file_name="accessibility_locals" ) return accessibility_df -@inject.step() -def compute_accessibility(land_use, accessibility, network_los, chunk_size, trace_od): - +@workflow.step +def compute_accessibility( + state: workflow.State, + land_use: pd.DataFrame, + accessibility: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """ Compute accessibility for each zone in land use file using expressions from accessibility_spec @@ -132,23 +162,23 @@ def compute_accessibility(land_use, accessibility, network_los, chunk_size, trac """ trace_label = "compute_accessibility" - model_settings = config.read_model_settings("accessibility.yaml") + model_settings = state.filesystem.read_model_settings("accessibility.yaml") assignment_spec = assign.read_assignment_spec( - config.config_file_path("accessibility.csv") + state.filesystem.get_config_file_path("accessibility.csv") ) - accessibility_df = accessibility.to_frame() + accessibility_df = accessibility if len(accessibility_df.columns) > 0: logger.warning( f"accessibility table is not empty. 
Columns:{list(accessibility_df.columns)}" ) - raise RuntimeError(f"accessibility table is not empty.") + raise RuntimeError("accessibility table is not empty.") - constants = config.get_model_constants(model_settings) + constants = model_settings.get("CONSTANTS", {}) # only include the land_use columns needed by spec, as specified by land_use_columns model_setting land_use_columns = model_settings.get("land_use_columns", []) - land_use_df = land_use.to_frame() + land_use_df = land_use land_use_df = land_use_df[land_use_columns] logger.info( @@ -157,18 +187,22 @@ def compute_accessibility(land_use, accessibility, network_los, chunk_size, trac accessibilities_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - accessibility_df, chunk_size, trace_label - ): + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, accessibility_df, trace_label): accessibilities = compute_accessibilities_for_zones( + state, chooser_chunk, land_use_df, assignment_spec, constants, network_los, - trace_od, trace_label, + chunk_sizer, ) accessibilities_list.append(accessibilities) @@ -177,4 +211,4 @@ def compute_accessibility(land_use, accessibility, network_los, chunk_size, trac logger.info(f"{trace_label} computed accessibilities {accessibility_df.shape}") # - write table to pipeline - pipeline.replace_table("accessibility", accessibility_df) + state.add_table("accessibility", accessibility_df) diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index 1b69cde775..e545e2b34d 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -1,28 +1,29 @@ # ActivitySim # See full license in LICENSE.txt. 
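# Illustrative sketch (not part of the patch): the accessibility.py changes above swap
# two pandas-heavy steps for plain array operations. Assuming the destination-fastest OD
# ordering built in compute_accessibilities_for_zones, np.tile of each land-use column
# reproduces what the old pd.merge on "dest" supplied, and the numba helper
# _accumulate_accessibility is just log1p of the per-origin destination sums:
import numpy as np
import pandas as pd

orig_zones = np.array([1, 2, 3])
dest_zones = np.array([10, 20])
land_use = pd.DataFrame({"emp": [5.0, 7.0]}, index=dest_zones)

od = pd.DataFrame(
    {
        "orig": np.repeat(orig_zones, len(dest_zones)),  # 1, 1, 2, 2, 3, 3
        "dest": np.tile(dest_zones, len(orig_zones)),  # 10, 20, 10, 20, 10, 20
    }
)
# tiling the land-use column gives the per-row "dest" attributes without a merge
od["emp"] = np.tile(land_use["emp"].to_numpy(), len(orig_zones))

# _accumulate_accessibility(results, n_orig, n_dest) computes, per origin zone:
results = np.arange(6, dtype=float)  # one assigned-expression value per (orig, dest) pair
agg = np.log1p(results.reshape(len(orig_zones), len(dest_zones)).sum(axis=1))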
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, inject, pipeline, simulate, tracing -from activitysim.core.interaction_sample import interaction_sample -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.abm.models.util import tour_destination +from activitysim.core import config, estimation, los, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation, tour_destination - logger = logging.getLogger(__name__) DUMP = False -@inject.step() +@workflow.step def atwork_subtour_destination( - tours, persons_merged, network_los, chunk_size, trace_hh_id -): - + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: trace_label = "atwork_subtour_destination" model_settings_file_name = "atwork_subtour_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) future_settings = { "SIZE_TERM_SELECTOR": "atwork", @@ -39,13 +40,9 @@ def atwork_subtour_destination( sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - persons_merged = persons_merged.to_frame() - - tours = tours.to_frame() subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours @@ -53,21 +50,20 @@ def atwork_subtour_destination( tracing.no_results("atwork_subtour_destination") return - estimator = estimation.manager.begin_estimation("atwork_subtour_destination") + estimator = estimation.manager.begin_estimation(state, "atwork_subtour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + state.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(state.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( + state, subtours, persons_merged, want_logsums, @@ -75,8 +71,6 @@ def atwork_subtour_destination( model_settings, network_los, estimator, - chunk_size, - trace_hh_id, trace_label, ) @@ -95,7 +89,7 @@ def atwork_subtour_destination( subtours[logsum_column_name] = choices_df["logsum"] assign_in_place(tours, subtours[[logsum_column_name]]) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) tracing.print_summary( destination_column_name, subtours[destination_column_name], describe=True @@ -104,9 +98,9 @@ def atwork_subtour_destination( if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + 
state.tracing.trace_df( tours, label="atwork_subtour_destination", columns=["destination"] ) diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index d42b97fdc3..7bbee371f4 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -1,26 +1,37 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.tour_frequency import process_atwork_subtours +from activitysim.abm.models.util.tour_frequency import process_atwork_subtours +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger(__name__) -def add_null_results(trace_label, tours): +def add_null_results(state, trace_label, tours): logger.info("Skipping %s: add_null_results", trace_label) tours["atwork_subtour_frequency"] = np.nan - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) -@inject.step() -def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): +@workflow.step +def atwork_subtour_frequency( + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, +) -> None: """ This model predicts the frequency of making at-work subtour tours (alternatives for this model come from a separate csv file which is @@ -29,28 +40,28 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): trace_label = "atwork_subtour_frequency" model_settings_file_name = "atwork_subtour_frequency.yaml" - - tours = tours.to_frame() + trace_hh_id = state.settings.trace_hh_id work_tours = tours[tours.tour_type == "work"] # - if no work_tours if len(work_tours) == 0: - add_null_results(trace_label, tours) + add_null_results(state, trace_label, tours) return - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("atwork_subtour_frequency") + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + estimator = estimation.manager.begin_estimation(state, "atwork_subtour_frequency") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) alternatives = simulate.read_model_alts( - "atwork_subtour_frequency_alternatives.csv", set_index="alt" + state, "atwork_subtour_frequency_alternatives.csv", set_index="alt" ) # merge persons into work_tours - persons_merged = persons_merged.to_frame() work_tours = pd.merge( work_tours, persons_merged, left_on="person_id", right_index=True ) @@ -63,9 +74,11 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - expressions.assign_columns( - df=work_tours, model_settings=preprocessor_settings, trace_label=trace_label + state, + df=work_tours, + model_settings=preprocessor_settings, + 
trace_label=trace_label, ) if estimator: @@ -75,11 +88,11 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): estimator.write_choosers(work_tours) choices = simulate.simple_simulate( + state, choosers=work_tours, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="atwork_subtour_frequency", estimator=estimator, @@ -99,22 +112,22 @@ def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): # add atwork_subtour_frequency column to tours # reindex since we are working with a subset of tours tours["atwork_subtour_frequency"] = choices.reindex(tours.index) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # - create atwork_subtours based on atwork_subtour_frequency choice names work_tours = tours[tours.tour_type == "work"] assert not work_tours.atwork_subtour_frequency.isnull().any() - subtours = process_atwork_subtours(work_tours, alternatives) + subtours = process_atwork_subtours(state, work_tours, alternatives) - tours = pipeline.extend_table("tours", subtours) + tours = state.extend_table("tours", subtours) - tracing.register_traceable_table("tours", subtours) - pipeline.get_rn_generator().add_channel("tours", subtours) + state.tracing.register_traceable_table("tours", subtours) + state.get_rn_generator().add_channel("tours", subtours) tracing.print_summary( "atwork_subtour_frequency", tours.atwork_subtour_frequency, value_counts=True ) if trace_hh_id: - tracing.trace_df(tours, label="atwork_subtour_frequency.tours") + state.tracing.trace_df(tours, label="atwork_subtour_frequency.tours") diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py b/activitysim/abm/models/atwork_subtour_mode_choice.py index bc3f1c66c1..a989572c91 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -1,45 +1,40 @@ # ActivitySim # See full license in LICENSE.txt. 
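# Illustrative sketch (not part of the patch): the model modules above and below all
# follow the same step-level refactor — @inject.step() functions that took tables plus
# chunk_size and trace_hh_id become @workflow.step functions that take an explicit
# State first, receive tables directly as DataFrames (no .to_frame()), read settings
# through state.filesystem / state.settings, and write results with state.add_table().
# A minimal hypothetical step in the new style (the yaml file and column name here are
# invented for illustration):
import pandas as pd

from activitysim.core import workflow


@workflow.step
def example_step(state: workflow.State, tours: pd.DataFrame) -> None:
    # settings now come from the filesystem service, not config.read_model_settings
    model_settings = state.filesystem.read_model_settings("example_step.yaml")
    tours["example_flag"] = model_settings.get("example_value", 0)
    # replaces pipeline.replace_table("tours", tours)
    state.add_table("tours", tours)
    if state.settings.trace_hh_id:
        state.tracing.trace_df(tours, label="example_step")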
+from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import ( - config, - expressions, - inject, - los, - pipeline, - simulate, - tracing, -) -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.abm.models.util.mode import run_tour_mode_choice_simulate +from activitysim.core import config, estimation, expressions, los, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation -from .util.mode import run_tour_mode_choice_simulate - logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def atwork_subtour_mode_choice( - tours, persons_merged, network_los, chunk_size, trace_hh_id -): + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """ At-work subtour mode choice simulate """ trace_label = "atwork_subtour_mode_choice" + trace_hh_id = state.settings.trace_hh_id + model_settings_file_name = "tour_mode_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "tour_mode" - tours = tours.to_frame() subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours @@ -49,7 +44,7 @@ def atwork_subtour_mode_choice( subtours_merged = pd.merge( subtours, - persons_merged.to_frame(), + persons_merged, left_on="person_id", right_index=True, how="left", @@ -129,7 +124,7 @@ def atwork_subtour_mode_choice( network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") ) - estimator = estimation.manager.begin_estimation("atwork_subtour_mode_choice") + estimator = estimation.manager.begin_estimation(state, "atwork_subtour_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) @@ -138,6 +133,7 @@ def atwork_subtour_mode_choice( # FIXME run_tour_mode_choice_simulate writes choosers post-annotation choices_df = run_tour_mode_choice_simulate( + state, subtours_merged, tour_purpose="atwork", model_settings=model_settings, @@ -147,28 +143,23 @@ def atwork_subtour_mode_choice( skims=skims, constants=constants, estimator=estimator, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="tour_mode_choice", ) # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_types in tvpb_mode_path_types.items(): - for direction, skim in zip( ["od", "do"], [tvpb_logsum_odt, tvpb_logsum_dot] ): - path_type = path_types[direction] skim_cache = skim.cache[path_type] print(f"mode {mode} direction {direction} path_type {path_type}") for c in skim_cache: - dest_col = f"{direction}_{c}" if dest_col not in choices_df: @@ -195,20 +186,21 @@ def atwork_subtour_mode_choice( ) assign_in_place(tours, choices_df) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # - annotate tours table if model_settings.get("annotate_tours"): - tours = inject.get_table("tours").to_frame() + tours = state.get_dataframe("tours") expressions.assign_columns( + state, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), ) - pipeline.replace_table("tours", tours) + 
state.add_table("tours", tours) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( tours[tours.tour_category == "atwork"], label=tracing.extend_trace_label(trace_label, mode_column_name), slicer="tour_id", diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index 041d899a26..f6c71351df 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -1,35 +1,40 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate +from activitysim.abm.models.util.vectorize_tour_scheduling import ( + vectorize_subtour_scheduling, +) +from activitysim.core import config, estimation, expressions, simulate from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation -from .util.vectorize_tour_scheduling import vectorize_subtour_scheduling - logger = logging.getLogger(__name__) DUMP = False -@inject.step() +@workflow.step def atwork_subtour_scheduling( - tours, persons_merged, tdd_alts, skim_dict, chunk_size, trace_hh_id -): + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + tdd_alts: pd.DataFrame, + skim_dict, +) -> None: """ This model predicts the departure time and duration of each activity for at work subtours tours """ trace_label = "atwork_subtour_scheduling" model_settings_file_name = "tour_scheduling_atwork.yaml" - - tours = tours.to_frame() + trace_hh_id = state.settings.trace_hh_id subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours @@ -37,15 +42,15 @@ def atwork_subtour_scheduling( tracing.no_results(trace_label) return - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("atwork_subtour_scheduling") + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + estimator = estimation.manager.begin_estimation(state, "atwork_subtour_scheduling") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip") - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) - - persons_merged = persons_merged.to_frame() + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) logger.info("Running %s with %d tours", trace_label, len(subtours)) @@ -56,7 +61,7 @@ def atwork_subtour_scheduling( "od_skims": od_skim_wrapper, } expressions.annotate_preprocessors( - subtours, constants, skims, model_settings, trace_label + state, subtours, constants, skims, model_settings, trace_label ) # parent_tours table with columns ['tour_id', 'tdd'] index = tour_id @@ -71,6 +76,7 @@ def atwork_subtour_scheduling( # we don't need to update timetable because subtours are scheduled inside work trip windows choices = vectorize_subtour_scheduling( + state, parent_tours, subtours, persons_merged, @@ -78,7 +84,7 @@ def atwork_subtour_scheduling( model_spec, model_settings, estimator=estimator, - 
chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=trace_label, sharrow_skip=sharrow_skip, ) @@ -96,10 +102,10 @@ def atwork_subtour_scheduling( ) assign_in_place(tours, tdd_choices) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( tours[tours.tour_category == "atwork"], label="atwork_subtour_scheduling", slicer="person_id", @@ -111,12 +117,12 @@ def atwork_subtour_scheduling( subtours = tours[tours.tour_category == "atwork"] parent_tours = tours[tours.index.isin(subtours.parent_tour_id)] - tracing.dump_df(DUMP, subtours, trace_label, "sub_tours") - tracing.dump_df(DUMP, parent_tours, trace_label, "parent_tours") + state.tracing.dump_df(DUMP, subtours, trace_label, "sub_tours") + state.tracing.dump_df(DUMP, parent_tours, trace_label, "parent_tours") parent_tours["parent_tour_id"] = parent_tours.index subtours = pd.concat([parent_tours, subtours]) - tracing.dump_df( + state.tracing.dump_df( DUMP, tt.tour_map( parent_tours, subtours, tdd_alts, persons_id_col="parent_tour_id" diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py index 564d6f94b6..ba18a09df8 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -1,34 +1,42 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -from activitysim.core import config, inject, pipeline, simulate, tracing +import pandas as pd -from .util import estimation +from activitysim.core import config, estimation, simulate, tracing, workflow logger = logging.getLogger(__name__) -@inject.step() -def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_id): +@workflow.step +def auto_ownership_simulate( + state: workflow.State, + households: pd.DataFrame, + households_merged: pd.DataFrame, +) -> None: """ Auto ownership is a standard model which predicts how many cars a household with given characteristics owns """ trace_label = "auto_ownership_simulate" model_settings_file_name = "auto_ownership.yaml" - model_settings = config.read_model_settings(model_settings_file_name) - - estimator = estimation.manager.begin_estimation("auto_ownership") - - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + trace_hh_id = state.settings.trace_hh_id + + estimator = estimation.manager.begin_estimation(state, "auto_ownership") + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) - choosers = households_merged.to_frame() + choosers = households_merged logger.info("Running %s with %d households", trace_label, len(choosers)) @@ -38,14 +46,14 @@ def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_ estimator.write_coefficients(coefficients_df, model_settings) estimator.write_choosers(choosers) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = 
simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="auto_ownership", log_alt_losers=log_alt_losers, @@ -58,16 +66,14 @@ def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_ estimator.write_override_choices(choices) estimator.end_estimation() - households = households.to_frame() - # no need to reindex as we used all households households["auto_ownership"] = choices - pipeline.replace_table("households", households) + state.add_table("households", households) tracing.print_summary( "auto_ownership", households.auto_ownership, value_counts=True ) if trace_hh_id: - tracing.trace_df(households, label="auto_ownership", warn_if_empty=True) + state.tracing.trace_df(households, label="auto_ownership", warn_if_empty=True) diff --git a/activitysim/abm/models/cdap.py b/activitysim/abm/models/cdap.py index f7da93687b..f4d148a41c 100644 --- a/activitysim/abm/models/cdap.py +++ b/activitysim/abm/models/cdap.py @@ -1,19 +1,32 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import cdap +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) from activitysim.core.util import reindex -from .util import cdap, estimation - logger = logging.getLogger(__name__) -@inject.step() -def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): +@workflow.step +def cdap_simulate( + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + households: pd.DataFrame, +) -> None: """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of high-level activity pattern for each person, in a coordinated way with other @@ -25,20 +38,21 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): """ trace_label = "cdap" - model_settings = config.read_model_settings("cdap.yaml") + model_settings = state.filesystem.read_model_settings("cdap.yaml") + trace_hh_id = state.settings.trace_hh_id person_type_map = model_settings.get("PERSON_TYPE_MAP", None) assert ( person_type_map is not None - ), f"Expected to find PERSON_TYPE_MAP setting in cdap.yaml" - estimator = estimation.manager.begin_estimation("cdap") + ), "Expected to find PERSON_TYPE_MAP setting in cdap.yaml" + estimator = estimation.manager.begin_estimation(state, "cdap") - cdap_indiv_spec = simulate.read_model_spec( + cdap_indiv_spec = state.filesystem.read_model_spec( file_name=model_settings["INDIV_AND_HHSIZE1_SPEC"] ) - coefficients_df = simulate.read_model_coefficients(model_settings) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) cdap_indiv_spec = simulate.eval_coefficients( - cdap_indiv_spec, coefficients_df, estimator + state, cdap_indiv_spec, coefficients_df, estimator ) # Rules and coefficients for generating interaction specs for different household sizes @@ -46,7 +60,8 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): "INTERACTION_COEFFICIENTS", "cdap_interaction_coefficients.csv" ) cdap_interaction_coefficients = pd.read_csv( - config.config_file_path(interaction_coefficients_file_name), comment="#" + state.filesystem.get_config_file_path(interaction_coefficients_file_name), + comment="#", ) # replace 
cdap_interaction_coefficients coefficient labels with numeric values @@ -74,7 +89,7 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): EXCEPT that the values computed are relative proportions, not utilities (i.e. values are not exponentiated before being normalized to probabilities summing to 1.0) """ - cdap_fixed_relative_proportions = simulate.read_model_spec( + cdap_fixed_relative_proportions = state.filesystem.read_model_spec( file_name=model_settings["FIXED_RELATIVE_PROPORTIONS_SPEC"] ) @@ -86,11 +101,10 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): "JOINT_TOUR_COEFFICIENTS", "cdap_joint_tour_coefficients.csv" ) cdap_joint_tour_coefficients = pd.read_csv( - config.config_file_path(joint_tour_coefficients_file_name), comment="#" + state.filesystem.get_config_file_path(joint_tour_coefficients_file_name), + comment="#", ) - persons_merged = persons_merged.to_frame() - # add tour-based chunk_id so we can chunk all trips in tour together assert "chunk_id" not in persons_merged.columns unique_household_ids = persons_merged.household_id.unique() @@ -113,24 +127,27 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): logger.info("Pre-building cdap specs") for hhsize in range(2, cdap.MAX_HHSIZE + 1): spec = cdap.build_cdap_spec( + state, cdap_interaction_coefficients, hhsize, cache=True, joint_tour_alt=add_joint_tour_utility, ) - if inject.get_injectable("locutor", False): + if state.get_injectable("locutor", False): spec.to_csv( - config.output_file_path("cdap_spec_%s.csv" % hhsize), index=True + state.get_output_file_path(f"cdap_spec_{hhsize}.csv"), index=True ) if add_joint_tour_utility: # build cdap joint tour spec # joint_spec_dependency = spec.loc[[c for c in spec.index if c.startswith(('M_p', 'N_p', 'H_p'))]] joint_spec = cdap.build_cdap_joint_spec( - cdap_joint_tour_coefficients, hhsize, cache=True + state, cdap_joint_tour_coefficients, hhsize, cache=True ) - if inject.get_injectable("locutor", False): + if state.get_injectable("locutor", False): joint_spec.to_csv( - config.output_file_path("cdap_joint_spec_%s.csv" % hhsize), + state.get_output_file_path( + f"cdap_joint_spec_{hhsize}.csv", + ), index=True, ) @@ -149,36 +166,37 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): ) estimator.write_choosers(persons_merged) for hhsize in range(2, cdap.MAX_HHSIZE + 1): - spec = cdap.get_cached_spec(hhsize) + spec = cdap.get_cached_spec(state, hhsize) estimator.write_table(spec, "spec_%s" % hhsize, append=False) logger.info("Running cdap_simulate with %d persons", len(persons_merged.index)) if add_joint_tour_utility: choices, hh_joint = cdap.run_cdap( + state, persons=persons_merged, person_type_map=person_type_map, cdap_indiv_spec=cdap_indiv_spec, cdap_interaction_coefficients=cdap_interaction_coefficients, cdap_fixed_relative_proportions=cdap_fixed_relative_proportions, locals_d=constants, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label, add_joint_tour_utility=add_joint_tour_utility, ) else: choices = cdap.run_cdap( + state, persons=persons_merged, person_type_map=person_type_map, cdap_indiv_spec=cdap_indiv_spec, cdap_interaction_coefficients=cdap_interaction_coefficients, cdap_fixed_relative_proportions=cdap_fixed_relative_proportions, locals_d=constants, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label, - 
add_joint_tour_utility=add_joint_tour_utility, ) if estimator: @@ -187,33 +205,30 @@ def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): estimator.write_override_choices(choices) estimator.end_estimation() - # - assign results to persons table and annotate - persons = persons.to_frame() - choices = choices.reindex(persons.index) persons["cdap_activity"] = choices expressions.assign_columns( + state, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) # - annotate households table - households = households.to_frame() - if add_joint_tour_utility: hh_joint = hh_joint.reindex(households.index) households["has_joint_tour"] = hh_joint expressions.assign_columns( + state, df=households, model_settings=model_settings.get("annotate_households"), trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), ) - pipeline.replace_table("households", households) + state.add_table("households", households) tracing.print_summary("cdap_activity", persons.cdap_activity, value_counts=True) logger.info( diff --git a/activitysim/abm/models/disaggregate_accessibility.py b/activitysim/abm/models/disaggregate_accessibility.py index fe79d3fcdf..856c4e1b41 100644 --- a/activitysim/abm/models/disaggregate_accessibility.py +++ b/activitysim/abm/models/disaggregate_accessibility.py @@ -1,26 +1,29 @@ +# ActivitySim +# See full license in LICENSE.txt. +from __future__ import annotations + import logging import random from functools import reduce import numpy as np import pandas as pd -from orca import orca from sklearn.cluster import KMeans from activitysim.abm.models import initialize, location_choice -from activitysim.abm.models.util import estimation, tour_destination +from activitysim.abm.models.util import tour_destination from activitysim.abm.tables import shadow_pricing -from activitysim.core import config, inject, los, pipeline, tracing, util +from activitysim.core import estimation, los, tracing, util, workflow from activitysim.core.expressions import assign_columns logger = logging.getLogger(__name__) -def read_disaggregate_accessibility_yaml(file_name): +def read_disaggregate_accessibility_yaml(state: workflow.State, file_name): """ Adds in default table suffixes 'proto_' if not defined in the settings file """ - model_settings = config.read_model_settings(file_name) + model_settings = state.filesystem.read_model_settings(file_name) if not model_settings.get("suffixes"): model_settings["suffixes"] = { "SUFFIX": "proto_", @@ -39,25 +42,26 @@ def read_disaggregate_accessibility_yaml(file_name): size = model_settings.get(sample, 0) if size > 0 and size < 1: model_settings[sample] = round( - size * len(pipeline.get_table("land_use").index) + size * len(state.get_dataframe("land_use").index) ) return model_settings class ProtoPop: - def __init__(self, network_los, chunk_size): + def __init__(self, state: workflow.State, network_los, chunk_size): + self.state = state # Run necessary inits for later - initialize.initialize_landuse() + initialize.initialize_landuse(state) # Initialization self.proto_pop = {} self.zone_list = [] - self.land_use = pipeline.get_table("land_use") + self.land_use = state.get_dataframe("land_use") self.network_los = network_los self.chunk_size = chunk_size self.model_settings = read_disaggregate_accessibility_yaml( - "disaggregate_accessibility.yaml" + state, 
"disaggregate_accessibility.yaml" ) # Random seed @@ -76,8 +80,8 @@ def __init__(self, network_los, chunk_size): self.model_settings["DESTINATION_SAMPLE_SIZE"], ) ) - self.inject_tables() - self.annotate_tables() + self.inject_tables(state) + self.annotate_tables(state) self.merge_persons() # - initialize shadow_pricing size tables after annotating household and person tables @@ -86,8 +90,8 @@ def __init__(self, network_los, chunk_size): add_size_tables = self.model_settings.get("add_size_tables", True) if add_size_tables: # warnings.warn(f"Calling add_size_tables from initialize will be removed in the future.", FutureWarning) - shadow_pricing._add_size_tables( - self.model_settings.get("suffixes"), scale=False + shadow_pricing.add_size_tables( + state, self.model_settings.get("suffixes"), scale=False ) def zone_sampler(self): @@ -164,7 +168,7 @@ def zone_sampler(self): ), "K-Means only implemented for 2-zone systems for now" # Performs a simple k-means clustering using centroid XY coordinates - centroids_df = pipeline.get_table("maz_centroids") + centroids_df = self.state.get_dataframe("maz_centroids") # Assert that land_use zone ids is subset of centroid zone ids assert set(self.land_use.index).issubset(set(centroids_df.index)) @@ -463,7 +467,9 @@ def create_proto_pop(self): if self.model_settings.get("FROM_TEMPLATES"): table_params = {k: self.params.get(k) for k in klist} tables = { - k: pd.read_csv(config.config_file_path(v.get("file"))) + k: pd.read_csv( + self.state.filesystem.get_config_file_path(v.get("file")) + ) for k, v in table_params.items() } households, persons, tours = self.expand_template_zones(tables) @@ -516,26 +522,25 @@ def create_proto_pop(self): if len(colnames) > 0: df.rename(columns=colnames, inplace=True) - def inject_tables(self): + def inject_tables(self, state: workflow.State): # Update canonical tables lists - inject.add_injectable( - "traceable_tables", - inject.get_injectable("traceable_tables") + list(self.proto_pop.keys()), + state.tracing.traceable_tables = state.tracing.traceable_tables + list( + self.proto_pop.keys() ) for tablename, df in self.proto_pop.items(): - inject.add_table(tablename, df) - pipeline.get_rn_generator().add_channel(tablename, df) - tracing.register_traceable_table(tablename, df) - # pipeline.get_rn_generator().drop_channel(tablename) + state.add_table(tablename, df) + self.state.get_rn_generator().add_channel(tablename, df) + state.tracing.register_traceable_table(tablename, df) - def annotate_tables(self): + def annotate_tables(self, state: workflow.State): # Extract annotations for annotations in self.model_settings["annotate_proto_tables"]: tablename = annotations["tablename"] - df = pipeline.get_table(tablename) + df = self.state.get_dataframe(tablename) assert df is not None assert annotations is not None assign_columns( + state, df=df, model_settings={ **annotations["annotate"], @@ -543,11 +548,11 @@ def annotate_tables(self): }, trace_label=tracing.extend_trace_label("ProtoPop.annotate", tablename), ) - pipeline.replace_table(tablename, df) + self.state.add_table(tablename, df) def merge_persons(self): - persons = pipeline.get_table("proto_persons") - households = pipeline.get_table("proto_households") + persons = self.state.get_dataframe("proto_persons") + households = self.state.get_dataframe("proto_households") # For dropping any extra columns created during merge cols_to_use = households.columns.difference(persons.columns) @@ -566,16 +571,18 @@ def merge_persons(self): self.proto_pop["proto_persons_merged"] = 
persons_merged # Store in pipeline - inject.add_table("proto_persons_merged", persons_merged) + self.state.add_table("proto_persons_merged", persons_merged) -def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): +def get_disaggregate_logsums( + state: workflow.State, network_los, chunk_size, trace_hh_id +): logsums = {} - persons_merged = pipeline.get_table("proto_persons_merged").sort_index( + persons_merged = state.get_dataframe("proto_persons_merged").sort_index( inplace=False ) disagg_model_settings = read_disaggregate_accessibility_yaml( - "disaggregate_accessibility.yaml" + state, "disaggregate_accessibility.yaml" ) for model_name in [ @@ -585,14 +592,14 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): ]: trace_label = tracing.extend_trace_label(model_name, "accessibilities") print("Running model {}".format(trace_label)) - model_settings = config.read_model_settings(model_name + ".yaml") + model_settings = state.filesystem.read_model_settings(model_name + ".yaml") model_settings["SAMPLE_SIZE"] = disagg_model_settings.get( "DESTINATION_SAMPLE_SIZE" ) - estimator = estimation.manager.begin_estimation(trace_label) + estimator = estimation.manager.begin_estimation(state, trace_label) if estimator: location_choice.write_estimation_specs( - estimator, model_settings, model_name + ".yaml" + state, estimator, model_settings, model_name + ".yaml" ) # Append table references in settings with "proto_" @@ -607,7 +614,7 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): model_settings["LOGSUM_SETTINGS"] = " ".join(suffixes) if model_name != "non_mandatory_tour_destination": - spc = shadow_pricing.load_shadow_price_calculator(model_settings) + spc = shadow_pricing.load_shadow_price_calculator(state, model_settings) # explicitly turning off shadow pricing for disaggregate accessibilities spc.use_shadow_pricing = False # filter to only workers or students @@ -616,6 +623,7 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): # run location choice and return logsums _logsums, _ = location_choice.run_location_choice( + state, choosers, network_los, shadow_price_calculator=spc, @@ -625,7 +633,6 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): model_settings=model_settings, chunk_size=chunk_size, chunk_tag=trace_label, - trace_hh_id=trace_hh_id, trace_label=trace_label, skip_choice=True, ) @@ -638,10 +645,11 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): ) else: - tours = pipeline.get_table("proto_tours") + tours = state.get_dataframe("proto_tours") tours = tours[tours.tour_category == "non_mandatory"] _logsums, _ = tour_destination.run_tour_destination( + state, tours, persons_merged, want_logsums=True, @@ -649,8 +657,6 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): model_settings=model_settings, network_los=network_los, estimator=estimator, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, skip_choice=True, ) @@ -670,34 +676,46 @@ def get_disaggregate_logsums(network_los, chunk_size, trace_hh_id): return logsums -@inject.step() -def initialize_proto_population(network_los, chunk_size): +@workflow.step +def initialize_proto_population( + state: workflow.State, + network_los: los.Network_LOS, +) -> None: # Synthesize the proto-population - ProtoPop(network_los, chunk_size) + ProtoPop(state, network_los, state.settings.chunk_size) return -@inject.step() -def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): 
+@workflow.step +def compute_disaggregate_accessibility( + state: workflow.State, + network_los: los.Network_LOS, +) -> None: """ Compute enhanced disaggregate accessibility for user specified population segments, as well as each zone in land use file using expressions from accessibility_spec. """ + tables_prior = list(state.existing_table_status) # Re-Register tables in this step, necessary for multiprocessing for tablename in ["proto_households", "proto_persons", "proto_tours"]: - df = inject.get_table(tablename).to_frame() - traceables = inject.get_injectable("traceable_tables") - if tablename not in pipeline.get_rn_generator().channels: - pipeline.get_rn_generator().add_channel(tablename, df) + df = state.get_dataframe(tablename) + traceables = state.tracing.traceable_tables + if tablename not in state.get_rn_generator().channels: + state.get_rn_generator().add_channel(tablename, df) if tablename not in traceables: - inject.add_injectable("traceable_tables", traceables + [tablename]) - tracing.register_traceable_table(tablename, df) + state.tracing.traceable_tables = traceables + [tablename] + state.tracing.register_traceable_table(tablename, df) del df # Run location choice - logsums = get_disaggregate_logsums(network_los, chunk_size, trace_hh_id) + logsums = get_disaggregate_logsums( + state, + network_los, + state.settings.chunk_size, + state.settings.trace_hh_id, + ) logsums = {k + "_accessibility": v for k, v in logsums.items()} # Combined accessibility table @@ -726,7 +744,7 @@ def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): # Merge in the proto pop data and inject it access_df = ( access_df.merge( - pipeline.get_table("proto_persons_merged").reset_index(), + state.get_dataframe("proto_persons_merged").reset_index(), on="proto_household_id", ) .set_index("proto_person_id") @@ -735,29 +753,20 @@ def compute_disaggregate_accessibility(network_los, chunk_size, trace_hh_id): logsums["proto_disaggregate_accessibility"] = access_df - # Drop any tables prematurely created - for tablename in [ - "school_destination_size", - "workplace_destination_size", - ]: - pipeline.drop_table(tablename) - - for ch in list(pipeline.get_rn_generator().channels.keys()): - pipeline.get_rn_generator().drop_channel(ch) + for ch in list(state.get_rn_generator().channels.keys()): + state.get_rn_generator().drop_channel(ch) # Drop any prematurely added traceables - for trace in [ - x for x in inject.get_injectable("traceable_tables") if "proto_" not in x - ]: - tracing.deregister_traceable_table(trace) + for trace in [x for x in state.tracing.traceable_tables if "proto_" not in x]: + state.tracing.deregister_traceable_table(trace) - # need to clear any premature tables that were added during the previous run - orca._TABLES.clear() - for name, func in inject._DECORATED_TABLES.items(): - logger.debug("reinject decorated table %s" % name) - orca.add_table(name, func) + # # need to clear any premature tables that were added during the previous run + for name in list(state.existing_table_status): + if name not in tables_prior: + state.drop_table(name) # Inject accessibility results into pipeline - [inject.add_table(k, df) for k, df in logsums.items()] + for k, df in logsums.items(): + state.add_table(k, df) return diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index 086422ba27..4db76ff201 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -1,48 +1,80 @@ # ActivitySim # See full license in 
LICENSE.txt. +from __future__ import annotations + import logging +from typing import Any -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +import pandas as pd -from .util import estimation +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger(__name__) -@inject.step() -def free_parking(persons_merged, persons, chunk_size, trace_hh_id): - """ """ - - trace_label = "free_parking" - model_settings_file_name = "free_parking.yaml" - - choosers = persons_merged.to_frame() +@workflow.step +def free_parking( + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + model_settings_file_name: str = "free_parking.yaml", + model_settings: dict[str, Any] = workflow.from_yaml("free_parking.yaml"), + trace_label: str = "free_parking", +) -> None: + """ + Determine for each person whether they have free parking available at work. + + Parameters + ---------- + state : workflow.State + persons_merged : DataFrame + persons : DataFrame + model_settings_file_name : str + This filename is used to write settings files in estimation mode. + model_settings : dict + The settings used in this model component. + trace_label : str + + Returns + ------- + + """ + + choosers = pd.DataFrame(persons_merged) choosers = choosers[choosers.workplace_zone_id > -1] logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("free_parking") + estimator = estimation.manager.begin_estimation(state, "free_parking") constants = config.get_model_constants(model_settings) # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -53,11 +85,11 @@ def free_parking(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_choosers(choosers) choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="free_parking_at_work", estimator=estimator, @@ -74,16 +106,15 @@ def free_parking(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["free_parking_at_work"] = ( choices.reindex(persons.index).fillna(0).astype(bool) ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "free_parking", persons.free_parking_at_work, value_counts=True ) - if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons, 
label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/initialize.py index fd9f47bb81..dc83fef425 100644 --- a/activitysim/abm/models/initialize.py +++ b/activitysim/abm/models/initialize.py @@ -1,18 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import os import warnings -import pandas as pd - -from activitysim.abm.tables import shadow_pricing, disaggregate_accessibility -from activitysim.core import chunk, config, expressions, inject, mem, pipeline, tracing -from activitysim.core.steps.output import ( - track_skim_usage, - write_data_dictionary, - write_tables, -) +from activitysim.abm.tables import disaggregate_accessibility, shadow_pricing +from activitysim.core import chunk, expressions, tracing, workflow # We are using the naming conventions in the mtc_asim.h5 example # file for our default list. This provides backwards compatibility @@ -31,11 +26,24 @@ logger = logging.getLogger(__name__) -def annotate_tables(model_settings, trace_label): +def annotate_tables(state: workflow.State, model_settings, trace_label, chunk_sizer): + """ + + Parameters + ---------- + state : workflow.State + model_settings : + trace_label : str + chunk_sizer : ChunkSizer + + Returns + ------- + + """ trace_label = tracing.extend_trace_label(trace_label, "annotate_tables") - chunk.log_rss(trace_label) + chunk_sizer.log_rss(trace_label) annotate_tables = model_settings.get("annotate_tables", []) @@ -51,18 +59,16 @@ def annotate_tables(model_settings, trace_label): t0 = tracing.print_elapsed_time() for table_info in annotate_tables: - tablename = table_info["tablename"] - chunk.log_rss(f"{trace_label}.pre-get_table.{tablename}") + chunk_sizer.log_rss(f"{trace_label}.pre-get_table.{tablename}") - df = inject.get_table(tablename).to_frame() - chunk.log_df(trace_label, tablename, df) + df = state.get_dataframe(tablename) + chunk_sizer.log_df(trace_label, tablename, df) # - rename columns column_map = table_info.get("column_map", None) if column_map: - warnings.warn( f"Setting 'column_map' has been changed to 'rename_columns'. " f"Support for 'column_map' in annotate_tables will be removed in future versions.", @@ -79,61 +85,69 @@ def annotate_tables(model_settings, trace_label): f"{trace_label} - annotating {tablename} SPEC {annotate['SPEC']}" ) expressions.assign_columns( - df=df, model_settings=annotate, trace_label=trace_label + state, df=df, model_settings=annotate, trace_label=trace_label ) - chunk.log_df(trace_label, tablename, df) + chunk_sizer.log_df(trace_label, tablename, df) # - write table to pipeline - pipeline.replace_table(tablename, df) + state.add_table(tablename, df) del df - chunk.log_df(trace_label, tablename, None) + chunk_sizer.log_df(trace_label, tablename, None) -@inject.step() -def initialize_landuse(): +@workflow.step +def initialize_landuse(state: workflow.State) -> None: + """ + Initialize the land use table. - trace_label = "initialize_landuse" + Parameters + ---------- + state : State - with chunk.chunk_log(trace_label, base=True): + Returns + ------- + None
+ """ + trace_label = "initialize_landuse" + settings_filename = "initialize_landuse.yaml" - model_settings = config.read_model_settings( - "initialize_landuse.yaml", mandatory=True + with chunk.chunk_log(state, trace_label, base=True) as chunk_sizer: + model_settings = state.filesystem.read_settings_file( + settings_filename, mandatory=True ) - annotate_tables(model_settings, trace_label) + annotate_tables(state, model_settings, trace_label, chunk_sizer) # instantiate accessibility (must be checkpointed to be be used to slice accessibility) - accessibility = pipeline.get_table("accessibility") - chunk.log_df(trace_label, "accessibility", accessibility) - + accessibility = state.get_dataframe("accessibility") + chunk_sizer.log_df(trace_label, "accessibility", accessibility) -@inject.step() -def initialize_households(): +@workflow.step +def initialize_households(state: workflow.State) -> None: trace_label = "initialize_households" - with chunk.chunk_log(trace_label, base=True): - - chunk.log_rss(f"{trace_label}.inside-yield") + with chunk.chunk_log(state, trace_label, base=True) as chunk_sizer: + chunk_sizer.log_rss(f"{trace_label}.inside-yield") - households = inject.get_table("households").to_frame() + households = state.get_dataframe("households") assert not households._is_view - chunk.log_df(trace_label, "households", households) + chunk_sizer.log_df(trace_label, "households", households) del households - chunk.log_df(trace_label, "households", None) + chunk_sizer.log_df(trace_label, "households", None) - persons = inject.get_table("persons").to_frame() + persons = state.get_dataframe("persons") assert not persons._is_view - chunk.log_df(trace_label, "persons", persons) + chunk_sizer.log_df(trace_label, "persons", persons) del persons - chunk.log_df(trace_label, "persons", None) + chunk_sizer.log_df(trace_label, "persons", None) - model_settings = config.read_model_settings( + model_settings = state.filesystem.read_settings_file( "initialize_households.yaml", mandatory=True ) - annotate_tables(model_settings, trace_label) + annotate_tables(state, model_settings, trace_label, chunk_sizer) # - initialize shadow_pricing size tables after annotating household and person tables # since these are scaled to model size, they have to be created while single-process @@ -141,59 +155,47 @@ def initialize_households(): add_size_tables = model_settings.get("add_size_tables", True) if add_size_tables: # warnings.warn(f"Calling add_size_tables from initialize will be removed in the future.", FutureWarning) - suffixes = inject.get_injectable("disaggregate_suffixes") - shadow_pricing.add_size_tables(suffixes) + suffixes = disaggregate_accessibility.disaggregate_suffixes(state) + shadow_pricing.add_size_tables(state, suffixes) # - preload person_windows - person_windows = inject.get_table("person_windows").to_frame() - chunk.log_df(trace_label, "person_windows", person_windows) + person_windows = state.get_dataframe("person_windows") + chunk_sizer.log_df(trace_label, "person_windows", person_windows) -@inject.injectable(cache=True) -def preload_injectables(): +@workflow.cached_object +def preload_injectables(state: workflow.State): """ preload bulky injectables up front - stuff that isn't inserted into the pipeline """ logger.info("preload_injectables") - inject.add_step("track_skim_usage", track_skim_usage) - inject.add_step("write_data_dictionary", write_data_dictionary) - inject.add_step("write_tables", write_tables) - - table_list = config.setting("input_table_list") - - # default ActivitySim table 
names and indices - if table_list is None: - logger.warning( - "No 'input_table_list' found in settings. This will be a " - "required setting in upcoming versions of ActivitySim." - ) - - new_settings = inject.get_injectable("settings") - new_settings["input_table_list"] = DEFAULT_TABLE_LIST - inject.add_injectable("settings", new_settings) - # FIXME undocumented feature - if config.setting("write_raw_tables"): - + if state.settings.write_raw_tables: # write raw input tables as csv (before annotation) - csv_dir = config.output_file_path("raw_tables") + csv_dir = state.get_output_file_path("raw_tables") if not os.path.exists(csv_dir): os.makedirs(csv_dir) # make directory if needed - table_names = [t["tablename"] for t in table_list] + # default ActivitySim table names and indices + if state.settings.input_table_list is None: + raise ValueError( + "no `input_table_list` found in settings, " "cannot `write_raw_tables`." + ) + + table_names = [t["tablename"] for t in state.settings.input_table_list] for t in table_names: - df = inject.get_table(t).to_frame() + df = state.get_dataframe(t) df.to_csv(os.path.join(csv_dir, "%s.csv" % t), index=True) t0 = tracing.print_elapsed_time() - if config.setting("benchmarking", False): + if state.settings.benchmarking: # we don't want to pay for skim_dict inside any model component during # benchmarking, so we'll preload skim_dict here. Preloading is not needed # for regular operation, as activitysim components can load-on-demand. - if inject.get_injectable("skim_dict", None) is not None: + if state.get_injectable("skim_dict", None) is not None: t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True) return True diff --git a/activitysim/abm/models/initialize_los.py b/activitysim/abm/models/initialize_los.py index 2649c09a3b..22b5b1c799 100644 --- a/activitysim/abm/models/initialize_los.py +++ b/activitysim/abm/models/initialize_los.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import multiprocessing import os @@ -10,17 +12,7 @@ import numpy as np import pandas as pd -from activitysim.core import ( - assign, - chunk, - config, - inject, - los, - pathbuilder, - pipeline, - simulate, - tracing, -) +from activitysim.core import chunk, los, pathbuilder, tracing, workflow logger = logging.getLogger(__name__) @@ -55,21 +47,19 @@ def num_nans(data): def any_uninitialized(data, lock=None): - with lock_data(lock): result = any_nans(data) return result def num_uninitialized(data, lock=None): - with lock_data(lock): result = num_nans(data) return result -@inject.step() -def initialize_los(network_los): +@workflow.step +def initialize_los(state: workflow.State, network_los: los.Network_LOS) -> None: """ Currently, this step is only needed for THREE_ZONE systems in which the tap_tap_utilities are precomputed in the (presumably subsequent) initialize_tvpb step. 
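For orientation, the hunks above and below all apply one conversion pattern. A minimal sketch of a component written directly against the new API (illustrative only, not part of this diff: the component name, its yaml file, and the example_flag column are hypothetical, while the @workflow.step decorator and the State methods are the ones these changes introduce):

    from __future__ import annotations

    import pandas as pd

    from activitysim.core import workflow


    @workflow.step
    def example_component(state: workflow.State, persons: pd.DataFrame) -> None:
        # settings that used to be injected step arguments (chunk_size, trace_hh_id)
        # are now read from the shared settings object carried by the state
        trace_hh_id = state.settings.trace_hh_id

        # per-component yaml settings come through the state's filesystem helper
        model_settings = state.filesystem.read_model_settings("example_component.yaml")

        # tables come out of, and go back into, the pipeline via the state
        persons = persons.copy()
        persons["example_flag"] = model_settings.get("example_flag_default", True)
        state.add_table("persons", persons)

        if trace_hh_id:
            state.tracing.trace_df(persons, label="example_component")

The same substitutions recur file after file: injected chunk_size / trace_hh_id arguments become state.settings attributes, config.read_model_settings becomes state.filesystem.read_model_settings, and pipeline.replace_table / inject.add_table become state.add_table.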
@@ -84,13 +74,12 @@ def initialize_los(network_los): trace_label = "initialize_los" if network_los.zone_system == los.THREE_ZONE: - tap_cache = network_los.tvpb.tap_cache uid_calculator = network_los.tvpb.uid_calculator attribute_combinations_df = uid_calculator.scalar_attribute_combinations() # - write table to pipeline (so we can slice it, when multiprocessing) - pipeline.replace_table("attribute_combinations", attribute_combinations_df) + state.add_table("attribute_combinations", attribute_combinations_df) # clean up any unwanted cache files from previous run if network_los.rebuild_tvpb_cache: @@ -118,9 +107,8 @@ def initialize_los(network_los): def compute_utilities_for_attribute_tuple( - network_los, scalar_attributes, data, chunk_size, trace_label + state, network_los, scalar_attributes, data, trace_label ): - # scalar_attributes is a dict of attribute name/value pairs for this combination # (e.g. {'demographic_segment': 0, 'tod': 'AM', 'access_mode': 'walk'}) @@ -146,12 +134,17 @@ def compute_utilities_for_attribute_tuple( # choosers_df is pretty big and was custom made for compute_utilities but we don't need to chunk_log it # since it is created outside of adaptive_chunked_choosers and so will show up in baseline - assert not chunk.chunk_logging() # otherwise we should chunk_log this + assert not chunk.chunk_logging(state) # otherwise we should chunk_log this chunk_tag = "initialize_tvpb" # all attribute_combinations can use same cached data for row_size calc - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( + state, choosers_df, trace_label, chunk_tag=chunk_tag ): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities if chooser_chunk._is_view: @@ -164,15 +157,16 @@ def compute_utilities_for_attribute_tuple( # TODO: convert to Dataset or otherwise stop this copying, without # harming anything else. 
chooser_chunk = chooser_chunk.copy() - chunk.log_df(trace_label, "attribute_chooser_chunk", chooser_chunk) + chunk_sizer.log_df(trace_label, "attribute_chooser_chunk", chooser_chunk) # add any attribute columns specified as column attributes in settings (the rest will be scalars in locals_dict) for attribute_name in attributes_as_columns: chooser_chunk[attribute_name] = scalar_attributes[attribute_name] - chunk.log_df(trace_label, "attribute_chooser_chunk", chooser_chunk) + chunk_sizer.log_df(trace_label, "attribute_chooser_chunk", chooser_chunk) utilities_df = pathbuilder.compute_utilities( + state, network_los, model_settings=model_settings, choosers=chooser_chunk, @@ -180,7 +174,7 @@ def compute_utilities_for_attribute_tuple( trace_label=trace_label, ) - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) assert len(utilities_df) == len(chooser_chunk) assert len(utilities_df.columns) == data.shape[1] @@ -189,13 +183,17 @@ def compute_utilities_for_attribute_tuple( data[chooser_chunk.index.values, :] = utilities_df.values del chooser_chunk - chunk.log_df(trace_label, "attribute_chooser_chunk", None) + chunk_sizer.log_df(trace_label, "attribute_chooser_chunk", None) logger.debug(f"{trace_label} updated utilities") -@inject.step() -def initialize_tvpb(network_los, attribute_combinations, chunk_size): +@workflow.step +def initialize_tvpb( + state: workflow.State, + network_los: los.Network_LOS, + attribute_combinations: pd.DataFrame, +) -> None: """ Initialize STATIC tap_tap_utility cache and write mmap to disk. @@ -220,7 +218,7 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): ) return - attribute_combinations_df = attribute_combinations.to_frame() + attribute_combinations_df = attribute_combinations multiprocess = network_los.multiprocess() uid_calculator = network_los.tvpb.uid_calculator @@ -258,7 +256,7 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): tuple_trace_label = tracing.extend_trace_label(trace_label, f"offset{offset}") compute_utilities_for_attribute_tuple( - network_los, scalar_attributes, data, chunk_size, tuple_trace_label + state, network_los, scalar_attributes, data, tuple_trace_label ) # make sure we populated the entire offset @@ -266,12 +264,11 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): data.reshape(uid_calculator.skim_shape)[offset], lock ) - if multiprocess and not inject.get_injectable("locutor", False): + if multiprocess and not state.get_injectable("locutor", False): return - write_results = not multiprocess or inject.get_injectable("locutor", False) + write_results = not multiprocess or state.get_injectable("locutor", False) if write_results: - if multiprocess: # if multiprocessing, wait for all processes to fully populate share data before writing results # (the other processes don't have to wait, since we were sliced by attribute combination diff --git a/activitysim/abm/models/initialize_tours.py b/activitysim/abm/models/initialize_tours.py index 476a53feb1..7cd416a898 100644 --- a/activitysim/abm/models/initialize_tours.py +++ b/activitysim/abm/models/initialize_tours.py @@ -1,13 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging -import os -import warnings import pandas as pd from activitysim.abm.models.util import tour_frequency as tf -from activitysim.core import config, expressions, inject, pipeline, tracing +from activitysim.core import expressions, tracing, workflow from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -17,11 +17,11 @@ SURVEY_PARTICIPANT_ID = "external_participant_id" ASIM_TOUR_ID = "tour_id" ASIM_PARENT_TOUR_ID = "parent_tour_id" -REQUIRED_TOUR_COLUMNS = set(["person_id", "tour_category", "tour_type"]) +REQUIRED_TOUR_COLUMNS = {"person_id", "tour_category", "tour_type"} -def patch_tour_ids(tours): - def set_tour_index(tours, parent_tour_num_col, is_joint): +def patch_tour_ids(state: workflow.State, tours): + def set_tour_index(state: workflow.State, tours, parent_tour_num_col, is_joint): group_cols = ["person_id", "tour_category", "tour_type"] if "parent_tour_num" in tours: @@ -32,7 +32,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ) return tf.set_tour_index( - tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint + state, tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint ) assert REQUIRED_TOUR_COLUMNS.issubset( @@ -48,6 +48,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # mandatory tours mandatory_tours = set_tour_index( + state, tours[tours.tour_category == "mandatory"], parent_tour_num_col=None, is_joint=False, @@ -60,6 +61,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # non_mandatory tours non_mandatory_tours = set_tour_index( + state, tours[tours.tour_category == "non_mandatory"], parent_tour_num_col=None, is_joint=False, @@ -74,27 +76,31 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): return patched_tours -@inject.step() -def initialize_tours(network_los, households, persons, trace_hh_id): - +@workflow.step +def initialize_tours( + state: workflow.State, + households: pd.DataFrame, + persons: pd.DataFrame, +) -> None: trace_label = "initialize_tours" - tours = read_input_table("tours") + trace_hh_id = state.settings.trace_hh_id + tours = read_input_table(state, "tours") # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after. # FIXME could just always slice... 
- slice_happened = ( - inject.get_injectable("households_sample_size", 0) > 0 - or inject.get_injectable("households_sample_size", 0) > 0 - ) + slice_happened = state.settings.households_sample_size > 0 if slice_happened: logger.info("slicing tours %s" % (tours.shape,)) # keep all persons in the sampled households tours = tours[tours.person_id.isin(persons.index)] # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above - model_settings = config.read_model_settings("initialize_tours.yaml", mandatory=True) + model_settings = state.filesystem.read_model_settings( + "initialize_tours.yaml", mandatory=True + ) expressions.assign_columns( + state, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), @@ -104,15 +110,15 @@ def initialize_tours(network_los, households, persons, trace_hh_id): if skip_patch_tour_ids: pass else: - tours = patch_tour_ids(tours) + tours = patch_tour_ids(state, tours) assert tours.index.name == "tour_id" # replace table function with dataframe - inject.add_table("tours", tours) + state.add_table("tours", tours) - pipeline.get_rn_generator().add_channel("tours", tours) + state.get_rn_generator().add_channel("tours", tours) - tracing.register_traceable_table("tours", tours) + state.tracing.register_traceable_table("tours", tours) logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours") logger.debug(f"{len(households.index.unique())} unique household_ids in households") @@ -127,4 +133,4 @@ def initialize_tours(network_los, households, persons, trace_hh_id): raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id") if trace_hh_id: - tracing.trace_df(tours, label="initialize_tours", warn_if_empty=True) + state.tracing.trace_df(tours, label="initialize_tours", warn_if_empty=True) diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index a041fb9e83..50c458e7a4 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -1,47 +1,56 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.overlap import hh_time_window_overlap +from activitysim.abm.models.util.overlap import hh_time_window_overlap +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger(__name__) -def add_null_results(trace_label, tours): +def add_null_results(state, trace_label, tours): logger.info("Skipping %s: add_null_results" % trace_label) tours["composition"] = "" - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) -@inject.step() -def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_composition( + state: workflow.State, + tours: pd.DataFrame, + households: pd.DataFrame, + persons: pd.DataFrame, +) -> None: """ This model predicts the makeup of the travel party (adults, children, or mixed). 
""" trace_label = "joint_tour_composition" model_settings_file_name = "joint_tour_composition.yaml" - tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours if joint_tours.shape[0] == 0: - add_null_results(trace_label, tours) + add_null_results(state, trace_label, tours) return - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("joint_tour_composition") + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + estimator = estimation.manager.begin_estimation(state, "joint_tour_composition") # - only interested in households with joint_tours - households = households.to_frame() households = households[households.num_hh_joint_tours > 0] - persons = persons.to_frame() persons = persons[persons.household_id.isin(households.index)] logger.info( @@ -51,13 +60,13 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): # - run preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { "persons": persons, - "hh_time_window_overlap": hh_time_window_overlap, + "hh_time_window_overlap": lambda *x: hh_time_window_overlap(state, *x), } expressions.assign_columns( + state, df=households, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -69,9 +78,11 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): ) # - simple_simulate - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -83,11 +94,11 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): estimator.write_choosers(joint_tours_merged) choices = simulate.simple_simulate( + state, choosers=joint_tours_merged, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="composition", estimator=estimator, @@ -107,14 +118,14 @@ def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): # reindex since we ran model on a subset of households tours["composition"] = choices.reindex(tours.index).fillna("").astype(str) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) tracing.print_summary( "joint_tour_composition", joint_tours.composition, value_counts=True ) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( joint_tours, label="joint_tour_composition.joint_tours", slicer="household_id", diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index 02651d2a44..d94658b78a 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -1,22 +1,25 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import tour_destination +from activitysim.core import estimation, los, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation, tour_destination - logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def joint_tour_destination( - tours, persons_merged, households_merged, network_los, chunk_size, trace_hh_id -): - + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """ Given the tour generation from the above, each tour needs to have a destination, so in this case tours are the choosers (with the associated @@ -25,43 +28,39 @@ def joint_tour_destination( trace_label = "joint_tour_destination" model_settings_file_name = "joint_tour_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + trace_hh_id = state.settings.trace_hh_id logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) # choosers are tours - in a sense tours are choosing their destination - tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] - persons_merged = persons_merged.to_frame() - # - if no joint tours if joint_tours.shape[0] == 0: tracing.no_results("joint_tour_destination") return - estimator = estimation.manager.begin_estimation("joint_tour_destination") + estimator = estimation.manager.begin_estimation(state, "joint_tour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + state.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(state.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( + state, tours, persons_merged, want_logsums, @@ -69,8 +68,6 @@ def joint_tour_destination( model_settings, network_los, estimator, - chunk_size, - trace_hh_id, trace_label, ) @@ -85,7 +82,7 @@ def joint_tour_destination( # add column as we want joint_tours table for tracing. 
joint_tours["destination"] = choices_df.choice assign_in_place(tours, joint_tours[["destination"]]) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) if want_logsums: joint_tours[logsum_column_name] = choices_df["logsum"] @@ -96,7 +93,7 @@ def joint_tour_destination( if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df(joint_tours, label="joint_tour_destination.joint_tours") + state.tracing.trace_df(joint_tours, label="joint_tour_destination.joint_tours") diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index 1039646651..66996ce1de 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -1,44 +1,52 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.overlap import hh_time_window_overlap -from .util.tour_frequency import process_joint_tours +from activitysim.abm.models.util.overlap import hh_time_window_overlap +from activitysim.abm.models.util.tour_frequency import process_joint_tours +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger(__name__) -@inject.step() -def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_frequency( + state: workflow.State, households: pd.DataFrame, persons: pd.DataFrame +) -> None: """ This model predicts the frequency of making fully joint trips (see the alternatives above). """ trace_label = "joint_tour_frequency" model_settings_file_name = "joint_tour_frequency.yaml" + trace_hh_id = state.settings.trace_hh_id - estimator = estimation.manager.begin_estimation("joint_tour_frequency") + estimator = estimation.manager.begin_estimation(state, "joint_tour_frequency") - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) alternatives = simulate.read_model_alts( - "joint_tour_frequency_alternatives.csv", set_index="alt" + state, "joint_tour_frequency_alternatives.csv", set_index="alt" ) # - only interested in households with more than one cdap travel_active person and # - at least one non-preschooler - households = households.to_frame() multi_person_households = households[households.participates_in_jtf_model].copy() # - only interested in persons in multi_person_households # FIXME - gratuitous pathological efficiency move, just let yaml specify persons? 
- persons = persons.to_frame() persons = persons[persons.household_id.isin(multi_person_households.index)] logger.info( @@ -49,22 +57,24 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { "persons": persons, - "hh_time_window_overlap": hh_time_window_overlap, + "hh_time_window_overlap": lambda *x: hh_time_window_overlap(state, *x), } expressions.assign_columns( + state, df=multi_person_households, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -76,11 +86,11 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): estimator.write_choosers(multi_person_households) choices = simulate.simple_simulate( + state, choosers=multi_person_households, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="joint_tour_frequency", estimator=estimator, @@ -108,12 +118,12 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): temp_point_persons = temp_point_persons.set_index("household_id") temp_point_persons = temp_point_persons[["person_id", "home_zone_id"]] - joint_tours = process_joint_tours(choices, alternatives, temp_point_persons) + joint_tours = process_joint_tours(state, choices, alternatives, temp_point_persons) - tours = pipeline.extend_table("tours", joint_tours) + tours = state.extend_table("tours", joint_tours) - tracing.register_traceable_table("tours", joint_tours) - pipeline.get_rn_generator().add_channel("tours", joint_tours) + state.tracing.register_traceable_table("tours", joint_tours) + state.get_rn_generator().add_channel("tours", joint_tours) # - annotate households @@ -131,16 +141,16 @@ def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): .astype(np.int8) ) - pipeline.replace_table("households", households) + state.add_table("households", households) tracing.print_summary( "joint_tour_frequency", households.joint_tour_frequency, value_counts=True ) if trace_hh_id: - tracing.trace_df(households, label="joint_tour_frequency.households") + state.tracing.trace_df(households, label="joint_tour_frequency.households") - tracing.trace_df( + state.tracing.trace_df( joint_tours, label="joint_tour_frequency.joint_tours", slicer="household_id" ) diff --git a/activitysim/abm/models/joint_tour_frequency_composition.py b/activitysim/abm/models/joint_tour_frequency_composition.py index 3e52d27fae..66fcc534c5 100644 --- a/activitysim/abm/models/joint_tour_frequency_composition.py +++ b/activitysim/abm/models/joint_tour_frequency_composition.py @@ -1,31 +1,35 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
import logging import numpy as np import pandas as pd -import os -from activitysim.core.interaction_simulate import interaction_simulate - -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions -from .util import estimation - -from .util.overlap import hh_time_window_overlap -from .util.tour_frequency import process_joint_tours_frequency_composition +from activitysim.abm.models.util.overlap import hh_time_window_overlap +from activitysim.abm.models.util.tour_frequency import ( + process_joint_tours_frequency_composition, +) +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) +from activitysim.core.interaction_simulate import interaction_simulate logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def joint_tour_frequency_composition( - households_merged, persons, chunk_size, trace_hh_id -): + state: workflow.State, + households_merged: pd.DataFrame, + persons: pd.DataFrame, +) -> None: """ This model predicts the frequency and composition of fully joint tours. """ @@ -33,19 +37,17 @@ def joint_tour_frequency_composition( trace_label = "joint_tour_frequency_composition" model_settings_file_name = "joint_tour_frequency_composition.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) alt_tdd = simulate.read_model_alts( - "joint_tour_frequency_composition_alternatives.csv", set_index="alt" + state, "joint_tour_frequency_composition_alternatives.csv", set_index="alt" ) # - only interested in households with more than one cdap travel_active person and # - at least one non-preschooler - households_merged = households_merged.to_frame() choosers = households_merged[households_merged.participates_in_jtf_model].copy() # - only interested in persons in choosers households - persons = persons.to_frame() persons = persons[persons.household_id.isin(choosers.index)] logger.info("Running %s with %d households", trace_label, len(choosers)) @@ -53,12 +55,12 @@ def joint_tour_frequency_composition( # alt preprocessor alt_preprocessor_settings = model_settings.get("ALTS_PREPROCESSOR", None) if alt_preprocessor_settings: - locals_dict = {} alt_tdd = alt_tdd.copy() expressions.assign_columns( + state, df=alt_tdd, model_settings=alt_preprocessor_settings, locals_dict=locals_dict, @@ -68,24 +70,28 @@ def joint_tour_frequency_composition( # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { "persons": persons, - "hh_time_window_overlap": hh_time_window_overlap, + "hh_time_window_overlap": lambda *x: hh_time_window_overlap(state, *x), } expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label, ) - estimator = estimation.manager.begin_estimation("joint_tour_frequency_composition") + estimator = estimation.manager.begin_estimation( + state, "joint_tour_frequency_composition" + ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = 
state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) constants = config.get_model_constants(model_settings) @@ -105,11 +111,12 @@ def joint_tour_frequency_composition( # The choice value 'joint_tour_frequency_composition' assigned by interaction_simulate # is the index value of the chosen alternative in the alternatives table. choices = interaction_simulate( + state, choosers=choosers, alternatives=alt_tdd, spec=model_spec, locals_d=constants, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=trace_label, trace_choice_name=trace_label, estimator=estimator, @@ -162,13 +169,13 @@ def joint_tour_frequency_composition( # 33333 shop joint adults joint_tours = process_joint_tours_frequency_composition( - choices, alt_tdd, temp_point_persons + state, choices, alt_tdd, temp_point_persons ) - tours = pipeline.extend_table("tours", joint_tours) + tours = state.extend_table("tours", joint_tours) - tracing.register_traceable_table("tours", joint_tours) - pipeline.get_rn_generator().add_channel("tours", joint_tours) + state.tracing.register_traceable_table("tours", joint_tours) + state.get_rn_generator().add_channel("tours", joint_tours) # we expect there to be an alt with no tours - which we can use to backfill non-travelers no_tours_alt = 0 @@ -184,7 +191,7 @@ def joint_tour_frequency_composition( .astype(np.int8) ) - pipeline.replace_table("households", households_merged) + state.add_table("households", households_merged) tracing.print_summary( "joint_tour_frequency_composition", @@ -192,12 +199,12 @@ def joint_tour_frequency_composition( value_counts=True, ) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( households_merged, label="joint_tour_frequency_composition.households" ) - tracing.trace_df( + state.tracing.trace_df( joint_tours, label="joint_tour_frequency_composition.joint_tours", slicer="household_id", diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index ee8658ae5f..066ef703c7 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -1,31 +1,29 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging -import pandas as pd import numpy as np +import pandas as pd from activitysim.abm.models.util.canonical_ids import MAX_PARTICIPANT_PNUM +from activitysim.abm.models.util.overlap import person_time_window_overlap from activitysim.core import ( - chunk, config, + estimation, expressions, - inject, logit, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.util import assign_in_place, reindex -from .util import estimation -from .util.overlap import person_time_window_overlap - logger = logging.getLogger(__name__) def joint_tour_participation_candidates(joint_tours, persons_merged): - # - only interested in persons from households with joint_tours persons_merged = persons_merged[persons_merged.num_hh_joint_tours > 0] @@ -66,7 +64,7 @@ def joint_tour_participation_candidates(joint_tours, persons_merged): candidates["participant_id"] = ( candidates[joint_tours.index.name] * MAX_PARTICIPANT_PNUM ) + candidates.PNUM - candidates["participant_id"] = candidates["participant_id"].astype(np.uint64) + candidates["participant_id"] = candidates["participant_id"].astype(np.int64) candidates.set_index( "participant_id", drop=True, inplace=True, verify_integrity=True ) @@ -75,11 +73,9 @@ def joint_tour_participation_candidates(joint_tours, persons_merged): def get_tour_satisfaction(candidates, participate): - tour_ids = candidates.tour_id.unique() if participate.any(): - candidates = candidates[participate] # if this happens, we would need to filter them out! @@ -123,7 +119,13 @@ def get_tour_satisfaction(candidates, participate): return satisfaction -def participants_chooser(probs, choosers, spec, trace_label): +def participants_chooser( + state: workflow.State, + probs: pd.DataFrame, + choosers: pd.DataFrame, + spec: pd.DataFrame, + trace_label: str, +) -> tuple[pd.Series, pd.Series]: """ custom alternative to logit.make_choices for simulate.simple_simulate @@ -161,7 +163,9 @@ def participants_chooser(probs, choosers, spec, trace_label): assert probs.index.equals(choosers.index) # choice is boolean (participate or not) - model_settings = config.read_model_settings("joint_tour_participation.yaml") + model_settings = state.filesystem.read_model_settings( + "joint_tour_participation.yaml" + ) choice_col = model_settings.get("participation_choice", "participate") assert ( @@ -185,7 +189,6 @@ def participants_chooser(probs, choosers, spec, trace_label): iter = 0 while candidates.shape[0] > 0: - iter += 1 if iter > MAX_ITERATIONS: @@ -194,7 +197,7 @@ def participants_chooser(probs, choosers, spec, trace_label): ) diagnostic_cols = ["tour_id", "household_id", "composition", "adult"] unsatisfied_candidates = candidates[diagnostic_cols].join(probs) - tracing.write_csv( + state.tracing.write_csv( unsatisfied_candidates, file_name="%s.UNSATISFIED" % trace_label, transpose=False, @@ -215,7 +218,7 @@ def participants_chooser(probs, choosers, spec, trace_label): ) choices, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=choosers + state, probs, trace_label=trace_label, trace_choosers=choosers ) participate = choices == PARTICIPATE_CHOICE @@ -231,7 +234,6 @@ def participants_chooser(probs, choosers, spec, trace_label): ) if num_tours_satisfied_this_iter > 0: - num_tours_remaining -= num_tours_satisfied_this_iter satisfied = reindex(tour_satisfaction, candidates.tour_id) @@ -266,19 +268,19 @@ def participants_chooser(probs, choosers, spec, trace_label): return choices, rands -def annotate_jtp(model_settings, trace_label): - 
+def annotate_jtp(state: workflow.State, model_settings, trace_label): # - annotate persons - persons = inject.get_table("persons").to_frame() + persons = state.get_dataframe("persons") expressions.assign_columns( + state, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) -def add_null_results(model_settings, trace_label): +def add_null_results(state, model_settings, trace_label): logger.info("Skipping %s: joint tours", trace_label) # participants table is used downstream in non-joint tour expressions @@ -286,35 +288,37 @@ def add_null_results(model_settings, trace_label): participants = pd.DataFrame(columns=PARTICIPANT_COLS) participants.index.name = "participant_id" - pipeline.replace_table("joint_tour_participants", participants) + state.add_table("joint_tour_participants", participants) # - run annotations - annotate_jtp(model_settings, trace_label) + annotate_jtp(state, model_settings, trace_label) -@inject.step() -def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_participation( + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, +) -> None: """ Predicts for each eligible person to participate or not participate in each joint tour. """ trace_label = "joint_tour_participation" model_settings_file_name = "joint_tour_participation.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + trace_hh_id = state.settings.trace_hh_id - tours = tours.to_frame() joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours if joint_tours.shape[0] == 0: - add_null_results(model_settings, trace_label) + add_null_results(state, model_settings, trace_label) return - persons_merged = persons_merged.to_frame() - # - create joint_tour_participation_candidates table candidates = joint_tour_participation_candidates(joint_tours, persons_merged) - tracing.register_traceable_table("joint_tour_participants", candidates) - pipeline.get_rn_generator().add_channel("joint_tour_participants", candidates) + state.tracing.register_traceable_table("joint_tour_participants", candidates) + state.get_rn_generator().add_channel("joint_tour_participants", candidates) logger.info( "Running joint_tours_participation with %d potential participants (candidates)" @@ -324,13 +328,15 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { - "person_time_window_overlap": person_time_window_overlap, + "person_time_window_overlap": lambda x: person_time_window_overlap( + state, x + ), "persons": persons_merged, } expressions.assign_columns( + state, df=candidates, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -339,11 +345,13 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): # - simple_simulate - estimator = estimation.manager.begin_estimation("joint_tour_participation") + estimator = estimation.manager.begin_estimation(state, "joint_tour_participation") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, 
coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -363,11 +371,11 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id) choices = simulate.simple_simulate_by_chunk_id( + state, choosers=candidates, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="participation", custom_chooser=participants_chooser, @@ -421,10 +429,10 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): + 1 ) - pipeline.replace_table("joint_tour_participants", participants) + state.add_table("joint_tour_participants", participants) # drop channel as we aren't using any more (and it has candidates that weren't chosen) - pipeline.get_rn_generator().drop_channel("joint_tour_participants") + state.get_rn_generator().drop_channel("joint_tour_participants") # - assign joint tour 'point person' (participant_num == 1) point_persons = participants[participants.participant_num == 1] @@ -435,12 +443,16 @@ def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): assign_in_place(tours, joint_tours[["person_id", "number_of_participants"]]) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # - run annotations - annotate_jtp(model_settings, trace_label) + annotate_jtp(state, model_settings, trace_label) if trace_hh_id: - tracing.trace_df(participants, label="joint_tour_participation.participants") + state.tracing.trace_df( + participants, label="joint_tour_participation.participants" + ) - tracing.trace_df(joint_tours, label="joint_tour_participation.joint_tours") + state.tracing.trace_df( + joint_tours, label="joint_tour_participation.joint_tours" + ) diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index ca56327d03..0a488757ad 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -1,29 +1,43 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.abm.models.util.vectorize_tour_scheduling import ( + vectorize_joint_tour_scheduling, +) +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) from activitysim.core.util import assign_in_place, reindex -from .util import estimation -from .util.vectorize_tour_scheduling import vectorize_joint_tour_scheduling - logger = logging.getLogger(__name__) -@inject.step() -def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id): +@workflow.step +def joint_tour_scheduling( + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + tdd_alts: pd.DataFrame, +) -> None: """ This model predicts the departure time and duration of each joint tour """ trace_label = "joint_tour_scheduling" model_settings_file_name = "joint_tour_scheduling.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) - tours = tours.to_frame() + trace_hh_id = state.settings.trace_hh_id joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours @@ -31,10 +45,8 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ tracing.no_results(trace_label) return - # use inject.get_table as this won't exist if there are no joint_tours - joint_tour_participants = inject.get_table("joint_tour_participants").to_frame() - - persons_merged = persons_merged.to_frame() + # use state.get_dataframe as this won't exist if there are no joint_tours + joint_tour_participants = state.get_dataframe("joint_tour_participants") logger.info("Running %s with %d joint tours", trace_label, joint_tours.shape[0]) @@ -54,26 +66,28 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ # - run preprocessor to annotate choosers preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + state, df=joint_tours, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - timetable = inject.get_injectable("timetable") + timetable = state.get_injectable("timetable") - estimator = estimation.manager.begin_estimation("joint_tour_scheduling") + estimator = estimation.manager.begin_estimation(state, "joint_tour_scheduling") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) sharrow_skip = model_settings.get("sharrow_skip", False) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) if estimator: estimator.write_model_settings(model_settings, model_settings_file_name) @@ -82,6 +96,7 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ timetable.begin_transaction(estimator) choices = vectorize_joint_tour_scheduling( + state, joint_tours, joint_tour_participants, persons_merged, @@ -90,7 +105,7 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ 
spec=model_spec, model_settings=model_settings, estimator=estimator, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=trace_label, sharrow_skip=sharrow_skip, ) @@ -117,7 +132,7 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ nth_participants.person_id, reindex(choices, nth_participants.tour_id) ) - timetable.replace_table() + timetable.replace_table(state) # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table @@ -126,12 +141,12 @@ def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_ ) assign_in_place(tours, choices) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # updated df for tracing joint_tours = tours[tours.tour_category == "joint"] if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( joint_tours, label="joint_tour_scheduling", slicer="household_id" ) diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index dd5e279b39..b58a3bcb7f 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -1,27 +1,19 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd +from activitysim.abm.models.util import logsums as logsum +from activitysim.abm.models.util import tour_destination from activitysim.abm.tables import shadow_pricing -from activitysim.core import ( - config, - expressions, - inject, - los, - pipeline, - simulate, - tracing, -) +from activitysim.core import estimation, expressions, los, simulate, tracing, workflow from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate -from .util import estimation -from .util import logsums as logsum -from .util import tour_destination - # import multiprocessing @@ -80,12 +72,19 @@ ALT_LOGSUM = "mode_choice_logsum" -def write_estimation_specs(estimator, model_settings, settings_file): +def write_estimation_specs( + state: workflow.State, + estimator: estimation.Estimator, + model_settings, + settings_file, +): """ write sample_spec, spec, and coefficients to estimation data bundle Parameters ---------- + state : workflow.State + estimator : estimation.Estimator model_settings settings_file """ @@ -96,14 +95,13 @@ def write_estimation_specs(estimator, model_settings, settings_file): estimator.write_coefficients(model_settings=model_settings) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + state.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(state.get_dataframe("land_use"), "landuse", append=False) def _location_sample( + state: workflow.State, segment_name, choosers, alternatives, @@ -138,7 +136,11 @@ def _location_sample( logger.info("Running %s with %d persons" % (trace_label, len(choosers.index))) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + + if "sched" in trace_label: + print() + + if state.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count @@ -155,10 +157,11 @@ def _location_sample( 
"dest_col_name": skims.dest_key, # added for sharrow flows "timeframe": "timeless", } - constants = config.get_model_constants(model_settings) + constants = model_settings.get("CONSTANTS", {}) locals_d.update(constants) spec = simulate.spec_for_segment( + state, model_settings, spec_id="SAMPLE_SPEC", segment_name=segment_name, @@ -166,9 +169,10 @@ def _location_sample( ) # here since presumably we want this when called for either sample or presample - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_sample( + state, choosers, alternatives, spec=spec, @@ -187,6 +191,7 @@ def _location_sample( def location_sample( + state, segment_name, persons_merged, network_los, @@ -197,7 +202,6 @@ def location_sample( chunk_tag, trace_label, ): - # FIXME - MEMORY HACK - only include columns actually used in spec chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] choosers = persons_merged[chooser_columns] @@ -212,6 +216,7 @@ def location_sample( alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _location_sample( + state, segment_name, choosers, dest_size_terms, @@ -233,7 +238,9 @@ def location_sample( DEST_MAZ = "dest_MAZ" -def aggregate_size_terms(dest_size_terms, network_los, model_settings): +def aggregate_size_terms( + state: workflow.State, dest_size_terms, network_los, model_settings +): # # aggregate MAZ_size_terms to TAZ_size_terms # @@ -269,7 +276,7 @@ def aggregate_size_terms(dest_size_terms, network_los, model_settings): for c in weighted_average_cols: TAZ_size_terms[c] /= TAZ_size_terms["size_term"] # weighted average - spc = shadow_pricing.load_shadow_price_calculator(model_settings) + spc = shadow_pricing.load_shadow_price_calculator(state, model_settings) if spc.use_shadow_pricing and ( spc.shadow_settings["SHADOW_PRICE_METHOD"] == "simulation" ): @@ -316,6 +323,7 @@ def aggregate_size_terms(dest_size_terms, network_los, model_settings): def location_presample( + state, segment_name, persons_merged, network_los, @@ -326,7 +334,6 @@ def location_presample( chunk_tag, trace_label, ): - trace_label = tracing.extend_trace_label(trace_label, "presample") logger.info(f"{trace_label} location_presample") @@ -335,7 +342,7 @@ def location_presample( assert DEST_TAZ != alt_dest_col_name MAZ_size_terms, TAZ_size_terms = aggregate_size_terms( - dest_size_terms, network_los, model_settings + state, dest_size_terms, network_los, model_settings ) # convert MAZ zone_id to 'TAZ' in choosers (persons_merged) @@ -359,6 +366,7 @@ def location_presample( skims = skim_dict.wrap(HOME_TAZ, DEST_TAZ) taz_sample = _location_sample( + state, segment_name, choosers, TAZ_size_terms, @@ -382,7 +390,7 @@ def location_presample( # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total maz_choices = tour_destination.choose_MAZ_for_TAZ( - taz_sample, MAZ_size_terms, trace_label + state, taz_sample, MAZ_size_terms, trace_label ) assert DEST_MAZ in maz_choices @@ -392,6 +400,7 @@ def location_presample( def run_location_sample( + state: workflow.State, segment_name, persons_merged, network_los, @@ -430,7 +439,7 @@ def run_location_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not state.settings.want_dest_choice_presampling: pre_sample_taz = False 
logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -438,13 +447,13 @@ def run_location_sample( ) if pre_sample_taz: - logger.info( "Running %s location_presample with %d persons" % (trace_label, len(persons_merged)) ) choices = location_presample( + state, segment_name, persons_merged, network_los, @@ -457,8 +466,8 @@ def run_location_sample( ) else: - choices = location_sample( + state, segment_name, persons_merged, network_los, @@ -474,6 +483,7 @@ def run_location_sample( def run_location_logsums( + state, segment_name, persons_merged_df, network_los, @@ -506,7 +516,9 @@ def run_location_logsums( assert not location_sample_df.empty - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) # FIXME - MEMORY HACK - only include columns actually used in spec persons_merged_df = logsum.filter_chooser_columns( @@ -524,6 +536,7 @@ def run_location_logsums( tour_purpose = tour_purpose[segment_name] logsums = logsum.compute_logsums( + state, choosers, tour_purpose, logsum_settings, @@ -544,6 +557,7 @@ def run_location_logsums( def run_location_simulate( + state: workflow.State, segment_name, persons_merged, location_sample_df, @@ -602,7 +616,7 @@ def run_location_simulate( "dest_col_name": skims.dest_key, # added for sharrow flows "timeframe": "timeless", } - constants = config.get_model_constants(model_settings) + constants = model_settings.get("CONSTANTS", {}) if constants is not None: locals_d.update(constants) @@ -613,12 +627,17 @@ def run_location_simulate( estimator.write_interaction_sample_alternatives(alternatives) spec = simulate.spec_for_segment( - model_settings, spec_id="SPEC", segment_name=segment_name, estimator=estimator + state, + model_settings, + spec_id="SPEC", + segment_name=segment_name, + estimator=estimator, ) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_sample_simulate( + state, choosers, alternatives, spec=spec, @@ -646,6 +665,7 @@ def run_location_simulate( def run_location_choice( + state: workflow.State, persons_merged_df, network_los, shadow_price_calculator, @@ -655,7 +675,6 @@ def run_location_choice( model_settings, chunk_size, chunk_tag, - trace_hh_id, trace_label, skip_choice=False, ): @@ -676,7 +695,6 @@ def run_location_choice( estimator: Estimator object model_settings : dict chunk_size : int - trace_hh_id : int trace_label : str Returns @@ -696,7 +714,6 @@ def run_location_choice( choices_list = [] sample_list = [] for segment_name, segment_id in segment_ids.items(): - choosers = persons_merged_df[ persons_merged_df[chooser_segment_column] == segment_id ] @@ -714,6 +731,7 @@ def run_location_choice( # - location_sample location_sample_df = run_location_sample( + state, segment_name, choosers, network_los, @@ -729,6 +747,7 @@ def run_location_choice( # - location_logsums location_sample_df = run_location_logsums( + state, segment_name, choosers, network_los, @@ -743,6 +762,7 @@ def run_location_choice( # - location_simulate choices_df = run_location_simulate( + state, segment_name, choosers, location_sample_df, @@ -760,11 +780,11 @@ def run_location_choice( ) if estimator: - if trace_hh_id: + if state.settings.trace_hh_id: estimation_trace_label = tracing.extend_trace_label( trace_label, f"estimation.{segment_name}.modeled_choices" ) - tracing.trace_df(choices_df, label=estimation_trace_label) + state.tracing.trace_df(choices_df, 
label=estimation_trace_label) estimator.write_choices(choices_df.choice) choices_df.choice = estimator.get_survey_values( @@ -802,11 +822,11 @@ def run_location_choice( f"{trace_label} segment {segment_name} estimation: override logsums" ) - if trace_hh_id: + if state.settings.trace_hh_id: estimation_trace_label = tracing.extend_trace_label( trace_label, f"estimation.{segment_name}.survey_choices" ) - tracing.trace_df(choices_df, estimation_trace_label) + state.tracing.trace_df(choices_df, estimation_trace_label) choices_list.append(choices_df) @@ -837,6 +857,7 @@ def run_location_choice( def iterate_location_choice( + state: workflow.State, model_settings, persons_merged, persons, @@ -844,8 +865,7 @@ def iterate_location_choice( network_los, estimator, chunk_size, - trace_hh_id, - locutor, + locutor: bool, trace_label, ): """ @@ -861,7 +881,6 @@ def iterate_location_choice( persons : injected table network_los : los.Network_LOS chunk_size : int - trace_hh_id : int locutor : bool whether this process is the privileged logger of shadow_pricing when multiprocessing trace_label : str @@ -883,11 +902,10 @@ def iterate_location_choice( sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - persons_merged_df = persons_merged.to_frame() + persons_merged_df = persons_merged persons_merged_df = persons_merged_df[persons_merged[chooser_filter_column]] @@ -903,20 +921,21 @@ def iterate_location_choice( chooser_segment_column in persons_merged_df ), f"CHOOSER_SEGMENT_COLUMN '{chooser_segment_column}' not in persons_merged table." - spc = shadow_pricing.load_shadow_price_calculator(model_settings) + spc = shadow_pricing.load_shadow_price_calculator(state, model_settings) max_iterations = spc.max_iterations assert not (spc.use_shadow_pricing and estimator) logger.debug("%s max_iterations: %s" % (trace_label, max_iterations)) - choices_df = None # initialize to None, will be populated in first iteration + save_sample_df = ( + choices_df + ) = None # initialize to None, will be populated in first iteration for iteration in range(1, max_iterations + 1): - persons_merged_df_ = persons_merged_df.copy() if spc.use_shadow_pricing and iteration > 1: - spc.update_shadow_prices() + spc.update_shadow_prices(state) if spc.shadow_settings["SHADOW_PRICE_METHOD"] == "simulation": # filter from the sampled persons @@ -926,6 +945,7 @@ def iterate_location_choice( persons_merged_df_ = persons_merged_df_.sort_index() choices_df_, save_sample_df = run_location_choice( + state, persons_merged_df_, network_los, shadow_price_calculator=spc, @@ -935,7 +955,6 @@ def iterate_location_choice( model_settings=model_settings, chunk_size=chunk_size, chunk_tag=chunk_tag, - trace_hh_id=trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "i%s" % iteration), ) @@ -974,9 +993,9 @@ def iterate_location_choice( ) if locutor: - spc.write_trace_files(iteration) + spc.write_trace_files(state, iteration) - if spc.use_shadow_pricing and spc.check_fit(iteration): + if spc.use_shadow_pricing and spc.check_fit(state, iteration): logging.info( "%s converged after iteration %s" % ( @@ -989,11 +1008,11 @@ def iterate_location_choice( # - shadow price table if locutor: if spc.use_shadow_pricing and "SHADOW_PRICE_TABLE" in model_settings: - inject.add_table(model_settings["SHADOW_PRICE_TABLE"], spc.shadow_prices) + 
state.add_table(model_settings["SHADOW_PRICE_TABLE"], spc.shadow_prices) if "MODELED_SIZE_TABLE" in model_settings: - inject.add_table(model_settings["MODELED_SIZE_TABLE"], spc.modeled_size) + state.add_table(model_settings["MODELED_SIZE_TABLE"], spc.modeled_size) - persons_df = persons.to_frame() + persons_df = persons # add the choice values to the dest_choice_column in persons dataframe # We only chose school locations for the subset of persons who go to school @@ -1014,38 +1033,39 @@ def iterate_location_choice( # might be None for tiny samples even if sample_table_name was specified assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # lest they try to put school and workplace samples into the same table - if pipeline.is_table(sample_table_name): + if state.is_table(sample_table_name): raise RuntimeError( "dest choice sample table %s already exists" % sample_table_name ) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) # - annotate persons table if "annotate_persons" in model_settings: expressions.assign_columns( + state, df=persons_df, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons_df) + state.add_table("persons", persons_df) - if trace_hh_id: - tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table if "annotate_households" in model_settings: - - households_df = households.to_frame() + households_df = households expressions.assign_columns( + state, df=households_df, model_settings=model_settings.get("annotate_households"), trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), ) - pipeline.replace_table("households", households_df) + state.add_table("households", households_df) - if trace_hh_id: - tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) if logsum_column_name: tracing.print_summary( @@ -1055,10 +1075,15 @@ def iterate_location_choice( return persons_df -@inject.step() +@workflow.step def workplace_location( - persons_merged, persons, households, network_los, chunk_size, trace_hh_id, locutor -): + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + households: pd.DataFrame, + network_los: los.Network_LOS, + locutor: bool, +) -> None: """ workplace location choice model @@ -1066,11 +1091,13 @@ def workplace_location( """ trace_label = "workplace_location" - model_settings = config.read_model_settings("workplace_location.yaml") + model_settings = state.filesystem.read_model_settings("workplace_location.yaml") - estimator = estimation.manager.begin_estimation("workplace_location") + estimator = estimation.manager.begin_estimation(state, "workplace_location") if estimator: - write_estimation_specs(estimator, model_settings, "workplace_location.yaml") + write_estimation_specs( + state, estimator, model_settings, "workplace_location.yaml" + ) # FIXME - debugging code to test multiprocessing failure handling # process_name = multiprocessing.current_process().name @@ -1078,18 +1105,18 @@ def workplace_location( # raise RuntimeError(f"fake fail {process_name}") # disable locutor for benchmarking - if config.setting("benchmarking", 
False): + if state.settings.benchmarking: locutor = False iterate_location_choice( + state, model_settings, persons_merged, persons, households, network_los, estimator, - chunk_size, - trace_hh_id, + state.settings.chunk_size, locutor, trace_label, ) @@ -1098,10 +1125,15 @@ def workplace_location( estimator.end_estimation() -@inject.step() +@workflow.step def school_location( - persons_merged, persons, households, network_los, chunk_size, trace_hh_id, locutor -): + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + households: pd.DataFrame, + network_los: los.Network_LOS, + locutor: bool, +) -> None: """ School location choice model @@ -1109,25 +1141,25 @@ def school_location( """ trace_label = "school_location" - model_settings = config.read_model_settings("school_location.yaml") + model_settings = state.filesystem.read_model_settings("school_location.yaml") - estimator = estimation.manager.begin_estimation("school_location") + estimator = estimation.manager.begin_estimation(state, "school_location") if estimator: - write_estimation_specs(estimator, model_settings, "school_location.yaml") + write_estimation_specs(state, estimator, model_settings, "school_location.yaml") # disable locutor for benchmarking - if config.setting("benchmarking", False): + if state.settings.benchmarking: locutor = False iterate_location_choice( + state, model_settings, persons_merged, persons, households, network_los, estimator, - chunk_size, - trace_hh_id, + state.settings.chunk_size, locutor, trace_label, ) diff --git a/activitysim/abm/models/mandatory_scheduling.py b/activitysim/abm/models/mandatory_scheduling.py index 6a9618874d..2e5401bf1b 100644 --- a/activitysim/abm/models/mandatory_scheduling.py +++ b/activitysim/abm/models/mandatory_scheduling.py @@ -1,25 +1,28 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate +from activitysim.abm.models.util.tour_scheduling import run_tour_scheduling from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.util import assign_in_place, reindex -from .util import estimation -from .util import vectorize_tour_scheduling as vts -from .util.tour_scheduling import run_tour_scheduling - logger = logging.getLogger(__name__) DUMP = False -@inject.step() -def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id): +@workflow.step +def mandatory_tour_scheduling( + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + tdd_alts: pd.DataFrame, +) -> None: """ This model predicts the departure time and duration of each activity for mandatory tours """ @@ -27,9 +30,6 @@ def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace model_name = "mandatory_tour_scheduling" trace_label = model_name - persons_merged = persons_merged.to_frame() - - tours = tours.to_frame() mandatory_tours = tours[tours.tour_category == "mandatory"] # - if no mandatory_tours @@ -54,30 +54,29 @@ def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace ) choices = run_tour_scheduling( + state, model_name, mandatory_tours, persons_merged, tdd_alts, tour_segment_col, - chunk_size, - trace_hh_id, ) assign_in_place(tours, choices) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # updated df for tracing mandatory_tours = tours[tours.tour_category == "mandatory"] - tracing.dump_df( + state.tracing.dump_df( DUMP, tt.tour_map(persons_merged, mandatory_tours, tdd_alts), trace_label, "tour_map", ) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( mandatory_tours, label=trace_label, slicer="person_id", diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index 727a591f0c..166c671e0a 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -1,21 +1,28 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing - -from .util import estimation -from .util.tour_frequency import process_mandatory_tours +from activitysim.abm.models.util.tour_frequency import process_mandatory_tours +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger(__name__) -def add_null_results(trace_label, mandatory_tour_frequency_settings): +def add_null_results(state, trace_label, mandatory_tour_frequency_settings): logger.info("Skipping %s: add_null_results", trace_label) - persons = inject.get_table("persons").to_frame() + persons = state.get_dataframe("persons") persons["mandatory_tour_frequency"] = "" tours = pd.DataFrame() @@ -23,56 +30,63 @@ def add_null_results(trace_label, mandatory_tour_frequency_settings): tours["tour_type"] = None tours["person_id"] = None tours.index.name = "tour_id" - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) expressions.assign_columns( + state, df=persons, model_settings=mandatory_tour_frequency_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) -@inject.step() -def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): +@workflow.step +def mandatory_tour_frequency( + state: workflow.State, + persons_merged: pd.DataFrame, +) -> None: """ This model predicts the frequency of making mandatory trips (see the alternatives above) - these trips include work and school in some combination. """ trace_label = "mandatory_tour_frequency" model_settings_file_name = "mandatory_tour_frequency.yaml" + trace_hh_id = state.settings.trace_hh_id - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) - choosers = persons_merged.to_frame() + choosers = persons_merged # filter based on results of CDAP choosers = choosers[choosers.cdap_activity == "M"] logger.info("Running mandatory_tour_frequency with %d persons", len(choosers)) # - if no mandatory tours if choosers.shape[0] == 0: - add_null_results(trace_label, model_settings) + add_null_results(state, trace_label, model_settings) return # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = {} expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label, ) - estimator = estimation.manager.begin_estimation("mandatory_tour_frequency") + estimator = estimation.manager.begin_estimation(state, "mandatory_tour_frequency") - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -84,11 +98,11 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): 
estimator.write_choosers(choosers) choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="mandatory_tour_frequency", estimator=estimator, @@ -112,20 +126,20 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): the same as got non_mandatory_tours except trip types are "work" and "school" """ alternatives = simulate.read_model_alts( - "mandatory_tour_frequency_alternatives.csv", set_index="alt" + state, "mandatory_tour_frequency_alternatives.csv", set_index="alt" ) choosers["mandatory_tour_frequency"] = choices.reindex(choosers.index) mandatory_tours = process_mandatory_tours( - persons=choosers, mandatory_tour_frequency_alts=alternatives + state, persons=choosers, mandatory_tour_frequency_alts=alternatives ) - tours = pipeline.extend_table("tours", mandatory_tours) - tracing.register_traceable_table("tours", mandatory_tours) - pipeline.get_rn_generator().add_channel("tours", mandatory_tours) + tours = state.extend_table("tours", mandatory_tours) + state.tracing.register_traceable_table("tours", mandatory_tours) + state.get_rn_generator().add_channel("tours", mandatory_tours) # - annotate persons - persons = inject.get_table("persons").to_frame() + persons = state.get_dataframe("persons") # need to reindex as we only handled persons with cdap_activity == 'M' persons["mandatory_tour_frequency"] = ( @@ -133,24 +147,25 @@ def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): ) expressions.assign_columns( + state, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "mandatory_tour_frequency", persons.mandatory_tour_frequency, value_counts=True ) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( mandatory_tours, label="mandatory_tour_frequency.mandatory_tours", warn_if_empty=True, ) - tracing.trace_df( + state.tracing.trace_df( persons, label="mandatory_tour_frequency.persons", warn_if_empty=True ) diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index adfd4a0982..7ca5c4e902 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -1,23 +1,25 @@ # ActivitySim # See full license in LICENSE.txt. 
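# --- illustrative sketch (editor's aside, not part of the patch) ---------------
# The hunks above repeat one pattern: a model step stops being an @inject.step()
# that receives injected tables plus chunk_size/trace_hh_id, and becomes a
# @workflow.step that receives an explicit workflow.State and plain DataFrames,
# reading the old arguments from state.settings. A minimal sketch of that shape,
# assuming only the decorator and State attributes shown in this diff; the step
# name, table name, and filter below are hypothetical placeholders, not
# ActivitySim model logic.

import logging

import pandas as pd

from activitysim.core import workflow

logger = logging.getLogger(__name__)


@workflow.step
def example_model_step(
    state: workflow.State,
    persons_merged: pd.DataFrame,
) -> None:
    # values that used to arrive as injected step arguments
    chunk_size = state.settings.chunk_size
    trace_hh_id = state.settings.trace_hh_id

    # tables arrive as plain DataFrames; no .to_frame() calls needed
    choosers = persons_merged[persons_merged.cdap_activity == "M"]
    logger.info(
        "example step: %d choosers, chunk_size=%s, trace_hh_id=%s",
        len(choosers), chunk_size, trace_hh_id,
    )

    # results are written back through the state rather than pipeline.replace_table
    state.add_table("example_choosers", choosers)
# -------------------------------------------------------------------------------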
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import annotate, tour_destination +from activitysim.core import estimation, los, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation, tour_destination, annotate - - logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def non_mandatory_tour_destination( - tours, persons_merged, network_los, chunk_size, trace_hh_id -): - + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """ Given the tour generation from the above, each tour needs to have a destination, so in this case tours are the choosers (with the associated @@ -26,27 +28,23 @@ def non_mandatory_tour_destination( trace_label = "non_mandatory_tour_destination" model_settings_file_name = "non_mandatory_tour_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + trace_hh_id = state.settings.trace_hh_id logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - tours = tours.to_frame() - - persons_merged = persons_merged.to_frame() - # choosers are tours - in a sense tours are choosing their destination non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] # separating out pure escort school tours # they already have their destination set - if pipeline.is_table("school_escort_tours"): + if state.is_table("school_escort_tours"): nm_tour_index = non_mandatory_tours.index pure_school_escort_tours = non_mandatory_tours[ (non_mandatory_tours["school_esc_outbound"] == "pure_escort") @@ -60,21 +58,22 @@ def non_mandatory_tour_destination( tracing.no_results(trace_label) return - estimator = estimation.manager.begin_estimation("non_mandatory_tour_destination") + estimator = estimation.manager.begin_estimation( + state, "non_mandatory_tour_destination" + ) if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + state.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(state.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( + state, non_mandatory_tours, persons_merged, want_logsums, @@ -82,8 +81,6 @@ def non_mandatory_tour_destination( model_settings, network_los, estimator, - chunk_size, - trace_hh_id, trace_label, ) @@ -98,7 +95,7 @@ def non_mandatory_tour_destination( non_mandatory_tours["destination"] = choices_df.choice # merging back in school escort tours and preserving index - if pipeline.is_table("school_escort_tours"): + if 
state.is_table("school_escort_tours"): non_mandatory_tours = pd.concat( [pure_school_escort_tours, non_mandatory_tours] ).set_index(nm_tour_index) @@ -113,18 +110,18 @@ def non_mandatory_tour_destination( ~tours["destination"].isna() ), f"Tours are missing destination: {tours[tours['destination'].isna()]}" - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) if model_settings.get("annotate_tours"): - annotate.annotate_tours(model_settings, trace_label) + annotate.annotate_tours(state, model_settings, trace_label) if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # save_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( tours[tours.tour_category == "non_mandatory"], label="non_mandatory_tour_destination", slicer="person_id", diff --git a/activitysim/abm/models/non_mandatory_scheduling.py b/activitysim/abm/models/non_mandatory_scheduling.py index 5b32c95501..d8cb9e1873 100644 --- a/activitysim/abm/models/non_mandatory_scheduling.py +++ b/activitysim/abm/models/non_mandatory_scheduling.py @@ -1,33 +1,34 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate +from activitysim.abm.models.util.tour_scheduling import run_tour_scheduling from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.util import assign_in_place -from .util.tour_scheduling import run_tour_scheduling logger = logging.getLogger(__name__) DUMP = False -@inject.step() +@workflow.step def non_mandatory_tour_scheduling( - tours, persons_merged, tdd_alts, chunk_size, trace_hh_id -): + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + tdd_alts: pd.DataFrame, +) -> None: """ This model predicts the departure time and duration of each activity for non-mandatory tours """ model_name = "non_mandatory_tour_scheduling" trace_label = model_name - - persons_merged = persons_merged.to_frame() - - tours = tours.to_frame() + trace_hh_id = state.settings.trace_hh_id non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] # - if no mandatory_tours @@ -38,22 +39,21 @@ def non_mandatory_tour_scheduling( tour_segment_col = None choices = run_tour_scheduling( + state, model_name, non_mandatory_tours, persons_merged, tdd_alts, tour_segment_col, - chunk_size, - trace_hh_id, ) assign_in_place(tours, choices) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # updated df for tracing non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] - tracing.dump_df( + state.tracing.dump_df( DUMP, tt.tour_map(persons_merged, non_mandatory_tours, tdd_alts), trace_label, @@ -61,7 +61,7 @@ def non_mandatory_tour_scheduling( ) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( non_mandatory_tours, label=trace_label, slicer="person_id", diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 521f49c47c..86a794102b 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -1,33 +1,36 @@ # ActivitySim # See full 
license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd +from activitysim.abm.models.util import annotate +from activitysim.abm.models.util.overlap import person_max_window +from activitysim.abm.models.util.school_escort_tours_trips import ( + recompute_tour_count_statistics, +) +from activitysim.abm.models.util.tour_frequency import process_non_mandatory_tours from activitysim.core import ( config, + estimation, expressions, - inject, logit, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.interaction_simulate import interaction_simulate -from .util import estimation -from .util import annotate -from .util.school_escort_tours_trips import recompute_tour_count_statistics - -from .util.overlap import person_max_window -from .util.tour_frequency import process_non_mandatory_tours - logger = logging.getLogger(__name__) -def extension_probs(): - f = config.config_file_path("non_mandatory_tour_frequency_extension_probs.csv") +def extension_probs(state: workflow.State): + f = state.filesystem.get_config_file_path( + "non_mandatory_tour_frequency_extension_probs.csv" + ) df = pd.read_csv(f, comment="#") # convert cum probs to individual probs @@ -37,7 +40,13 @@ def extension_probs(): return df -def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label): +def extend_tour_counts( + state: workflow.State, + persons: pd.DataFrame, + tour_counts: pd.DataFrame, + alternatives, + trace_label: str, +): """ extend tour counts based on a probability table @@ -55,7 +64,6 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la alternatives alternatives from nmtv interaction_simulate only need this to know max possible frequency for a tour type - trace_hh_id trace_label Returns @@ -77,7 +85,7 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la JOIN_COLUMNS = ["ptype", "has_mandatory_tour", "has_joint_tour"] TOUR_TYPE_COL = "nonmandatory_tour_type" - probs_spec = extension_probs() + probs_spec = extension_probs(state) persons = persons[JOIN_COLUMNS] # only extend if there are 1 - 4 non_mandatory tours to start with @@ -86,10 +94,11 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la logger.info("extend_tour_counts - no persons eligible for tour_count extension") return tour_counts - have_trace_targets = trace_hh_id and tracing.has_trace_targets(extend_tour_counts) + have_trace_targets = state.settings.trace_hh_id and state.tracing.has_trace_targets( + extend_tour_counts + ) for i, tour_type in enumerate(alternatives.columns): - i_tour_type = i + 1 # (probs_spec nonmandatory_tour_type column is 1-based) tour_type_trace_label = tracing.extend_trace_label(trace_label, tour_type) @@ -113,6 +122,7 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la # - random choice of extension magnitude based on relative probs choices, rands = logit.make_choices( + state, choosers[PROBABILITY_COLUMNS], trace_label=tour_type_trace_label, trace_choosers=choosers, @@ -123,12 +133,12 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la tour_counts.loc[choices.index, tour_type] += choices if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, tracing.extend_trace_label(tour_type_trace_label, "choices"), columns=[None, "choice"], ) - tracing.trace_df( + state.tracing.trace_df( rands, tracing.extend_trace_label(tour_type_trace_label, "rands"), columns=[None, 
"rand"], @@ -137,8 +147,10 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la return tour_counts -@inject.step() -def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id): +@workflow.step +def non_mandatory_tour_frequency( + state: workflow.State, persons: pd.DataFrame, persons_merged: pd.DataFrame +) -> None: """ This model predicts the frequency of making non-mandatory trips (alternatives for this model come from a separate csv file which is @@ -149,26 +161,26 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i trace_label = "non_mandatory_tour_frequency" model_settings_file_name = "non_mandatory_tour_frequency.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) # FIXME kind of tacky both that we know to add this here and del it below # 'tot_tours' is used in model_spec expressions alternatives = simulate.read_model_alts( - "non_mandatory_tour_frequency_alternatives.csv", set_index=None + state, "non_mandatory_tour_frequency_alternatives.csv", set_index=None ) alternatives["tot_tours"] = alternatives.sum(axis=1) # filter based on results of CDAP - choosers = persons_merged.to_frame() + choosers = persons_merged choosers = choosers[choosers.cdap_activity.isin(["M", "N"])] # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - - locals_dict = {"person_max_window": person_max_window} + locals_dict = {"person_max_window": lambda x: person_max_window(state, x)} expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -179,13 +191,12 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i constants = config.get_model_constants(model_settings) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) spec_segments = model_settings.get("SPEC_SEGMENTS", {}) # segment by person type and pick the right spec for each person type choices_list = [] for segment_settings in spec_segments: - segment_name = segment_settings["NAME"] ptype = segment_settings["PTYPE"] @@ -203,12 +214,12 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i continue estimator = estimation.manager.begin_estimation( - model_name=segment_name, bundle_name="non_mandatory_tour_frequency" + state, model_name=segment_name, bundle_name="non_mandatory_tour_frequency" ) - coefficients_df = simulate.read_model_coefficients(segment_settings) + coefficients_df = state.filesystem.read_model_coefficients(segment_settings) segment_spec = simulate.eval_coefficients( - segment_spec, coefficients_df, estimator + state, segment_spec, coefficients_df, estimator ) if estimator: @@ -235,15 +246,16 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i estimator.set_chooser_id(chooser_segment.index.name) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_simulate( + state, chooser_segment, alternatives, spec=segment_spec, log_alt_losers=log_alt_losers, locals_d=constants, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label="non_mandatory_tour_frequency.%s" % segment_name, trace_choice_name="non_mandatory_tour_frequency", estimator=estimator, @@ -266,7 +278,6 
@@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i choices = pd.concat(choices_list).sort_index() # add non_mandatory_tour_frequency column to persons - persons = persons.to_frame() # we expect there to be an alt with no tours - which we can use to backfill non-travelers no_tours_alt = (alternatives.sum(axis=1) == 0).index[0] # need to reindex as we only handled persons with cdap_activity in ['M', 'N'] @@ -299,10 +310,10 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i # - extend_tour_counts - probabalistic extended_tour_counts = extend_tour_counts( + state, choosers, modeled_tour_counts.copy(), alternatives, - trace_hh_id, tracing.extend_trace_label(trace_label, "extend_tour_counts"), ) @@ -338,11 +349,12 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i """ create the non_mandatory tours based on extended_tour_counts """ - non_mandatory_tours = process_non_mandatory_tours(persons, extended_tour_counts) + non_mandatory_tours = process_non_mandatory_tours( + state, persons, extended_tour_counts + ) assert len(non_mandatory_tours) == extended_tour_counts.sum().sum() if estimator: - # make sure they created the right tours survey_tours = estimation.manager.get_survey_table("tours").sort_index() non_mandatory_survey_tours = survey_tours[ @@ -374,25 +386,26 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i assert not tours_differ.any() - pipeline.extend_table("tours", non_mandatory_tours) + state.extend_table("tours", non_mandatory_tours) - tracing.register_traceable_table("tours", non_mandatory_tours) - pipeline.get_rn_generator().add_channel("tours", non_mandatory_tours) + state.tracing.register_traceable_table("tours", non_mandatory_tours) + state.get_rn_generator().add_channel("tours", non_mandatory_tours) - if pipeline.is_table("school_escort_tours"): + if state.is_table("school_escort_tours"): # need to re-compute tour frequency statistics to account for school escort tours - recompute_tour_count_statistics() + recompute_tour_count_statistics(state) if model_settings.get("annotate_tours"): - annotate.annotate_tours(model_settings, trace_label) + annotate.annotate_tours(state, model_settings, trace_label) expressions.assign_columns( + state, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=trace_label, ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "non_mandatory_tour_frequency", @@ -400,18 +413,18 @@ def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_i value_counts=True, ) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( non_mandatory_tours, label="non_mandatory_tour_frequency.non_mandatory_tours", warn_if_empty=True, ) - tracing.trace_df( + state.tracing.trace_df( choosers, label="non_mandatory_tour_frequency.choosers", warn_if_empty=True ) - tracing.trace_df( + state.tracing.trace_df( persons, label="non_mandatory_tour_frequency.annotated_persons", warn_if_empty=True, diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index a87703b8b9..870f01af95 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
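# --- illustrative sketch (editor's aside, not part of the patch) ---------------
# The second recurring change in these hunks swaps module-level config/pipeline
# helpers for methods on the State object, roughly:
#   config.read_model_settings(f)          -> state.filesystem.read_model_settings(f)
#   simulate.read_model_spec(...)          -> state.filesystem.read_model_spec(...)
#   simulate.read_model_coefficients(...)  -> state.filesystem.read_model_coefficients(...)
#   config.setting("log_alt_losers", ...)  -> state.settings.log_alt_losers
#   inject.get_table(name).to_frame()      -> state.get_dataframe(name)
#   pipeline.replace_table(name, df)       -> state.add_table(name, df)
#   pipeline.extend_table(name, df)        -> state.extend_table(name, df)
# A minimal sketch of loading and evaluating a spec this way; the YAML file name
# is a hypothetical placeholder, and estimator=None stands for "not estimating".

from activitysim.core import simulate, workflow


def load_example_spec(state: workflow.State):
    model_settings = state.filesystem.read_model_settings("example_model.yaml")
    model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"])
    coefficients_df = state.filesystem.read_model_coefficients(model_settings)
    # eval_coefficients stays in simulate but now takes the state as first argument
    return simulate.eval_coefficients(state, model_spec, coefficients_df, None)
# -------------------------------------------------------------------------------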
+from __future__ import annotations + import logging import numpy as np @@ -8,24 +10,22 @@ from activitysim.core import ( config, expressions, - inject, logit, - pipeline, + los, simulate, tracing, + workflow, ) from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.tracing import print_elapsed_time from activitysim.core.util import assign_in_place -from .util import estimation - logger = logging.getLogger(__name__) NO_DESTINATION = -1 -def wrap_skims(model_settings): +def wrap_skims(state: workflow.State, model_settings): """ wrap skims of trip destination using origin, dest column names from model settings. Various of these are used by destination_sample, compute_logsums, and destination_simulate @@ -50,7 +50,7 @@ def wrap_skims(model_settings): dict containing skims, keyed by canonical names relative to tour orientation """ - network_los = inject.get_injectable("network_los") + network_los = state.get_injectable("network_los") skim_dict = network_los.get_default_skim_dict() origin = model_settings["TRIP_ORIGIN"] @@ -80,9 +80,8 @@ def wrap_skims(model_settings): return skims -def get_spec_for_segment(model_settings, spec_name, segment): - - omnibus_spec = simulate.read_model_spec(file_name=model_settings[spec_name]) +def get_spec_for_segment(state: workflow.State, model_settings, spec_name, segment): + omnibus_spec = state.filesystem.read_model_spec(file_name=model_settings[spec_name]) spec = omnibus_spec[[segment]] @@ -94,6 +93,7 @@ def get_spec_for_segment(model_settings, spec_name, segment): def parking_destination_simulate( + state: workflow.State, segment_name, trips, destination_sample, @@ -112,16 +112,18 @@ def parking_destination_simulate( choices - pandas.Series destination alt chosen """ - trace_label = tracing.extend_trace_label(trace_label, "trip_destination_simulate") + trace_label = tracing.extend_trace_label( + trace_label, "parking_destination_simulate" + ) - spec = get_spec_for_segment(model_settings, "SPECIFICATION", segment_name) + spec = get_spec_for_segment(state, model_settings, "SPECIFICATION", segment_name) - coefficients_df = simulate.read_model_coefficients(model_settings) - spec = simulate.eval_coefficients(spec, coefficients_df, None) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + spec = simulate.eval_coefficients(state, spec, coefficients_df, None) alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] - logger.info("Running trip_destination_simulate with %d trips", len(trips)) + logger.info("Running parking_destination_simulate with %d trips", len(trips)) locals_dict = config.get_model_constants(model_settings).copy() locals_dict.update(skims) @@ -129,6 +131,7 @@ def parking_destination_simulate( locals_dict["PARKING"] = skims["op_skims"].dest_key parking_locations = interaction_sample_simulate( + state, choosers=trips, alternatives=destination_sample, spec=spec, @@ -155,6 +158,7 @@ def parking_destination_simulate( def choose_parking_location( + state: workflow.State, segment_name, trips, alternatives, @@ -165,20 +169,19 @@ def choose_parking_location( trace_hh_id, trace_label, ): - logger.info("choose_parking_location %s with %d trips", trace_label, trips.shape[0]) t0 = print_elapsed_time() alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] destination_sample = logit.interaction_dataset( - trips, alternatives, alt_index_id=alt_dest_col_name + state, trips, alternatives, alt_index_id=alt_dest_col_name ) destination_sample.index = np.repeat(trips.index.values, 
len(alternatives)) destination_sample.index.name = trips.index.name - # # - trip_destination_simulate destinations = parking_destination_simulate( + state, segment_name=segment_name, trips=trips, destination_sample=destination_sample, @@ -203,6 +206,7 @@ def choose_parking_location( def run_parking_destination( + state: workflow.State, model_settings, trips, land_use, @@ -211,15 +215,13 @@ def run_parking_destination( trace_label, fail_some_trips_for_testing=False, ): - chooser_filter_column = model_settings.get("CHOOSER_FILTER_COLUMN_NAME") chooser_segment_column = model_settings.get("CHOOSER_SEGMENT_COLUMN_NAME") parking_location_column_name = model_settings["ALT_DEST_COL_NAME"] sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) choosers = trips[trips[chooser_filter_column]] @@ -228,7 +230,7 @@ def run_parking_destination( # Placeholder for trips without a parking choice trips[parking_location_column_name] = -1 - skims = wrap_skims(model_settings) + skims = wrap_skims(state, model_settings) alt_column_filter_name = model_settings.get("ALTERNATIVE_FILTER_COLUMN_NAME") alternatives = land_use[land_use[alt_column_filter_name]] @@ -244,6 +246,7 @@ def run_parking_destination( continue choices, destination_sample = choose_parking_location( + state, segment_name, chooser_segment, alternatives, @@ -278,29 +281,36 @@ def run_parking_destination( return trips[parking_location_column_name], save_sample_df -@inject.step() +@workflow.step def parking_location( - trips, trips_merged, land_use, network_los, chunk_size, trace_hh_id -): + state: workflow.State, + trips: pd.DataFrame, + trips_merged: pd.DataFrame, + land_use: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """ Given a set of trips, each trip needs to have a parking location if it is eligible for remote parking. 
""" trace_label = "parking_location" - model_settings = config.read_model_settings("parking_location_choice.yaml") + model_settings = state.filesystem.read_model_settings( + "parking_location_choice.yaml" + ) + trace_hh_id = state.settings.trace_hh_id alt_destination_col_name = model_settings["ALT_DEST_COL_NAME"] preprocessor_settings = model_settings.get("PREPROCESSOR", None) - trips_df = trips.to_frame() - trips_merged_df = trips_merged.to_frame() - land_use_df = land_use.to_frame() + trips_df = trips + trips_merged_df = trips_merged + land_use_df = land_use proposed_trip_departure_period = model_settings["TRIP_DEPARTURE_PERIOD"] # TODO: the number of skim time periods should be more readily available than this n_skim_time_periods = np.unique( - network_los.los_settings["skim_time_periods"]["labels"] + network_los.los_settings.skim_time_periods["labels"] ).size if trips_merged_df[proposed_trip_departure_period].max() > n_skim_time_periods: # max proposed_trip_departure_period is out of range, @@ -321,6 +331,7 @@ def parking_location( if preprocessor_settings: expressions.assign_columns( + state, df=trips_merged_df, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -328,20 +339,21 @@ def parking_location( ) parking_locations, save_sample_df = run_parking_destination( + state, model_settings, trips_merged_df, land_use_df, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label, ) assign_in_place(trips_df, parking_locations.to_frame(alt_destination_col_name)) - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( trips_df, label=trace_label, slicer="trip_id", @@ -362,6 +374,6 @@ def parking_location( ) # lest they try to put tour samples into the same table - if pipeline.is_table(sample_table_name): + if state.is_table(sample_table_name): raise RuntimeError("sample table %s already exists" % sample_table_name) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) diff --git a/activitysim/abm/models/school_escorting.py b/activitysim/abm/models/school_escorting.py index 1b5a97fc93..9c1e438dc5 100644 --- a/activitysim/abm/models/school_escorting.py +++ b/activitysim/abm/models/school_escorting.py @@ -1,16 +1,24 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.abm.models.util import school_escort_tours_trips +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) from activitysim.core.interaction_simulate import interaction_simulate from activitysim.core.util import reindex -from .util import estimation, school_escort_tours_trips - logger = logging.getLogger(__name__) # setting global defaults for max number of escortees and escortees in model @@ -326,10 +334,14 @@ def create_school_escorting_bundles_table(choosers, tours, stage): return bundles -@inject.step() +@workflow.step def school_escorting( - households, households_merged, persons, tours, chunk_size, trace_hh_id -): + state: workflow.State, + households: pd.DataFrame, + households_merged: pd.DataFrame, + persons: pd.DataFrame, + tours: pd.DataFrame, +) -> None: """ school escorting model @@ -355,14 +367,10 @@ def school_escorting( """ trace_label = "school_escorting_simulate" model_settings_file_name = "school_escorting.yaml" - model_settings = config.read_model_settings(model_settings_file_name) - - persons = persons.to_frame() - households = households.to_frame() - households_merged = households_merged.to_frame() - tours = tours.to_frame() + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + trace_hh_id = state.settings.trace_hh_id - alts = simulate.read_model_alts(model_settings["ALTS"], set_index="Alt") + alts = simulate.read_model_alts(state, model_settings["ALTS"], set_index="Alt") households_merged, participant_columns = determine_escorting_participants( households_merged, persons, model_settings @@ -379,16 +387,18 @@ def school_escorting( choices = None for stage_num, stage in enumerate(school_escorting_stages): stage_trace_label = trace_label + "_" + stage - estimator = estimation.manager.begin_estimation("school_escorting_" + stage) + estimator = estimation.manager.begin_estimation( + state, "school_escorting_" + stage + ) - model_spec_raw = simulate.read_model_spec( + model_spec_raw = state.filesystem.read_model_spec( file_name=model_settings[stage.upper() + "_SPEC"] ) - coefficients_df = simulate.read_model_coefficients( + coefficients_df = state.filesystem.read_model_coefficients( file_name=model_settings[stage.upper() + "_COEFFICIENTS"] ) model_spec = simulate.eval_coefficients( - model_spec_raw, coefficients_df, estimator + state, model_spec_raw, coefficients_df, estimator ) # allow for skipping sharrow entirely in this model with `sharrow_skip: true` @@ -426,6 +436,7 @@ def school_escorting( preprocessor_settings = model_settings.get("preprocessor_" + stage, None) if preprocessor_settings: expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -438,15 +449,16 @@ def school_escorting( estimator.write_coefficients(coefficients_df, model_settings) estimator.write_choosers(choosers) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_simulate( + state, choosers=choosers, alternatives=alts, spec=model_spec, log_alt_losers=log_alt_losers, locals_d=locals_dict, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=stage_trace_label, trace_choice_name="school_escorting_" + "stage", estimator=estimator, @@ -470,7 +482,9 @@ def school_escorting( ) 
if trace_hh_id: - tracing.trace_df(households, label=escorting_choice, warn_if_empty=True) + state.tracing.trace_df( + households, label=escorting_choice, warn_if_empty=True + ) if stage_num >= 1: choosers["Alt"] = choices @@ -493,7 +507,7 @@ def school_escorting( ) school_escort_tours = school_escort_tours_trips.create_pure_school_escort_tours( - escort_bundles + state, escort_bundles ) chauf_tour_id_map = { v: k for k, v in school_escort_tours["bundle_id"].to_dict().items() @@ -506,7 +520,7 @@ def school_escorting( tours = school_escort_tours_trips.add_pure_escort_tours(tours, school_escort_tours) tours = school_escort_tours_trips.process_tours_after_escorting_model( - escort_bundles, tours + state, escort_bundles, tours ) school_escort_trips = school_escort_tours_trips.create_school_escort_trips( @@ -514,17 +528,17 @@ def school_escorting( ) # update pipeline - pipeline.replace_table("households", households) - pipeline.replace_table("tours", tours) - pipeline.get_rn_generator().drop_channel("tours") - pipeline.get_rn_generator().add_channel("tours", tours) - pipeline.replace_table("escort_bundles", escort_bundles) + state.add_table("households", households) + state.add_table("tours", tours) + state.get_rn_generator().drop_channel("tours") + state.get_rn_generator().add_channel("tours", tours) + state.add_table("escort_bundles", escort_bundles) # save school escorting tours and trips in pipeline so we can overwrite results from downstream models - pipeline.replace_table("school_escort_tours", school_escort_tours) - pipeline.replace_table("school_escort_trips", school_escort_trips) + state.add_table("school_escort_tours", school_escort_tours) + state.add_table("school_escort_trips", school_escort_trips) # updating timetable object with pure escort tours so joint tours do not schedule ontop - timetable = inject.get_injectable("timetable") + timetable = state.get_injectable("timetable") # Need to do this such that only one person is in nth_tours # thus, looping through tour_category and tour_num @@ -538,4 +552,4 @@ def school_escorting( window_row_ids=nth_tours["person_id"], tdds=nth_tours["tdd"] ) - timetable.replace_table() + timetable.replace_table(state) diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 94a208075f..484a93cb14 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -1,23 +1,34 @@ # ActivitySim # See full license in LICENSE.txt. 
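Before the patch moves on through stop_frequency, note how school_escorting's pipeline write-back is rewritten: tables go through `state.add_table`, the random-number channel of a regenerated table is dropped and re-added on the state's RN generator, and injectables such as the timetable are fetched from the state and committed with the state passed back in. A compact restatement of that write-back, assuming `tours` has already been rebuilt by the step (helper name is illustrative):

import pandas as pd

from activitysim.core import workflow


def write_back_tours(state: workflow.State, tours: pd.DataFrame) -> None:
    # store the rebuilt tours table (was pipeline.replace_table)
    state.add_table("tours", tours)

    # the tours index changed, so rebuild its random-number channel on the
    # state's RN generator (was pipeline.get_rn_generator())
    state.get_rn_generator().drop_channel("tours")
    state.get_rn_generator().add_channel("tours", tours)

    # injectables also come from the state, and the timetable commits its
    # updates with the state passed back in
    timetable = state.get_injectable("timetable")
    timetable.replace_table(state)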
+from __future__ import annotations + import logging -import numpy as np import pandas as pd -from activitysim.abm.models.util import school_escort_tours_trips - -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing -from activitysim.core.util import assign_in_place, reindex -from .util import estimation, trip +from activitysim.abm.models.util import school_escort_tours_trips, trip +from activitysim.core import ( + config, + estimation, + expressions, + los, + simulate, + tracing, + workflow, +) +from activitysim.core.util import assign_in_place logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def stop_frequency( - tours, tours_merged, stop_frequency_alts, network_los, chunk_size, trace_hh_id -): + state: workflow.State, + tours: pd.DataFrame, + tours_merged: pd.DataFrame, + stop_frequency_alts, + network_los: los.Network_LOS, +) -> None: """ stop frequency model @@ -46,11 +57,10 @@ def stop_frequency( trace_label = "stop_frequency" model_settings_file_name = "stop_frequency.yaml" + trace_hh_id = state.settings.trace_hh_id - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) - tours = tours.to_frame() - tours_merged = tours_merged.to_frame() assert not tours_merged.household_id.isnull().any() assert not (tours_merged.origin == -1).any() assert not (tours_merged.destination == -1).any() @@ -61,7 +71,6 @@ def stop_frequency( # - run preprocessor to annotate tours_merged preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - # hack: preprocessor adds origin column in place if it does not exist already assert "origin" in tours_merged assert "destination" in tours_merged @@ -77,6 +86,7 @@ def stop_frequency( # this should be pre-slice as some expressions may count tours by type annotations = expressions.compute_columns( + state, df=tours_merged, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -102,7 +112,6 @@ def stop_frequency( choices_list = [] for segment_settings in spec_segments: - segment_name = segment_settings[segment_col] segment_value = segment_settings[segment_col] @@ -117,20 +126,22 @@ def stop_frequency( ) estimator = estimation.manager.begin_estimation( - model_name=segment_name, bundle_name="stop_frequency" + state, model_name=segment_name, bundle_name="stop_frequency" ) - segment_spec = simulate.read_model_spec(file_name=segment_settings["SPEC"]) + segment_spec = state.filesystem.read_model_spec( + file_name=segment_settings["SPEC"] + ) assert segment_spec is not None, ( "spec for segment_type %s not found" % segment_name ) coefficients_file_name = segment_settings["COEFFICIENTS"] - coefficients_df = simulate.read_model_coefficients( + coefficients_df = state.filesystem.read_model_coefficients( file_name=coefficients_file_name ) segment_spec = simulate.eval_coefficients( - segment_spec, coefficients_df, estimator + state, segment_spec, coefficients_df, estimator ) if estimator: @@ -144,11 +155,11 @@ def stop_frequency( estimator.set_chooser_id(chooser_segment.index.name) choices = simulate.simple_simulate( + state, choosers=chooser_segment, spec=segment_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, segment_name), trace_choice_name="stops", estimator=estimator, @@ -180,13 +191,13 @@ def stop_frequency( # if not already there, then it will have been added by stop_freq_annotate_tours_preprocessor 
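The stop_frequency conversion shows the segmented form of the same pattern: each segment's spec and coefficients are read through `state.filesystem`, coefficients are evaluated with the state in hand, and `simple_simulate` no longer takes a `chunk_size` argument because chunking is governed by `state.settings`. A condensed sketch of one segment follows; the helper name and its arguments are illustrative, the calls match the diff:

from activitysim.core import estimation, simulate, tracing, workflow


def run_stop_frequency_segment(
    state: workflow.State, segment_name, segment_settings, chooser_segment, constants
):
    # per-segment estimation bundles now receive the state explicitly
    estimator = estimation.manager.begin_estimation(
        state, model_name=segment_name, bundle_name="stop_frequency"
    )

    segment_spec = state.filesystem.read_model_spec(file_name=segment_settings["SPEC"])
    coefficients_df = state.filesystem.read_model_coefficients(
        file_name=segment_settings["COEFFICIENTS"]
    )
    segment_spec = simulate.eval_coefficients(
        state, segment_spec, coefficients_df, estimator
    )

    # note: no chunk_size argument; chunking is driven by state.settings
    return simulate.simple_simulate(
        state,
        choosers=chooser_segment,
        spec=segment_spec,
        nest_spec=None,  # or the evaluated NESTS spec when the model is nested
        locals_d=constants,
        trace_label=tracing.extend_trace_label("stop_frequency", segment_name),
        trace_choice_name="stops",
        estimator=estimator,
    )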
assign_in_place(tours, tours_merged[["primary_purpose"]]) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) # create trips table - trips = trip.initialize_from_tours(tours, stop_frequency_alts) - pipeline.replace_table("trips", trips) - tracing.register_traceable_table("trips", trips) - pipeline.get_rn_generator().add_channel("trips", trips) + trips = trip.initialize_from_tours(state, tours, stop_frequency_alts) + state.add_table("trips", trips) + state.tracing.register_traceable_table("trips", trips) + state.get_rn_generator().add_channel("trips", trips) if estimator: # make sure they created trips with the expected tour_ids @@ -219,22 +230,24 @@ def stop_frequency( assert not trips_differ.any() if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( tours, label="stop_frequency.tours", slicer="person_id", columns=None ) - tracing.trace_df( + state.tracing.trace_df( trips, label="stop_frequency.trips", slicer="person_id", columns=None ) - tracing.trace_df(annotations, label="stop_frequency.annotations", columns=None) + state.tracing.trace_df( + annotations, label="stop_frequency.annotations", columns=None + ) - tracing.trace_df( + state.tracing.trace_df( tours_merged, label="stop_frequency.tours_merged", slicer="person_id", columns=None, ) - if pipeline.is_table("school_escort_trips"): - school_escort_tours_trips.merge_school_escort_trips_into_pipeline() + if state.is_table("school_escort_trips"): + school_escort_tours_trips.merge_school_escort_trips_into_pipeline(state) diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index 66b5cf9588..ed553ecbf5 100644 --- a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -1,19 +1,21 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import os import numpy as np import pandas as pd -from activitysim.abm.models.trip_matrices import annotate_trips -from activitysim.core import config, expressions, inject, pipeline +from activitysim.core import expressions, workflow +from activitysim.core.los import Network_LOS logger = logging.getLogger(__name__) def wrap_skims( - network_los: pipeline.Pipeline, trips_merged: pd.DataFrame + network_los: Network_LOS, trips_merged: pd.DataFrame ) -> dict[str, object]: """ Retrieve skim wrappers for merged trips. @@ -198,18 +200,18 @@ def manual_breaks( return bins -@inject.step() +@workflow.step def summarize( - network_los: pipeline.Pipeline, + state: workflow.State, + network_los: Network_LOS, persons: pd.DataFrame, persons_merged: pd.DataFrame, households: pd.DataFrame, households_merged: pd.DataFrame, trips: pd.DataFrame, - tours: pd.DataFrame, tours_merged: pd.DataFrame, land_use: pd.DataFrame, -): +) -> None: """ A standard model that uses expression files to summarize pipeline tables for vizualization. 
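When stop_frequency creates the trips table, the three registrations that previously touched `pipeline` and module-level `tracing` all become methods on the state. Isolated for reference, assuming `tours` and `stop_frequency_alts` are already in hand (helper name is illustrative):

from activitysim.abm.models.util import trip
from activitysim.core import workflow


def register_trips(state: workflow.State, tours, stop_frequency_alts):
    # the tours-to-trips helper now takes the state as its first argument
    trips = trip.initialize_from_tours(state, tours, stop_frequency_alts)

    state.add_table("trips", trips)                          # was pipeline.replace_table
    state.tracing.register_traceable_table("trips", trips)   # was tracing.register_traceable_table
    state.get_rn_generator().add_channel("trips", trips)     # was pipeline.get_rn_generator()
    return trips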
@@ -224,26 +226,20 @@ def summarize( """ trace_label = "summarize" model_settings_file_name = "summarize.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) output_location = ( model_settings["OUTPUT"] if "OUTPUT" in model_settings else "summaries" ) - os.makedirs(config.output_file_path(output_location), exist_ok=True) + os.makedirs(state.get_output_file_path(output_location), exist_ok=True) spec = pd.read_csv( - config.config_file_path(model_settings["SPECIFICATION"]), comment="#" + state.filesystem.get_config_file_path(model_settings["SPECIFICATION"]), + comment="#", ) # Load dataframes from pipeline - persons = persons.to_frame() - persons_merged = persons_merged.to_frame() - households = households.to_frame() - households_merged = households_merged.to_frame() - trips = trips.to_frame() - tours = tours_merged.to_frame() - tours_merged = tours_merged.to_frame() - land_use = land_use.to_frame() + tours = tours_merged # - trips_merged - merge trips and tours_merged trips_merged = pd.merge( @@ -272,12 +268,11 @@ def summarize( # Annotate trips_merged expressions.annotate_preprocessors( - trips_merged, locals_d, skims, model_settings, "summarize" + state, trips_merged, locals_d, skims, model_settings, "summarize" ) for table_name, df in locals_d.items(): if table_name in model_settings: - meta = model_settings[table_name] df = eval(table_name) @@ -318,10 +313,12 @@ def summarize( # Output pipeline tables for expression development if model_settings["EXPORT_PIPELINE_TABLES"] is True: pipeline_table_dir = os.path.join(output_location, "pipeline_tables") - os.makedirs(config.output_file_path(pipeline_table_dir), exist_ok=True) + os.makedirs(state.get_output_file_path(pipeline_table_dir), exist_ok=True) for name, df in locals_d.items(): df.to_csv( - config.output_file_path(os.path.join(pipeline_table_dir, f"{name}.csv")) + state.get_output_file_path( + os.path.join(pipeline_table_dir, f"{name}.csv") + ) ) # Add classification functions to locals @@ -335,13 +332,11 @@ def summarize( ) for i, row in spec.iterrows(): - out_file = row["Output"] expr = row["Expression"] # Save temporary variables starting with underscores in locals_d if out_file.startswith("_"): - logger.debug(f"Temp Variable: {expr} -> {out_file}") locals_d[out_file] = eval(expr, globals(), locals_d) @@ -351,6 +346,8 @@ def summarize( resultset = eval(expr, globals(), locals_d) resultset.to_csv( - config.output_file_path(os.path.join(output_location, f"{out_file}.csv")), + state.get_output_file_path( + os.path.join(output_location, f"{out_file}.csv") + ), index=False, ) diff --git a/activitysim/abm/models/telecommute_frequency.py b/activitysim/abm/models/telecommute_frequency.py index 4596a89115..8629909aaf 100755 --- a/activitysim/abm/models/telecommute_frequency.py +++ b/activitysim/abm/models/telecommute_frequency.py @@ -1,17 +1,29 @@ # ActivitySim # See full license in LICENSE.txt. 
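summarize swaps the module-level path helpers for state methods: configuration files are located with `state.filesystem.get_config_file_path` and outputs are written under `state.get_output_file_path`. A small sketch of that I/O pattern with hypothetical file and folder names:

import os

import pandas as pd

from activitysim.core import workflow


def read_summarize_spec(state: workflow.State, file_name: str) -> pd.DataFrame:
    # configuration files are located via the state's filesystem object
    return pd.read_csv(state.filesystem.get_config_file_path(file_name), comment="#")


def write_summary(state: workflow.State, resultset: pd.DataFrame, out_file: str) -> None:
    # output paths are resolved through the state (was config.output_file_path)
    output_location = "summaries"
    os.makedirs(state.get_output_file_path(output_location), exist_ok=True)
    resultset.to_csv(
        state.get_output_file_path(os.path.join(output_location, f"{out_file}.csv")),
        index=False,
    )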
+from __future__ import annotations + import logging import pandas as pd -from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger("activitysim") -@inject.step() -def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def telecommute_frequency( + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, +) -> None: """ This model predicts the frequency of telecommute for a person (worker) who does not works from home. The alternatives of this model are 'No Telecommute', @@ -23,34 +35,36 @@ def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): trace_label = "telecommute_frequency" model_settings_file_name = "telecommute_frequency.yaml" - choosers = persons_merged.to_frame() + choosers = persons_merged choosers = choosers[choosers.workplace_zone_id > -1] logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("telecommute_frequency") + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + estimator = estimation.manager.begin_estimation(state, "telecommute_frequency") constants = config.get_model_constants(model_settings) # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -61,11 +75,11 @@ def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_choosers(choosers) choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="telecommute_frequency", estimator=estimator, @@ -81,16 +95,15 @@ def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["telecommute_frequency"] = ( choices.reindex(persons.index).fillna("").astype(str) ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "telecommute_frequency", persons.telecommute_frequency, value_counts=True ) - if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index 1e826ae919..c56547b265 100644 --- 
a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -1,27 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd from orca import orca -from activitysim.core import ( - config, - expressions, - inject, - logit, - los, - pipeline, - simulate, - tracing, -) -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.abm.models.util import annotate, school_escort_tours_trips, trip +from activitysim.abm.models.util.mode import run_tour_mode_choice_simulate +from activitysim.core import config, estimation, logit, los, simulate, tracing, workflow from activitysim.core.util import assign_in_place, reindex -from .util import estimation, trip, annotate, school_escort_tours_trips -from .util.mode import run_tour_mode_choice_simulate - logger = logging.getLogger(__name__) """ @@ -30,7 +21,9 @@ """ -def get_alts_from_segmented_nested_logit(model_settings, segment_name, trace_label): +def get_alts_from_segmented_nested_logit( + state: workflow.State, model_settings, segment_name, trace_label +): """Infer alts from logit spec Parameters @@ -45,7 +38,9 @@ def get_alts_from_segmented_nested_logit(model_settings, segment_name, trace_lab """ nest_spec = config.get_logit_model_settings(model_settings) - coefficients = simulate.get_segment_coefficients(model_settings, segment_name) + coefficients = state.filesystem.get_segment_coefficients( + model_settings, segment_name + ) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) tour_mode_alts = [] for nest in logit.each_nest(nest_spec): @@ -55,7 +50,9 @@ def get_alts_from_segmented_nested_logit(model_settings, segment_name, trace_lab return tour_mode_alts -def create_logsum_trips(tours, segment_column_name, model_settings, trace_label): +def create_logsum_trips( + state: workflow.State, tours, segment_column_name, model_settings, trace_label +): """ Construct table of trips from half-tours (1 inbound, 1 outbound) for each tour-mode. 
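In tour_mode_choice, `get_alts_from_segmented_nested_logit` now threads the state through so that segment coefficients come from `state.filesystem.get_segment_coefficients` before the nest spec is evaluated. The essential call sequence looks roughly like this; the helper name is illustrative, and the leaf-alternative collection is assumed to continue with the existing `logit.each_nest` loop:

from activitysim.core import config, simulate, workflow


def segment_nest_spec(state: workflow.State, model_settings, segment_name, trace_label):
    # raw NESTS block from the tour mode choice settings
    nest_spec = config.get_logit_model_settings(model_settings)

    # segment coefficients are resolved through the state's filesystem object
    coefficients = state.filesystem.get_segment_coefficients(
        model_settings, segment_name
    )

    # substitute coefficient values into the nest spec before walking it;
    # the function in the patch then collects leaf alternatives by iterating
    # logit.each_nest(nest_spec) exactly as before
    return simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label)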
@@ -72,11 +69,11 @@ def create_logsum_trips(tours, segment_column_name, model_settings, trace_label) pandas.DataFrame Table of trips: 2 per tour, with O/D and purpose inherited from tour """ - stop_frequency_alts = inject.get_injectable("stop_frequency_alts") + stop_frequency_alts = state.get_injectable("stop_frequency_alts") stop_freq = "0out_0in" # no intermediate stops tours["stop_frequency"] = stop_freq tours["primary_purpose"] = tours["tour_purpose"] - trips = trip.initialize_from_tours(tours, stop_frequency_alts) + trips = trip.initialize_from_tours(state, tours, stop_frequency_alts) trips["stop_frequency"] = stop_freq outbound = trips["outbound"] trips["depart"] = reindex(tours.start, trips.tour_id) @@ -86,7 +83,7 @@ def create_logsum_trips(tours, segment_column_name, model_settings, trace_label) # to get a set of coefficients from the spec segment_name = tours.iloc[0][segment_column_name] tour_mode_alts = get_alts_from_segmented_nested_logit( - model_settings, segment_name, trace_label + state, model_settings, segment_name, trace_label ) # repeat rows from the trips table iterating over tour mode @@ -100,7 +97,7 @@ def create_logsum_trips(tours, segment_column_name, model_settings, trace_label) return logsum_trips -def append_tour_leg_trip_mode_choice_logsums(tours): +def append_tour_leg_trip_mode_choice_logsums(state: workflow.State, tours): """Creates trip mode choice logsum column in tours table for each tour mode and leg Parameters @@ -112,7 +109,7 @@ def append_tour_leg_trip_mode_choice_logsums(tours): tours : pd.DataFrame Adds two * n_modes logsum columns to each tour row, e.g. "logsum_DRIVE_outbound" """ - trips = inject.get_table("trips").to_frame() + trips = state.get_dataframe("trips") trip_dir_mode_logsums = trips.pivot( index="tour_id", columns=["tour_mode", "outbound"], @@ -130,7 +127,7 @@ def append_tour_leg_trip_mode_choice_logsums(tours): def get_trip_mc_logsums_for_all_modes( - tours, segment_column_name, model_settings, trace_label + state: workflow.State, tours, segment_column_name, model_settings, trace_label ): """Creates pseudo-trips from tours and runs trip mode choice to get logsums @@ -150,51 +147,55 @@ def get_trip_mc_logsums_for_all_modes( # create pseudo-trips from tours for all tour modes logsum_trips = create_logsum_trips( - tours, segment_column_name, model_settings, trace_label + state, tours, segment_column_name, model_settings, trace_label ) # temporarily register trips in the pipeline - pipeline.replace_table("trips", logsum_trips) - tracing.register_traceable_table("trips", logsum_trips) - pipeline.get_rn_generator().add_channel("trips", logsum_trips) + state.add_table("trips", logsum_trips) + state.tracing.register_traceable_table("trips", logsum_trips) + state.get_rn_generator().add_channel("trips", logsum_trips) - # run trip mode choice on pseudo-trips. use orca instead of pipeline to + # run trip mode choice on pseudo-trips. 
use a direct call instead of pipeline to # execute the step because pipeline can only handle one open step at a time - orca.run(["trip_mode_choice"]) + from .trip_mode_choice import trip_mode_choice + + trip_mode_choice(state, logsum_trips, state.get("network_los")) # add trip mode choice logsums as new cols in tours - tours = append_tour_leg_trip_mode_choice_logsums(tours) + tours = append_tour_leg_trip_mode_choice_logsums(state, tours) # de-register logsum trips table - pipeline.get_rn_generator().drop_channel("trips") - tracing.deregister_traceable_table("trips") + state.get_rn_generator().drop_channel("trips") + state.tracing.deregister_traceable_table("trips") return tours -@inject.step() +@workflow.step def tour_mode_choice_simulate( - tours, persons_merged, network_los, chunk_size, trace_hh_id -): + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """ Tour mode choice simulate """ trace_label = "tour_mode_choice" model_settings_file_name = "tour_mode_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "tour_mode" segment_column_name = "tour_purpose" - primary_tours = tours.to_frame() + primary_tours = tours assert not (primary_tours.tour_category == "atwork").any() logger.info("Running %s with %d tours" % (trace_label, primary_tours.shape[0])) tracing.print_summary("tour_types", primary_tours.tour_type, value_counts=True) - persons_merged = persons_merged.to_frame() primary_tours_merged = pd.merge( primary_tours, persons_merged, @@ -278,10 +279,10 @@ def tour_mode_choice_simulate( # don't create estimation data bundle if trip mode choice is being called # from another model step (i.e. 
tour mode choice logsum creation) - if pipeline.get_rn_generator().step_name != "tour_mode_choice_simulate": + if state.get_rn_generator().step_name != "tour_mode_choice_simulate": estimator = None else: - estimator = estimation.manager.begin_estimation("tour_mode_choice") + estimator = estimation.manager.begin_estimation(state, "tour_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) @@ -304,14 +305,17 @@ def tour_mode_choice_simulate( # if trip logsums are used, run trip mode choice and append the logsums if model_settings.get("COMPUTE_TRIP_MODE_CHOICE_LOGSUMS", False): primary_tours_merged = get_trip_mc_logsums_for_all_modes( - primary_tours_merged, segment_column_name, model_settings, trace_label + state, + primary_tours_merged, + segment_column_name, + model_settings, + trace_label, ) choices_list = [] for tour_purpose, tours_segment in primary_tours_merged.groupby( segment_column_name ): - logger.info( "tour_mode_choice_simulate tour_type '%s' (%s tours)" % ( @@ -328,6 +332,7 @@ def tour_mode_choice_simulate( assert tours_segment.index.name == "tour_id" choices_df = run_tour_mode_choice_simulate( + state, tours_segment, tour_purpose, model_settings, @@ -337,7 +342,6 @@ def tour_mode_choice_simulate( skims=skims, constants=constants, estimator=estimator, - chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, tour_purpose), trace_choice_name="tour_mode_choice", ) @@ -354,22 +358,18 @@ def tour_mode_choice_simulate( # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") if tvpb_mode_path_types is not None: for mode, path_types in tvpb_mode_path_types.items(): - for direction, skim in zip( ["od", "do"], [tvpb_logsum_odt, tvpb_logsum_dot] ): - path_type = path_types[direction] skim_cache = skim.cache[path_type] print(f"mode {mode} direction {direction} path_type {path_type}") for c in skim_cache: - dest_col = f"{direction}_{c}" if dest_col not in choices_df: @@ -400,26 +400,26 @@ def tour_mode_choice_simulate( assign_in_place(primary_tours, choices_df) # update tours table with mode choice (and optionally logsums) - all_tours = tours.to_frame() + all_tours = tours assign_in_place(all_tours, choices_df) - if pipeline.is_table("school_escort_tours") & model_settings.get( + if state.is_table("school_escort_tours") & model_settings.get( "FORCE_ESCORTEE_CHAUFFEUR_MODE_MATCH", True ): all_tours = ( school_escort_tours_trips.force_escortee_tour_modes_to_match_chauffeur( - all_tours + state, all_tours ) ) - pipeline.replace_table("tours", all_tours) + state.add_table("tours", all_tours) # - annotate tours table if model_settings.get("annotate_tours"): - annotate.annotate_tours(model_settings, trace_label) + annotate.annotate_tours(state, model_settings, trace_label) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( primary_tours, label=tracing.extend_trace_label(trace_label, mode_column_name), slicer="tour_id", diff --git a/activitysim/abm/models/tour_od_choice.py b/activitysim/abm/models/tour_od_choice.py index 0825a21ea3..41f1593c05 100644 --- a/activitysim/abm/models/tour_od_choice.py +++ b/activitysim/abm/models/tour_od_choice.py @@ -1,22 +1,27 @@ # ActivitySim # See full license in LICENSE.txt. 
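Because tour_mode_choice can now be invoked as a plain function call from the trip-logsum path instead of via `orca.run`, the step checks the RN generator's current step name to decide whether it is the top-level step before opening an estimation bundle. Stripped down, and assuming the step name used in the diff (helper name is illustrative):

from activitysim.core import estimation, workflow


def maybe_begin_estimation(state: workflow.State):
    # open an estimation data bundle only when tour mode choice is the step
    # the pipeline is actually running, not when it is called from another
    # step (e.g. the trip mode choice logsum creation above)
    if state.get_rn_generator().step_name != "tour_mode_choice_simulate":
        return None
    return estimation.manager.begin_estimation(state, "tour_mode_choice")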
+from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, inject, pipeline, simulate, tracing -from activitysim.core.util import assign_in_place - -from .util import estimation, tour_od +from activitysim.abm.models.util import tour_od +from activitysim.core import estimation, los, workflow logger = logging.getLogger(__name__) -@inject.step() +@workflow.step def tour_od_choice( - tours, persons, households, land_use, network_los, chunk_size, trace_hh_id -): - + state: workflow.State, + tours: pd.DataFrame, + persons: pd.DataFrame, + households: pd.DataFrame, + land_use: pd.DataFrame, + network_los: los.Network_LOS, + chunk_size, +) -> None: """Simulates joint origin/destination choice for all tours. Given a set of previously generated tours, each tour needs to have an @@ -42,46 +47,41 @@ def tour_od_choice( lazy-loaded activitysim.los.Network_LOS object chunk_size simulation chunk size, set in main settings.yaml - trace_hh_id : int - households to trace, set in main settings.yaml """ trace_label = "tour_od_choice" model_settings_file_name = "tour_od_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) origin_col_name = model_settings["ORIG_COL_NAME"] dest_col_name = model_settings["DEST_COL_NAME"] alt_id_col = tour_od.get_od_id_col(origin_col_name, dest_col_name) + trace_hh_id = state.settings.trace_hh_id sample_table_name = model_settings.get("OD_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) logsum_column_name = model_settings.get("OD_CHOICE_LOGSUM_COLUMN_NAME", None) want_logsums = logsum_column_name is not None - tours = tours.to_frame() - # interaction_sample_simulate insists choosers appear in same order as alts tours = tours.sort_index() - estimator = estimation.manager.begin_estimation("tour_od_choice") + estimator = estimation.manager.begin_estimation(state, "tour_od_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_spec(model_settings, tag="SAMPLE_SPEC") estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(alt_id_col) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + state.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(state.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_od.run_tour_od( + state, tours, persons, want_logsums, @@ -115,12 +115,8 @@ def tour_od_choice( tours[logsum_column_name] = ( choices_df["logsum"].reindex(tours.index).astype("float") ) - tours["poe_id"] = tours[origin_col_name].map( - land_use.to_frame(columns="poe_id").poe_id - ) + tours["poe_id"] = tours[origin_col_name].map(land_use.poe_id) - households = households.to_frame() - persons = persons.to_frame() households[origin_col_name] = tours.set_index("household_id")[ origin_col_name ].reindex(households.index) @@ -134,16 +130,16 @@ def tour_od_choice( households["home_zone_id"] = households[origin_col_name] persons["home_zone_id"] = persons[origin_col_name] - pipeline.replace_table("tours", tours) - 
pipeline.replace_table("persons", persons) - pipeline.replace_table("households", households) + state.add_table("tours", tours) + state.add_table("persons", persons) + state.add_table("households", households) if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( tours, label="tours_od_choice", slicer="person_id", diff --git a/activitysim/abm/models/tour_scheduling_probabilistic.py b/activitysim/abm/models/tour_scheduling_probabilistic.py index 89fb416768..7d5961529b 100644 --- a/activitysim/abm/models/tour_scheduling_probabilistic.py +++ b/activitysim/abm/models/tour_scheduling_probabilistic.py @@ -1,33 +1,30 @@ # ActivitySim # See full license in LICENSE.txt +from __future__ import annotations import logging -import numpy as np import pandas as pd -from activitysim.abm.models.util import estimation -from activitysim.core import chunk, config, inject, logit, pipeline, tracing -from activitysim.core.util import reindex - -from .util import probabilistic_scheduling as ps +from activitysim.abm.models.util import probabilistic_scheduling as ps +from activitysim.core import chunk, estimation, workflow logger = logging.getLogger(__name__) def run_tour_scheduling_probabilistic( - tours_df, - scheduling_probs, - probs_join_cols, - depart_alt_base, - chunk_size, - trace_label, - trace_hh_id, + state: workflow.State, + tours_df: pd.DataFrame, + scheduling_probs: pd.DataFrame, + probs_join_cols: str | list[str], + depart_alt_base: int, + trace_label: str, ): """Make probabilistic tour scheduling choices in chunks Parameters ---------- + state: workflow.State tours_df : pandas.DataFrame table of tours scheduling_probs : pandas.DataFrame @@ -37,12 +34,8 @@ def run_tour_scheduling_probabilistic( depart_alt_base : int int to add to probs column index to get time period it represents. e.g. 
depart_alt_base = 5 means first column (column 0) represents 5 am - chunk_size : int - size of chooser chunks, set in main settings.yaml trace_label : str label to append to tracing logs and table names - trace_hh_id : int - households to trace Returns ------- @@ -50,10 +43,14 @@ def run_tour_scheduling_probabilistic( series of chosen alternative indices for each chooser """ result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - tours_df, chunk_size, trace_label, trace_label - ): + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, tours_df, trace_label, trace_label): choices = ps.make_scheduling_choices( + state, chooser_chunk, "departure", scheduling_probs, @@ -62,9 +59,9 @@ def run_tour_scheduling_probabilistic( first_trip_in_leg=False, report_failed_trips=True, trace_label=chunk_trace_label, - trace_hh_id=trace_hh_id, trace_choice_col_name="depart_return", clip_earliest_latest=False, + chunk_sizer=chunk_sizer, ) result_list.append(choices) @@ -72,8 +69,8 @@ def run_tour_scheduling_probabilistic( return choices -@inject.step() -def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): +@workflow.step +def tour_scheduling_probabilistic(state: workflow.State, tours: pd.DataFrame) -> None: """Makes tour departure and arrival choices by sampling from a probability lookup table This model samples tour scheduling choices from an exogenously defined probability @@ -83,7 +80,7 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): Parameters ---------- - tours : orca.DataFrameWrapper + tours : DataFrame lazy-loaded table of tours chunk_size : int size of chooser chunks, defined in main settings.yaml @@ -94,16 +91,20 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): trace_label = "tour_scheduling_probabilistic" model_settings_file_name = "tour_scheduling_probabilistic.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) depart_alt_base = model_settings.get("depart_alt_base", 0) - scheduling_probs_filepath = config.config_file_path(model_settings["PROBS_SPEC"]) + scheduling_probs_filepath = state.filesystem.get_config_file_path( + model_settings["PROBS_SPEC"] + ) scheduling_probs = pd.read_csv(scheduling_probs_filepath) probs_join_cols = model_settings["PROBS_JOIN_COLS"] - tours_df = tours.to_frame() + tours_df = tours # trip_scheduling is a probabilistic model ane we don't support estimation, # but we do need to override choices in estimation mode - estimator = estimation.manager.begin_estimation("tour_scheduling_probabilistic") + estimator = estimation.manager.begin_estimation( + state, "tour_scheduling_probabilistic" + ) if estimator: estimator.write_spec(model_settings, tag="PROBS_SPEC") estimator.write_model_settings(model_settings, model_settings_file_name) @@ -111,13 +112,12 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): estimator.write_choosers(tours_df[chooser_cols_for_estimation]) choices = run_tour_scheduling_probabilistic( + state, tours_df, scheduling_probs, probs_join_cols, depart_alt_base, - chunk_size, trace_label, - trace_hh_id, ) # convert alt index choices to depart/return times @@ -150,4 +150,4 @@ def tour_scheduling_probabilistic(tours, chunk_size, trace_hh_id): assert not tours_df["end"].isnull().any() assert not tours_df["duration"].isnull().any() - pipeline.replace_table("tours", tours_df) + 
state.add_table("tours", tours_df) diff --git a/activitysim/abm/models/transit_pass_ownership.py b/activitysim/abm/models/transit_pass_ownership.py index 92d97080f9..8fc23cc95d 100644 --- a/activitysim/abm/models/transit_pass_ownership.py +++ b/activitysim/abm/models/transit_pass_ownership.py @@ -1,17 +1,29 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -import numpy as np +import pandas as pd -from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger("activitysim") -@inject.step() -def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def transit_pass_ownership( + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, +) -> None: """ Transit pass ownership model. """ @@ -19,32 +31,34 @@ def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): trace_label = "transit_pass_ownership" model_settings_file_name = "transit_pass_ownership.yaml" - choosers = persons_merged.to_frame() + choosers = persons_merged logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("transit_pass_ownership") + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + estimator = estimation.manager.begin_estimation(state, "transit_pass_ownership") constants = config.get_model_constants(model_settings) # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -55,11 +69,11 @@ def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_choosers(choosers) choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="transit_pass_ownership", estimator=estimator, @@ -73,14 +87,13 @@ def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["transit_pass_ownership"] = choices.reindex(persons.index) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "transit_pass_ownership", persons.transit_pass_ownership, value_counts=True ) - if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons, 
label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/transit_pass_subsidy.py b/activitysim/abm/models/transit_pass_subsidy.py index 45a118fda8..cd6f3aa106 100644 --- a/activitysim/abm/models/transit_pass_subsidy.py +++ b/activitysim/abm/models/transit_pass_subsidy.py @@ -1,17 +1,29 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -import numpy as np +import pandas as pd -from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger("activitysim") -@inject.step() -def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def transit_pass_subsidy( + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, +) -> None: """ Transit pass subsidy model. """ @@ -19,32 +31,34 @@ def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): trace_label = "transit_pass_subsidy" model_settings_file_name = "transit_pass_subsidy.yaml" - choosers = persons_merged.to_frame() + choosers = persons_merged logger.info("Running %s with %d persons", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("transit_pass_subsidy") + model_settings = state.filesystem.read_model_settings(model_settings_file_name) + estimator = estimation.manager.begin_estimation(state, "transit_pass_subsidy") constants = config.get_model_constants(model_settings) # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) @@ -55,11 +69,11 @@ def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_choosers(choosers) choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="transit_pass_subsidy", estimator=estimator, @@ -73,14 +87,13 @@ def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_override_choices(choices) estimator.end_estimation() - persons = persons.to_frame() persons["transit_pass_subsidy"] = choices.reindex(persons.index) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "transit_pass_subsidy", persons.transit_pass_subsidy, value_counts=True ) - if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff 
--git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 2dad8d37c0..9eb7dc90d7 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -1,3 +1,7 @@ +# ActivitySim +# See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np @@ -8,14 +12,12 @@ chunk, config, expressions, - inject, interaction_simulate, logit, - pipeline, simulate, tracing, + workflow, ) -from activitysim.core.simulate import set_skim_wrapper_targets from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -164,7 +166,6 @@ def build_patterns(trips, time_windows): def get_spec_for_segment(omnibus_spec, segment): - spec = omnibus_spec[[segment]] # might as well ignore any spec rows with 0 utility @@ -174,15 +175,23 @@ def get_spec_for_segment(omnibus_spec, segment): return spec -def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_label"): +def choose_tour_leg_pattern( + state, + trip_segment, + patterns, + spec, + trace_label="trace_label", + *, + chunk_sizer: chunk.ChunkSizer, +): alternatives = generate_alternatives(trip_segment, STOP_TIME_DURATION).sort_index() - have_trace_targets = tracing.has_trace_targets(trip_segment) + have_trace_targets = state.tracing.has_trace_targets(trip_segment) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( trip_segment, tracing.extend_trace_label(trace_label, "choosers") ) - tracing.trace_df( + state.tracing.trace_df( alternatives, tracing.extend_trace_label(trace_label, "alternatives"), transpose=False, @@ -201,14 +210,14 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab interaction_df = alternatives.join(trip_segment, how="left", rsuffix="_chooser") - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) if have_trace_targets: - trace_rows, trace_ids = tracing.interaction_trace_rows( + trace_rows, trace_ids = state.tracing.interaction_trace_rows( interaction_df, trip_segment ) - tracing.trace_df( + state.tracing.trace_df( interaction_df, tracing.extend_trace_label(trace_label, "interaction_df"), transpose=False, @@ -220,13 +229,13 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( - spec, interaction_df, None, trace_label, trace_rows, estimator=None + state, spec, interaction_df, None, trace_label, trace_rows, estimator=None ) interaction_utilities = pd.concat( [interaction_df[STOP_TIME_DURATION], interaction_utilities], axis=1 ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) interaction_utilities = pd.merge( interaction_utilities.reset_index(), @@ -236,20 +245,20 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab ) if have_trace_targets: - tracing.trace_interaction_eval_results( + state.tracing.trace_interaction_eval_results( trace_eval_results, trace_ids, tracing.extend_trace_label(trace_label, "eval"), ) - tracing.trace_df( + state.tracing.trace_df( interaction_utilities, tracing.extend_trace_label(trace_label, "interaction_utilities"), transpose=False, ) del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) 
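trip_departure_choice illustrates the other half of the chunking change: the adaptive chunk generators now yield a `chunk_sizer` alongside each chooser chunk, and intermediate frames are logged against that object rather than the module-level `chunk.log_df`. A minimal loop using the four-tuple form from the patch, with a hypothetical per-chunk computation standing in for the real work:

import pandas as pd

from activitysim.core import chunk, workflow


def chunked_example(state: workflow.State, choosers: pd.DataFrame, trace_label: str):
    result_list = []
    for (
        i,
        chooser_chunk,
        chunk_trace_label,
        chunk_sizer,
    ) in chunk.adaptive_chunked_choosers(state, choosers, trace_label, trace_label):
        # hypothetical intermediate frame built for this chunk
        working = chooser_chunk.copy()
        chunk_sizer.log_df(chunk_trace_label, "working", working)

        # placeholder per-chooser result standing in for the real computation
        choices = pd.Series(0, index=working.index)
        result_list.append(choices)

        # drop the intermediate and tell the chunk sizer it is gone
        del working
        chunk_sizer.log_df(chunk_trace_label, "working", None)

    return pd.concat(result_list)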
interaction_utilities = interaction_utilities.groupby( [TOUR_ID, OUTBOUND, PATTERN_ID], as_index=False @@ -271,7 +280,7 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab sample_counts = ( interaction_utilities.groupby(interaction_utilities.index).size().values ) - chunk.log_df(trace_label, "sample_counts", sample_counts) + chunk_sizer.log_df(trace_label, "sample_counts", sample_counts) # max number of alternatvies for any chooser max_sample_count = sample_counts.max() @@ -286,28 +295,28 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts) del sample_counts - chunk.log_df(trace_label, "sample_counts", None) + chunk_sizer.log_df(trace_label, "sample_counts", None) # insert the zero-prob utilities to pad each alternative set to same size padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999) del inserts del interaction_utilities - chunk.log_df(trace_label, "interaction_utilities", None) + chunk_sizer.log_df(trace_label, "interaction_utilities", None) # reshape to array with one row per chooser, one column per alternative padded_utilities = padded_utilities.reshape(-1, max_sample_count) - chunk.log_df(trace_label, "padded_utilities", padded_utilities) + chunk_sizer.log_df(trace_label, "padded_utilities", padded_utilities) # convert to a dataframe with one row per chooser and one column per alternative utilities_df = pd.DataFrame(padded_utilities, index=tour_choosers.index.unique()) - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) del padded_utilities - chunk.log_df(trace_label, "padded_utilities", None) + chunk_sizer.log_df(trace_label, "padded_utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( utilities_df, tracing.extend_trace_label(trace_label, "utilities"), column_labels=["alternative", "utility"], @@ -316,16 +325,16 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative probs = logit.utils_to_probs( - utilities_df, trace_label=trace_label, trace_choosers=trip_segment + state, utilities_df, trace_label=trace_label, trace_choosers=trip_segment ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities_df - chunk.log_df(trace_label, "utilities_df", None) + chunk_sizer.log_df(trace_label, "utilities_df", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( probs, tracing.extend_trace_label(trace_label, "probs"), column_labels=["alternative", "probability"], @@ -335,14 +344,14 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample positions, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=trip_segment + state, probs, trace_label=trace_label, trace_choosers=trip_segment ) - chunk.log_df(trace_label, "positions", positions) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "positions", positions) + chunk_sizer.log_df(trace_label, "rands", rands) del probs - chunk.log_df(trace_label, "probs", None) + 
chunk_sizer.log_df(trace_label, "probs", None) # shouldn't have chosen any of the dummy pad utilities assert positions.max() < max_sample_count @@ -354,15 +363,15 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab # resulting pandas Int64Index has one element per chooser row and is in same order as choosers choices = tour_choosers[PATTERN_ID].take(positions + first_row_offsets) - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, tracing.extend_trace_label(trace_label, "choices"), columns=[None, PATTERN_ID], ) - tracing.trace_df( + state.tracing.trace_df( rands, tracing.extend_trace_label(trace_label, "rands"), columns=[None, "rand"], @@ -371,8 +380,7 @@ def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_lab return choices -def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): - +def apply_stage_two_model(state, omnibus_spec, trips, chunk_size, trace_label): if not trips.index.is_monotonic: trips = trips.sort_index() @@ -426,10 +434,8 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): i, chooser_chunk, chunk_trace_label, - ) in chunk.adaptive_chunked_choosers_by_chunk_id( - side_trips, chunk_size, trace_label - ): - + chunk_sizer, + ) in chunk.adaptive_chunked_choosers_by_chunk_id(state, side_trips, trace_label): for is_outbound, trip_segment in chooser_chunk.groupby(OUTBOUND): direction = OUTBOUND if is_outbound else "inbound" spec = get_spec_for_segment(omnibus_spec, direction) @@ -438,7 +444,12 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): patterns = build_patterns(trip_segment, time_windows) choices = choose_tour_leg_pattern( - trip_segment, patterns, spec, trace_label=segment_trace_label + state, + trip_segment, + patterns, + spec, + trace_label=segment_trace_label, + chunk_sizer=chunk_sizer, ) choices = pd.merge( @@ -466,15 +477,16 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): return trips["depart"].astype(int) -@inject.step() -def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_id): - +@workflow.step +def trip_departure_choice( + state: workflow.State, trips: pd.DataFrame, trips_merged: pd.DataFrame, skim_dict +) -> None: trace_label = "trip_departure_choice" - model_settings = config.read_model_settings("trip_departure_choice.yaml") + model_settings = state.filesystem.read_model_settings("trip_departure_choice.yaml") - spec = simulate.read_model_spec(file_name=model_settings["SPECIFICATION"]) + spec = state.filesystem.read_model_spec(file_name=model_settings["SPECIFICATION"]) - trips_merged_df = trips_merged.to_frame() + trips_merged_df = trips_merged # add tour-based chunk_id so we can chunk all trips in tour together tour_ids = trips_merged[TOUR_ID].unique() trips_merged_df["chunk_id"] = reindex( @@ -490,7 +502,7 @@ def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_i preprocessor_settings = model_settings.get("PREPROCESSOR", None) tour_legs = get_tour_legs(trips_merged_df) - pipeline.get_rn_generator().add_channel("tour_legs", tour_legs) + state.get_rn_generator().add_channel("tour_legs", tour_legs) if preprocessor_settings: od_skim = skim_dict.wrap("origin", "destination") @@ -508,18 +520,21 @@ def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_i ) expressions.assign_columns( + state, df=trips_merged_df, 
model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - choices = apply_stage_two_model(spec, trips_merged_df, chunk_size, trace_label) + choices = apply_stage_two_model( + state, spec, trips_merged_df, state.settings.chunk_size, trace_label + ) - trips_df = trips.to_frame() + trips_df = trips trip_length = len(trips_df) trips_df = pd.concat([trips_df, choices], axis=1) assert len(trips_df) == trip_length assert trips_df[trips_df["depart"].isnull()].empty - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 545cfee29f..4b0384cecb 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging from builtins import range from pathlib import Path @@ -7,6 +9,9 @@ import numpy as np import pandas as pd +from activitysim.abm.models.util.school_escort_tours_trips import ( + split_out_school_escorting_trips, +) from activitysim.abm.models.util.trip import ( cleanup_failed_trips, flag_failed_trip_leg_mates, @@ -15,23 +20,20 @@ from activitysim.core import ( chunk, config, + estimation, expressions, - inject, los, - pipeline, simulate, tracing, + workflow, ) +from activitysim.core.configuration.base import Any, PydanticBase from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.skim_dictionary import DataFrameMatrix from activitysim.core.tracing import print_elapsed_time from activitysim.core.util import assign_in_place, reindex -from ...core.configuration.base import Any, PydanticBase -from .util.school_escort_tours_trips import split_out_school_escorting_trips -from .util import estimation - logger = logging.getLogger(__name__) NO_DESTINATION = -1 @@ -74,7 +76,9 @@ class TripDestinationSettings(PydanticBase): """This setting is used by testing code to force failed trip_destination.""" +@workflow.func def _destination_sample( + state: workflow.State, primary_purpose, trips, alternatives, @@ -83,7 +87,6 @@ def _destination_sample( skims, alt_dest_col_name, estimator, - chunk_size, chunk_tag, trace_label, zone_layer=None, @@ -105,6 +108,7 @@ def _destination_sample( """ spec = simulate.spec_for_segment( + state, model_settings, spec_id="DESTINATION_SAMPLE_SPEC", segment_name=primary_purpose, @@ -112,7 +116,7 @@ def _destination_sample( ) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if state.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count @@ -138,9 +142,10 @@ def _destination_sample( ) locals_dict.update(skims) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_sample( + state, choosers=trips, alternatives=alternatives, sample_size=sample_size, @@ -150,7 +155,7 @@ def _destination_sample( spec=spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer=zone_layer, @@ -159,7 +164,9 @@ def _destination_sample( return choices +@workflow.func def destination_sample( + state: 
workflow.State, primary_purpose, trips, alternatives, @@ -170,13 +177,13 @@ def destination_sample( chunk_size, trace_label, ): - chunk_tag = "trip_destination.sample" skims = skim_hotel.sample_skims(presample=False) alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _destination_sample( + state, primary_purpose, trips, alternatives, @@ -185,7 +192,6 @@ def destination_sample( skims, alt_dest_col_name, estimator, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, ) @@ -194,7 +200,6 @@ def destination_sample( def aggregate_size_term_matrix(maz_size_term_matrix, network_los): - df = maz_size_term_matrix.df assert ALT_DEST_TAZ not in df @@ -207,7 +212,13 @@ def aggregate_size_term_matrix(maz_size_term_matrix, network_los): def choose_MAZ_for_TAZ( - taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label + state, + taz_sample, + MAZ_size_terms, + trips, + network_los, + alt_dest_col_name, + trace_label, ): """ Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ @@ -235,14 +246,14 @@ def choose_MAZ_for_TAZ( taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True) - trace_hh_id = inject.get_injectable("trace_hh_id", None) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) + trace_hh_id = state.settings.trace_hh_id + have_trace_targets = trace_hh_id and state.tracing.has_trace_targets(taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") # write taz choices, pick_counts, probs - trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df( + trace_targets = state.tracing.trace_targets(taz_sample) + state.tracing.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), transpose=False, @@ -298,7 +309,8 @@ def choose_MAZ_for_TAZ( # (preserve index, which will have duplicates as result of join) maz_taz = ( - network_los.get_maz_to_taz_series.rename(DEST_TAZ) + network_los.get_maz_to_taz_series(state) + .rename(DEST_TAZ) .rename_axis(index=DEST_MAZ) .to_frame() .reset_index() @@ -323,9 +335,11 @@ def choose_MAZ_for_TAZ( if have_trace_targets: # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term] - maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer="trip_id") + maz_sizes_trace_targets = state.tracing.trace_targets( + maz_sizes, slicer="trip_id" + ) trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df( + state.tracing.trace_df( trace_maz_sizes, label=tracing.extend_trace_label(trace_label, "maz_sizes"), transpose=False, @@ -358,7 +372,7 @@ def choose_MAZ_for_TAZ( assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) rands = ( - pipeline.get_rn_generator() + state.get_rn_generator() .random_for_df(chooser_df, n=taz_sample_size) .reshape(-1, 1) ) @@ -378,10 +392,11 @@ def choose_MAZ_for_TAZ( taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - - taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer="trip_id") + taz_choices_trace_targets = state.tracing.trace_targets( + taz_choices, slicer="trip_id" + ) trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df( + state.tracing.trace_df( trace_taz_choices_df, label=tracing.extend_trace_label(trace_label, "taz_choices"), transpose=False, @@ -407,7 +422,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - 
tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), transpose=False, @@ -423,7 +438,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), transpose=False, @@ -437,7 +452,7 @@ def choose_MAZ_for_TAZ( ) df = pd.concat([lhs_df, df], axis=1) df["rand"] = rands[taz_choices_trace_targets] - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), transpose=False, @@ -453,7 +468,9 @@ def choose_MAZ_for_TAZ( return taz_choices +@workflow.func def destination_presample( + state: workflow.State, primary_purpose, trips, alternatives, @@ -462,11 +479,8 @@ def destination_presample( skim_hotel, network_los, estimator, - chunk_size, - trace_hh_id, trace_label, ): - trace_label = tracing.extend_trace_label(trace_label, "presample") chunk_tag = "trip_destination.presample" # distinguish from trip_destination.sample @@ -494,6 +508,7 @@ def destination_presample( skims = skim_hotel.sample_skims(presample=True) taz_sample = _destination_sample( + state, primary_purpose, trips_taz, alternatives, @@ -502,7 +517,6 @@ def destination_presample( skims, alt_dest_col_name, estimator, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer="taz", @@ -510,7 +524,13 @@ def destination_presample( # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total maz_sample = choose_MAZ_for_TAZ( - taz_sample, size_term_matrix, trips, network_los, alt_dest_col_name, trace_label + state, + taz_sample, + size_term_matrix, + trips, + network_los, + alt_dest_col_name, + trace_label, ) assert alt_dest_col_name in maz_sample @@ -519,6 +539,7 @@ def destination_presample( def trip_destination_sample( + state: workflow.State, primary_purpose, trips, alternatives, @@ -527,7 +548,6 @@ def trip_destination_sample( skim_hotel, estimator, chunk_size, - trace_hh_id, trace_label, ): """ @@ -552,9 +572,9 @@ def trip_destination_sample( assert len(alternatives) > 0 # by default, enable presampling for multizone systems, unless they disable it in settings file - network_los = inject.get_injectable("network_los") + network_los = state.get_injectable("network_los") pre_sample_taz = network_los.zone_system != los.ONE_ZONE - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not state.settings.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -562,13 +582,13 @@ def trip_destination_sample( ) if pre_sample_taz: - logger.info( "Running %s trip_destination_presample with %d trips" % (trace_label, len(trips)) ) choices = destination_presample( + state, primary_purpose, trips, alternatives, @@ -577,13 +597,12 @@ def trip_destination_sample( skim_hotel, network_los, estimator, - chunk_size, - trace_hh_id, trace_label, ) else: choices = destination_sample( + state, primary_purpose, trips, alternatives, @@ -598,7 +617,9 @@ def trip_destination_sample( return choices +@workflow.func def compute_ood_logsums( + state: workflow.State, choosers, logsum_settings, nest_spec, @@ -623,13 +644,16 @@ def compute_ood_logsums( # in `chunk.chunk_log()` at chunk.py L927. 
To avoid failing this assertion, # the preprocessor must be called from within a "null chunker" as follows: with chunk.chunk_log( - tracing.extend_trace_label(trace_label, "annotate_preprocessor"), base=True + state, + tracing.extend_trace_label(trace_label, "annotate_preprocessor"), + base=True, ): expressions.annotate_preprocessors( - choosers, locals_dict, od_skims, logsum_settings, trace_label + state, choosers, locals_dict, od_skims, logsum_settings, trace_label ) logsums = simulate.simple_simulate_logsums( + state, choosers, logsum_spec, nest_spec, @@ -649,14 +673,14 @@ def compute_ood_logsums( def compute_logsums( + state: workflow.State, primary_purpose, - trips, + trips: pd.DataFrame, destination_sample, - tours_merged, + tours_merged: pd.DataFrame, model_settings, skim_hotel, - chunk_size, - trace_label, + trace_label: str, ): """ Calculate mode choice logsums using the same recipe as for trip_mode_choice, but do it twice @@ -674,7 +698,7 @@ def compute_logsums( chunk_tag = "trip_destination.compute_logsums" # FIXME should pass this in? - network_los = inject.get_injectable("network_los") + network_los = state.get_injectable("network_los") # - trips_merged - merge trips and tours_merged trips_merged = pd.merge( @@ -694,14 +718,20 @@ def compute_logsums( ).set_index("trip_id") assert choosers.index.equals(destination_sample.index) - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) - coefficients = simulate.get_segment_coefficients(logsum_settings, primary_purpose) + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) + coefficients = state.filesystem.get_segment_coefficients( + logsum_settings, primary_purpose + ) nest_spec = config.get_logit_model_settings(logsum_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) - logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) - logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) + logsum_spec = state.filesystem.read_model_spec(file_name=logsum_settings["SPEC"]) + logsum_spec = simulate.eval_coefficients( + state, logsum_spec, coefficients, estimator=None + ) locals_dict = {} locals_dict.update(config.get_model_constants(logsum_settings)) @@ -734,13 +764,14 @@ def compute_logsums( } ) destination_sample["od_logsum"] = compute_ood_logsums( + state, choosers, logsum_settings, nest_spec, logsum_spec, od_skims, locals_dict, - chunk_size, + state.settings.chunk_size, trace_label=tracing.extend_trace_label(trace_label, "od"), chunk_tag=chunk_tag, ) @@ -762,13 +793,14 @@ def compute_logsums( ) destination_sample["dp_logsum"] = compute_ood_logsums( + state, choosers, logsum_settings, nest_spec, logsum_spec, dp_skims, locals_dict, - chunk_size, + state.settings.chunk_size, trace_label=tracing.extend_trace_label(trace_label, "dp"), chunk_tag=chunk_tag, ) @@ -777,6 +809,7 @@ def compute_logsums( def trip_destination_simulate( + state: workflow.State, primary_purpose, trips, destination_sample, @@ -785,8 +818,6 @@ def trip_destination_simulate( size_term_matrix, skim_hotel, estimator, - chunk_size, - trace_hh_id, trace_label, ): """ @@ -802,6 +833,7 @@ def trip_destination_simulate( chunk_tag = "trip_destination.simulate" spec = simulate.spec_for_segment( + state, model_settings, spec_id="DESTINATION_SPEC", segment_name=primary_purpose, @@ -833,8 +865,9 @@ def trip_destination_simulate( ) locals_dict.update(skims) - log_alt_losers = config.setting("log_alt_losers", False) + 
log_alt_losers = state.settings.log_alt_losers destinations = interaction_sample_simulate( + state, choosers=trips, alternatives=destination_sample, spec=spec, @@ -845,7 +878,7 @@ def trip_destination_simulate( zero_prob_choice_val=NO_DESTINATION, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, trace_choice_name="trip_dest", @@ -874,7 +907,9 @@ def trip_destination_simulate( return destinations +@workflow.func def choose_trip_destination( + state: workflow.State, primary_purpose, trips, alternatives, @@ -886,16 +921,15 @@ def choose_trip_destination( skim_hotel, estimator, chunk_size, - trace_hh_id, trace_label, ): - logger.info("choose_trip_destination %s with %d trips", trace_label, trips.shape[0]) t0 = print_elapsed_time() # - trip_destination_sample destination_sample = trip_destination_sample( + state, primary_purpose=primary_purpose, trips=trips, alternatives=alternatives, @@ -904,7 +938,6 @@ def choose_trip_destination( skim_hotel=skim_hotel, estimator=estimator, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, ) @@ -924,20 +957,20 @@ def choose_trip_destination( # - compute logsums destination_sample = compute_logsums( + state, primary_purpose=primary_purpose, trips=trips, destination_sample=destination_sample, tours_merged=tours_merged, model_settings=model_settings, skim_hotel=skim_hotel, - chunk_size=chunk_size, trace_label=trace_label, ) t0 = print_elapsed_time("%s.compute_logsums" % trace_label, t0) - # - trip_destination_simulate destinations = trip_destination_simulate( + state, primary_purpose=primary_purpose, trips=trips, destination_sample=destination_sample, @@ -946,8 +979,6 @@ def choose_trip_destination( size_term_matrix=size_term_matrix, skim_hotel=skim_hotel, estimator=estimator, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, ) @@ -974,14 +1005,12 @@ def choose_trip_destination( class SkimHotel(object): def __init__(self, model_settings, network_los, trace_label): - self.model_settings = model_settings self.trace_label = tracing.extend_trace_label(trace_label, "skim_hotel") self.network_los = network_los self.zone_system = network_los.zone_system def sample_skims(self, presample): - o = self.model_settings["TRIP_ORIGIN"] d = self.model_settings["ALT_DEST_COL_NAME"] n = self.model_settings.get("PRIMARY_ORIGIN", "origin") @@ -1027,7 +1056,6 @@ def sample_skims(self, presample): return skims def logsum_skims(self): - o = self.model_settings["TRIP_ORIGIN"] d = self.model_settings["ALT_DEST_COL_NAME"] p = self.model_settings["PRIMARY_DEST"] @@ -1099,12 +1127,13 @@ def logsum_skims(self): return skims +@workflow.func def run_trip_destination( + state: workflow.State, trips, tours_merged, estimator, chunk_size, - trace_hh_id, trace_label, fail_some_trips_for_testing=False, ): @@ -1132,22 +1161,23 @@ def run_trip_destination( """ model_settings_file_name = "trip_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) preprocessor_settings = model_settings.get("preprocessor", None) - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None sample_table_name = 
model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) - land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") - network_los = inject.get_injectable("network_los") + land_use = state.get_dataframe("land_use") + size_terms = state.get_injectable("size_terms") + network_los = state.get_injectable("network_los") trips = trips.sort_index() trips["next_trip_id"] = np.roll(trips.index, -1) trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0) @@ -1162,7 +1192,7 @@ def run_trip_destination( # stop_frequency step calls trip.initialize_from_tours. But if this module is being # called from trip_destination_and_purpose, these columns will have been deleted # so they must be re-created - if pipeline.get_rn_generator().step_name == "trip_purpose_and_destination": + if state.get_rn_generator().step_name == "trip_purpose_and_destination": trips["destination"] = np.where(trips.outbound, tour_destination, tour_origin) trips["origin"] = np.where(trips.outbound, tour_origin, tour_destination) trips["failed"] = False @@ -1231,13 +1261,11 @@ def run_trip_destination( # - process intermediate trips in ascending trip_num order intermediate = trips.trip_num < trips.trip_count if intermediate.any(): - first_trip_num = trips[intermediate].trip_num.min() last_trip_num = trips[intermediate].trip_num.max() # iterate over trips in ascending trip_num order for trip_num in range(first_trip_num, last_trip_num + 1): - nth_trips = trips[intermediate & (trips.trip_num == trip_num)] nth_trace_label = tracing.extend_trace_label( trace_label, "trip_num_%s" % trip_num ) @@ -1252,6 +1280,7 @@ def run_trip_destination( # - annotate nth_trips if preprocessor_settings: expressions.assign_columns( + state, df=nth_trips, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -1273,6 +1302,7 @@ def run_trip_destination( choices_list = [] for primary_purpose, trips_segment in nth_trips.groupby("primary_purpose"): choices, destination_sample = choose_trip_destination( + state, primary_purpose, trips_segment, alternatives, @@ -1284,7 +1314,6 @@ def run_trip_destination( skim_hotel, estimator, chunk_size, - trace_hh_id, trace_label=tracing.extend_trace_label( nth_trace_label, primary_purpose ), @@ -1348,8 +1377,10 @@ def run_trip_destination( return trips, save_sample_df -@inject.step() -def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): +@workflow.step +def trip_destination( + state: workflow.State, trips: pd.DataFrame, tours_merged: pd.DataFrame +) -> None: """ Choose a destination for all intermediate trips based on trip purpose. @@ -1364,10 +1395,10 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): Parameters ---------- - trips : orca.DataFrameWrapper + trips : DataFrame The trips table. This table is edited in-place to add the trip destinations. - tours_merged : orca.DataFrameWrapper + tours_merged : DataFrame The tours table, with columns merged from persons and households as well. chunk_size : int If non-zero, iterate over trips using this chunk size.
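For readers following the refactor, the pattern applied throughout this patch reduces to a short sketch (illustrative only, not part of the patch; the step name, the YAML file argument, and the example_flag column are hypothetical): a model step now receives an explicit workflow.State, reads configuration through state.filesystem and run-wide settings from state.settings, and writes edited tables back with state.add_table.

import pandas as pd

from activitysim.core import workflow


@workflow.step
def example_step(state: workflow.State, trips: pd.DataFrame) -> None:
    # config files are resolved through state.filesystem rather than the config module
    model_settings = state.filesystem.read_model_settings("trip_destination.yaml")

    # run-wide settings formerly read with config.setting(...) now live on state.settings
    if state.settings.trace_hh_id:
        state.tracing.trace_df(trips, label="example_step", slicer="trip_id")

    # edited tables are written back with state.add_table (replacing pipeline.replace_table)
    trips = trips.copy()
    trips["example_flag"] = model_settings.get("CLEANUP", True)  # hypothetical column
    state.add_table("trips", trips)

The same substitutions recur in the hunks below: config.setting(...) becomes state.settings.<name>, inject.get_injectable becomes state.get_injectable, and pipeline.replace_table becomes state.add_table.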
@@ -1378,24 +1409,24 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): trace_label = "trip_destination" model_settings_file_name = "trip_destination.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) CLEANUP = model_settings.get("CLEANUP", True) fail_some_trips_for_testing = model_settings.get( "fail_some_trips_for_testing", False ) - trips_df = trips.to_frame() - tours_merged_df = tours_merged.to_frame() + trips_df = trips + tours_merged_df = tours_merged - if pipeline.is_table("school_escort_trips"): - school_escort_trips = pipeline.get_table("school_escort_trips") + if state.is_table("school_escort_trips"): + school_escort_trips = state.get_dataframe("school_escort_trips") # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips ) - estimator = estimation.manager.begin_estimation("trip_destination") + estimator = estimation.manager.begin_estimation(state, "trip_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) @@ -1403,30 +1434,25 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) estimator.write_table( - inject.get_injectable("size_terms"), "size_terms", append=False - ) - estimator.write_table( - inject.get_table("land_use").to_frame(), "landuse", append=False + state.get_injectable("size_terms"), "size_terms", append=False ) + estimator.write_table(state.get_dataframe("land_use"), "landuse", append=False) estimator.write_model_settings(model_settings, model_settings_file_name) logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) trips_df, save_sample_df = run_trip_destination( + state, trips_df, tours_merged_df, estimator=estimator, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, + chunk_size=state.settings.chunk_size, trace_label=trace_label, fail_some_trips_for_testing=fail_some_trips_for_testing, ) # testing feature to make sure at least one trip fails so trip_purpose_and_destination model is run - if ( - config.setting("testing_fail_trip_destination", False) - and not trips_df.failed.any() - ): + if state.settings.testing_fail_trip_destination and not trips_df.failed.any(): if (trips_df.trip_num < trips_df.trip_count).sum() == 0: raise RuntimeError( "can't honor 'testing_fail_trip_destination' setting because no intermediate trips" ) @@ -1439,12 +1465,12 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): if trips_df.failed.any(): logger.warning("%s %s failed trips", trace_label, trips_df.failed.sum()) - if inject.get_injectable("pipeline_file_prefix", None): - file_name = f"{trace_label}_failed_trips_{inject.get_injectable('pipeline_file_prefix')}" + if state.get_injectable("pipeline_file_prefix", None): + file_name = f"{trace_label}_failed_trips_{state.get_injectable('pipeline_file_prefix')}" else: file_name = f"{trace_label}_failed_trips" logger.info("writing failed trips to %s", file_name) - tracing.write_csv( + state.tracing.write_csv( trips_df[trips_df.failed], file_name=file_name, transpose=False ) @@ -1454,7 +1480,6 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): assert not trips_df.failed.any() if CLEANUP: - if trips_df.failed.any(): flag_failed_trip_leg_mates(trips_df,
"failed") @@ -1467,7 +1492,7 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): trips_df.drop(columns="failed", inplace=True, errors="ignore") - if pipeline.is_table("school_escort_trips"): + if state.is_table("school_escort_trips"): # setting destination for school escort trips se_trips_df["destination"] = reindex( school_escort_trips.destination, se_trips_df.index @@ -1484,10 +1509,10 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): trips_df.groupby("tour_id")["destination"].shift(), ).astype(int) - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( trips_df, label=trace_label, slicer="trip_id", @@ -1511,6 +1536,6 @@ def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): ) # lest they try to put tour samples into the same table - if pipeline.is_table(sample_table_name): + if state.is_table(sample_table_name): raise RuntimeError("sample table %s already exists" % sample_table_name) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) diff --git a/activitysim/abm/models/trip_matrices.py b/activitysim/abm/models/trip_matrices.py index 0c9e1f447f..5476b3983d 100644 --- a/activitysim/abm/models/trip_matrices.py +++ b/activitysim/abm/models/trip_matrices.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import logging @@ -7,13 +8,17 @@ import openmatrix as omx import pandas as pd -from activitysim.core import config, expressions, inject, los, pipeline +from activitysim.core import config, expressions, los, workflow logger = logging.getLogger(__name__) -@inject.step() -def write_trip_matrices(network_los): +@workflow.step(copy_tables=["trips"]) +def write_trip_matrices( + state: workflow.State, + network_los: los.Network_LOS, + trips: pd.DataFrame, +) -> None: """ Write trip matrices step. 
@@ -32,7 +37,6 @@ def write_trip_matrices(network_los): """ - trips = inject.get_table("trips", None) if trips is None: # this step is a NOP if there is no trips table # this might legitimately happen if they comment out some steps to debug but still want write_tables @@ -42,14 +46,16 @@ def write_trip_matrices(network_los): ) return - model_settings = config.read_model_settings("write_trip_matrices.yaml") - trips_df = annotate_trips(trips, network_los, model_settings) + model_settings = state.filesystem.read_model_settings("write_trip_matrices.yaml") + trips_df = annotate_trips(state, trips, network_los, model_settings) if bool(model_settings.get("SAVE_TRIPS_TABLE")): - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) - if "parking_location" in config.setting("models"): - parking_settings = config.read_model_settings("parking_location_choice.yaml") + if "parking_location" in state.settings.models: + parking_settings = state.filesystem.read_model_settings( + "parking_location_choice.yaml" + ) parking_taz_col_name = parking_settings["ALT_DEST_COL_NAME"] if parking_taz_col_name in trips_df: # TODO make parking zone negative, not zero, if not used @@ -78,7 +84,7 @@ def write_trip_matrices(network_los): dest_vals = aggregate_trips.index.get_level_values("destination") # use the land use table for the set of possible tazs - land_use = pipeline.get_table("land_use") + land_use = state.get_dataframe("land_use") zone_index = land_use.index assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) @@ -92,16 +98,18 @@ def write_trip_matrices(network_los): zone_labels = land_use.index write_matrices( - aggregate_trips, zone_labels, orig_index, dest_index, model_settings + state, aggregate_trips, zone_labels, orig_index, dest_index, model_settings ) elif network_los.zone_system == los.TWO_ZONE: # maz trips written to taz matrices logger.info("aggregating trips two zone...") trips_df["otaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["origin"]).TAZ.tolist() + state.get_dataframe("land_use").reindex(trips_df["origin"]).TAZ.tolist() ) trips_df["dtaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["destination"]).TAZ.tolist() + state.get_dataframe("land_use") + .reindex(trips_df["destination"]) + .TAZ.tolist() ) aggregate_trips = trips_df.groupby(["otaz", "dtaz"], sort=False).sum( numeric_only=True @@ -120,7 +128,7 @@ def write_trip_matrices(network_los): dest_vals = aggregate_trips.index.get_level_values("dtaz") try: - land_use_taz = pipeline.get_table("land_use_taz") + land_use_taz = state.get_dataframe("land_use_taz") except (KeyError, RuntimeError): pass # table missing, ignore else: @@ -128,7 +136,7 @@ def write_trip_matrices(network_los): orig_vals = orig_vals.map(land_use_taz["_original_TAZ"]) dest_vals = dest_vals.map(land_use_taz["_original_TAZ"]) - zone_index = pd.Index(network_los.get_tazs(), name="TAZ") + zone_index = pd.Index(network_los.get_tazs(state), name="TAZ") assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) @@ -136,19 +144,20 @@ def write_trip_matrices(network_los): _, dest_index = zone_index.reindex(dest_vals) write_matrices( - aggregate_trips, zone_index, orig_index, dest_index, model_settings + state, aggregate_trips, zone_index, orig_index, dest_index, model_settings ) elif ( network_los.zone_system == los.THREE_ZONE ): # maz trips written to taz and tap matrices - logger.info("aggregating trips three zone taz...") 
trips_df["otaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["origin"]).TAZ.tolist() + state.get_dataframe("land_use").reindex(trips_df["origin"]).TAZ.tolist() ) trips_df["dtaz"] = ( - pipeline.get_table("land_use").reindex(trips_df["destination"]).TAZ.tolist() + state.get_dataframe("land_use") + .reindex(trips_df["destination"]) + .TAZ.tolist() ) aggregate_trips = trips_df.groupby(["otaz", "dtaz"], sort=False).sum( numeric_only=True @@ -167,7 +176,7 @@ def write_trip_matrices(network_los): dest_vals = aggregate_trips.index.get_level_values("dtaz") try: - land_use_taz = pipeline.get_table("land_use_taz") + land_use_taz = state.get_dataframe("land_use_taz") except (KeyError, RuntimeError): pass # table missing, ignore else: @@ -175,7 +184,7 @@ def write_trip_matrices(network_los): orig_vals = orig_vals.map(land_use_taz["_original_TAZ"]) dest_vals = dest_vals.map(land_use_taz["_original_TAZ"]) - zone_index = pd.Index(network_los.get_tazs(), name="TAZ") + zone_index = pd.Index(network_los.get_tazs(state), name="TAZ") assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) @@ -183,7 +192,7 @@ def write_trip_matrices(network_los): _, dest_index = zone_index.reindex(dest_vals) write_matrices( - aggregate_trips, zone_index, orig_index, dest_index, model_settings + state, aggregate_trips, zone_index, orig_index, dest_index, model_settings ) logger.info("aggregating trips three zone tap...") @@ -211,11 +220,20 @@ def write_trip_matrices(network_los): _, dest_index = zone_index.reindex(dest_vals) write_matrices( - aggregate_trips, zone_index, orig_index, dest_index, model_settings, True + state, + aggregate_trips, + zone_index, + orig_index, + dest_index, + model_settings, + True, ) -def annotate_trips(trips, network_los, model_settings): +@workflow.func +def annotate_trips( + state: workflow.State, trips: pd.DataFrame, network_los, model_settings +): """ Add columns to local trips table. The annotator has access to the origin/destination skims and everything @@ -225,7 +243,7 @@ def annotate_trips(trips, network_los, model_settings): TABLES in the preprocessor settings. """ - trips_df = trips.to_frame() + trips_df = trips trace_label = "trip_matrices" @@ -246,7 +264,7 @@ def annotate_trips(trips, network_los, model_settings): locals_dict.update(constants) expressions.annotate_preprocessors( - trips_df, locals_dict, skims, model_settings, trace_label + state, trips_df, locals_dict, skims, model_settings, trace_label ) if not np.issubdtype(trips_df["trip_period"].dtype, np.integer): @@ -263,14 +281,20 @@ def annotate_trips(trips, network_los, model_settings): if hh_weight_col and hh_weight_col not in trips_df: logger.info("adding '%s' from households to trips table" % hh_weight_col) - household_weights = pipeline.get_table("households")[hh_weight_col] + household_weights = state.get_dataframe("households")[hh_weight_col] trips_df[hh_weight_col] = trips_df.household_id.map(household_weights) return trips_df def write_matrices( - aggregate_trips, zone_index, orig_index, dest_index, model_settings, is_tap=False + state: workflow.State, + aggregate_trips, + zone_index, + orig_index, + dest_index, + model_settings, + is_tap=False, ): """ Write aggregated trips to OMX format. 
@@ -294,9 +318,9 @@ def write_matrices( if matrix_is_tap == is_tap: # only write tap matrices to tap matrix files filename = matrix.get("file_name") - filepath = config.output_file_path(filename) + filepath = state.get_output_file_path(filename) logger.info("opening %s" % filepath) - file = omx.open_file(filepath, "w") # possibly overwrite existing file + file = omx.open_file(str(filepath), "w") # possibly overwrite existing file table_settings = matrix.get("tables") for table in table_settings: diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index e7bb200e45..8f3e9f418b 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -1,33 +1,33 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import logging import numpy as np import pandas as pd +from activitysim.abm.models.util import annotate, school_escort_tours_trips +from activitysim.abm.models.util.mode import mode_choice_simulate from activitysim.core import ( - assign, chunk, config, + estimation, expressions, - inject, los, - pipeline, simulate, tracing, + workflow, ) -from activitysim.core.pathbuilder import TransitVirtualPathBuilder from activitysim.core.util import assign_in_place -from .util import estimation, annotate, school_escort_tours_trips -from .util.mode import mode_choice_simulate - logger = logging.getLogger(__name__) -@inject.step() -def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): +@workflow.step +def trip_mode_choice( + state: workflow.State, trips: pd.DataFrame, network_los: los.Network_LOS +) -> None: """ Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip. @@ -39,12 +39,12 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): trace_label = "trip_mode_choice" model_settings_file_name = "trip_mode_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") mode_column_name = "trip_mode" - trips_df = trips.to_frame() + trips_df = trips logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) # give trip mode choice the option to run without calling tours_merged. Useful for xborder @@ -56,7 +56,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): if col not in trips_df.columns ] if len(tours_cols) > 0: - tours_merged = inject.get_table("tours_merged").to_frame(columns=tours_cols) + tours_merged = state.get_dataframe("tours_merged", columns=tours_cols) else: tours_merged = pd.DataFrame() @@ -152,24 +152,23 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # don't create estimation data bundle if trip mode choice is being called # from another model step (e.g. 
tour mode choice logsum creation) - if pipeline._PIPELINE.rng().step_name != "trip_mode_choice": + if state.current_model_name != "trip_mode_choice": estimator = None else: - estimator = estimation.manager.begin_estimation("trip_mode_choice") + estimator = estimation.manager.begin_estimation(state, "trip_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) estimator.write_spec(model_settings) estimator.write_model_settings(model_settings, model_settings_file_name) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) nest_spec = config.get_logit_model_settings(model_settings) cols_to_keep = model_settings.get("CHOOSER_COLS_TO_KEEP", None) choices_list = [] cols_to_keep_list = [] for primary_purpose, trips_segment in trips_merged.groupby("primary_purpose"): - segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose) logger.info( @@ -187,7 +186,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): tvpb_logsum_odt.extend_trace_label(primary_purpose) # tvpb_logsum_dot.extend_trace_label(primary_purpose) - coefficients = simulate.get_segment_coefficients( + coefficients = state.filesystem.get_segment_coefficients( model_settings, primary_purpose ) @@ -202,10 +201,17 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # have to initialize chunker for preprocessing in order to access # tvpb logsum terms in preprocessor expressions. with chunk.chunk_log( - tracing.extend_trace_label(trace_label, "preprocessing"), base=True + state, + tracing.extend_trace_label(trace_label, "preprocessing"), + base=True, ): expressions.annotate_preprocessors( - trips_segment, locals_dict, skims, model_settings, segment_trace_label + state, + trips_segment, + locals_dict, + skims, + model_settings, + segment_trace_label, ) if estimator: @@ -216,14 +222,14 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): locals_dict["timeframe"] = "trip" choices = mode_choice_simulate( + state, choosers=trips_segment, - spec=simulate.eval_coefficients(model_spec, coefficients, estimator), + spec=simulate.eval_coefficients(state, model_spec, coefficients, estimator), nest_spec=simulate.eval_nest_coefficients( nest_spec, coefficients, segment_trace_label ), skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=segment_trace_label, @@ -231,9 +237,9 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): estimator=estimator, ) - if trace_hh_id: + if state.settings.trace_hh_id: # trace the coefficients - tracing.trace_df( + state.tracing.trace_df( pd.Series(locals_dict), label=tracing.extend_trace_label(segment_trace_label, "constants"), transpose=False, @@ -243,7 +249,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # so we can trace with annotations assign_in_place(trips_segment, choices) - tracing.trace_df( + state.tracing.trace_df( trips_segment, label=tracing.extend_trace_label(segment_trace_label, "trip_mode"), slicer="tour_id", @@ -265,10 +271,8 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_type 
in tvpb_mode_path_types.items(): - skim_cache = tvpb_logsum_odt.cache[path_type] for c in skim_cache: @@ -288,8 +292,7 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): ) estimator.write_override_choices(choices_df.trip_mode) estimator.end_estimation() - trips_df = trips.to_frame() - + trips_df = trips # adding columns from the chooser table to include in final output if len(cols_to_keep_list) > 0: cols_to_keep_df = pd.concat(cols_to_keep_list) @@ -297,12 +300,12 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): assign_in_place(trips_df, choices_df) - if pipeline.is_table("school_escort_tours") & model_settings.get( + if state.is_table("school_escort_tours") & model_settings.get( "FORCE_ESCORTEE_CHAUFFEUR_MODE_MATCH", True ): trips_df = ( school_escort_tours_trips.force_escortee_trip_modes_to_match_chauffeur( - trips_df + state, trips_df ) ) @@ -314,13 +317,13 @@ def trip_mode_choice(trips, network_los, chunk_size, trace_hh_id): assert not trips_df[mode_column_name].isnull().any() - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) if model_settings.get("annotate_trips"): - annotate.annotate_trips(model_settings, trace_label, locals_dict) + annotate.annotate_trips(state, model_settings, trace_label, locals_dict) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( trips_df, label=tracing.extend_trace_label(trace_label, "trip_mode"), slicer="trip_id", diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 1e48444e73..7af387e21c 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -1,23 +1,26 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging + import numpy as np import pandas as pd +from activitysim.abm.models.util.school_escort_tours_trips import ( + split_out_school_escorting_trips, +) from activitysim.core import ( chunk, config, + estimation, expressions, - inject, logit, - pipeline, simulate, tracing, + workflow, ) - -from .util import estimation from activitysim.core.util import reindex -from .util.school_escort_tours_trips import split_out_school_escorting_trips logger = logging.getLogger(__name__) @@ -45,6 +48,7 @@ def map_coefficients(spec, coefficients): def choose_intermediate_trip_purpose( + state: workflow.State, trips, probs_spec, estimator, @@ -52,6 +56,8 @@ def choose_intermediate_trip_purpose( use_depart_time, trace_hh_id, trace_label, + *, + chunk_sizer: chunk.ChunkSizer, ): """ choose purpose for intermediate trips based on probs_spec @@ -68,7 +74,7 @@ def choose_intermediate_trip_purpose( purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols] num_trips = len(trips.index) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips) + have_trace_targets = trace_hh_id and state.tracing.has_trace_targets(trips) # probs should sum to 1 across rows sum_probs = probs_spec[purpose_cols].sum(axis=1) @@ -78,10 +84,9 @@ def choose_intermediate_trip_purpose( choosers = pd.merge( trips.reset_index(), probs_spec, on=probs_join_cols, how="left" ).set_index("trip_id") - chunk.log_df(trace_label, "choosers", choosers) + chunk_sizer.log_df(trace_label, "choosers", choosers) if use_depart_time: - # select the matching depart range (this should result in exactly one chooser row per trip) chooser_probs = (choosers.start >= choosers["depart_range_start"]) & ( choosers.start <= choosers["depart_range_end"]
@@ -89,7 +94,6 @@ def choose_intermediate_trip_purpose( # if we failed to match a row in probs_spec if chooser_probs.sum() < num_trips: - # this can happen if the spec doesn't have probs for the trips matching a trip's probs_join_cols missing_trip_ids = trips.index[ ~trips.index.isin(choosers.index[chooser_probs]) @@ -100,7 +104,7 @@ def choose_intermediate_trip_purpose( ] # join to persons for better diagnostics - persons = inject.get_table("persons").to_frame() + persons = state.get_dataframe("persons") persons_cols = [ "age", "is_worker", @@ -129,7 +133,9 @@ def choose_intermediate_trip_purpose( file_name, ) ) - tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False) + state.tracing.write_csv( + unmatched_choosers, file_name=file_name, transpose=False + ) raise RuntimeError( "Some trips could not be matched to probs based on join columns %s." % probs_join_cols @@ -147,20 +153,20 @@ def choose_intermediate_trip_purpose( estimator.write_table(choosers[probs_cols], "probs", append=True) choices, rands = logit.make_choices( - choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers + state, choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers ) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, "%s.choices" % trace_label, columns=[None, "trip_purpose"] ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + state.tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) choices = choices.map(pd.Series(purpose_cols)) return choices -def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): +def run_trip_purpose(state: workflow.State, trips_df, estimator, trace_label): """ trip purpose - main functionality separated from model step so it can be called iteratively @@ -181,15 +187,17 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): chunk_tag = "trip_purpose" model_settings_file_name = "trip_purpose.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) probs_join_cols = model_settings.get("probs_join_cols", PROBS_JOIN_COLUMNS) spec_file_name = model_settings.get("PROBS_SPEC", "trip_purpose_probs.csv") - probs_spec = pd.read_csv(config.config_file_path(spec_file_name), comment="#") + probs_spec = pd.read_csv( + state.filesystem.get_config_file_path(spec_file_name), comment="#" + ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation - # coefficients_df = simulate.read_model_coefficients(model_settings) + # coefficients_df = state.filesystem.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) if estimator: @@ -221,6 +229,7 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): if preprocessor_settings: locals_dict = config.get_model_constants(model_settings) expressions.assign_columns( + state, df=trips_df, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -229,22 +238,27 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): use_depart_time = model_settings.get("use_depart_time", True) - for i, trips_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - trips_df, chunk_size, chunk_tag, trace_label - ): + for ( + i, + trips_chunk, + 
chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, trips_df, chunk_tag, trace_label): choices = choose_intermediate_trip_purpose( + state, trips_chunk, probs_spec, estimator, probs_join_cols=probs_join_cols, use_depart_time=use_depart_time, - trace_hh_id=trace_hh_id, + trace_hh_id=state.settings.trace_hh_id, trace_label=chunk_trace_label, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -252,9 +266,8 @@ def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): return choices -@inject.step() -def trip_purpose(trips, chunk_size, trace_hh_id): - +@workflow.step +def trip_purpose(state: workflow.State, trips: pd.DataFrame) -> None: """ trip purpose model step - calls run_trip_purpose to run the actual model @@ -262,16 +275,16 @@ def trip_purpose(trips, chunk_size, trace_hh_id): """ trace_label = "trip_purpose" - trips_df = trips.to_frame() + trips_df = trips - if pipeline.is_table("school_escort_trips"): - school_escort_trips = pipeline.get_table("school_escort_trips") + if state.is_table("school_escort_trips"): + school_escort_trips = state.get_dataframe("school_escort_trips") # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips ) - estimator = estimation.manager.begin_estimation("trip_purpose") + estimator = estimation.manager.begin_estimation(state, "trip_purpose") if estimator: chooser_cols_for_estimation = [ "person_id", @@ -282,10 +295,9 @@ def trip_purpose(trips, chunk_size, trace_hh_id): estimator.write_choosers(trips_df[chooser_cols_for_estimation]) choices = run_trip_purpose( + state, trips_df, estimator, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label, ) @@ -299,7 +311,7 @@ def trip_purpose(trips, chunk_size, trace_hh_id): trips_df["purpose"] = choices - if pipeline.is_table("school_escort_trips"): + if state.is_table("school_escort_trips"): # setting purpose for school escort trips se_trips_df["purpose"] = reindex(school_escort_trips.purpose, se_trips_df.index) # merge trips back together preserving index order @@ -309,10 +321,10 @@ def trip_purpose(trips, chunk_size, trace_hh_id): # we should have assigned a purpose to all trips assert not trips_df.purpose.isnull().any() - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( trips_df, label=trace_label, slicer="trip_id", diff --git a/activitysim/abm/models/trip_purpose_and_destination.py b/activitysim/abm/models/trip_purpose_and_destination.py index 31bca977ec..3e7baf2b02 100644 --- a/activitysim/abm/models/trip_purpose_and_destination.py +++ b/activitysim/abm/models/trip_purpose_and_destination.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import pandas as pd @@ -10,64 +12,70 @@ cleanup_failed_trips, flag_failed_trip_leg_mates, ) -from activitysim.core import config, inject, pipeline, tracing +from activitysim.core import estimation, tracing, workflow from activitysim.core.util import assign_in_place -from .util import estimation - logger = logging.getLogger(__name__) +@workflow.func def run_trip_purpose_and_destination( - trips_df, tours_merged_df, chunk_size, trace_hh_id, trace_label + state: workflow.State, + trips_df, + tours_merged_df, + chunk_size, + trace_label, ): - assert not trips_df.empty + trace_hh_id = state.settings.trace_hh_id choices = run_trip_purpose( + state, trips_df, estimator=None, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "purpose"), ) trips_df["purpose"] = choices trips_df, save_sample_df = run_trip_destination( + state, trips_df, tours_merged_df, estimator=None, chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "destination"), ) return trips_df, save_sample_df -@inject.step() -def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): - +@workflow.step +def trip_purpose_and_destination( + state: workflow.State, + trips: pd.DataFrame, + tours_merged: pd.DataFrame, +) -> None: trace_label = "trip_purpose_and_destination" - model_settings = config.read_model_settings("trip_purpose_and_destination.yaml") + model_settings = state.filesystem.read_model_settings( + "trip_purpose_and_destination.yaml" + ) # for consistency, read sample_table_name setting from trip_destination settings file - trip_destination_model_settings = config.read_model_settings( + trip_destination_model_settings = state.filesystem.read_model_settings( "trip_destination.yaml" ) sample_table_name = trip_destination_model_settings.get( "DEST_CHOICE_SAMPLE_TABLE_NAME" ) want_sample_table = ( - config.setting("want_dest_choice_sample_tables") - and sample_table_name is not None + state.settings.want_dest_choice_sample_tables and sample_table_name is not None ) MAX_ITERATIONS = model_settings.get("MAX_ITERATIONS", 5) - trips_df = trips.to_frame() - tours_merged_df = tours_merged.to_frame() + trips_df = trips + tours_merged_df = tours_merged if trips_df.empty: logger.info("%s - no trips. Nothing to do." % trace_label) @@ -79,7 +87,6 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry if "destination" in trips_df: - if "failed" not in trips_df.columns: # trip_destination model cleaned up any failed trips logger.info("%s - no failed column from prior model run." % trace_label) @@ -89,7 +96,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): # 'failed' column but no failed trips from prior run of trip_destination logger.info("%s - no failed trips from prior model run." 
% trace_label) trips_df.drop(columns="failed", inplace=True) - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) return else: @@ -102,19 +109,19 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): logger.info("Rerunning %s failed trips and leg-mates" % trips_df.shape[0]) # drop any previously saved samples of failed trips - if want_sample_table and pipeline.is_table(sample_table_name): + if want_sample_table and state.is_table(sample_table_name): logger.info("Dropping any previously saved samples of failed trips") - save_sample_df = pipeline.get_table(sample_table_name) + save_sample_df = state.get_dataframe(sample_table_name) save_sample_df.drop(trips_df.index, level="trip_id", inplace=True) - pipeline.replace_table(sample_table_name, save_sample_df) + state.add_table(sample_table_name, save_sample_df) del save_sample_df # if we estimated trip_destination, there should have been no failed trips # if we didn't, but it is enabled, it is probably a configuration error # if we just estimated trip_purpose, it isn't clear what they are trying to do, nor how to handle it assert not ( - estimation.manager.begin_estimation("trip_purpose") - or estimation.manager.begin_estimation("trip_destination") + estimation.manager.begin_estimation(state, "trip_purpose") + or estimation.manager.begin_estimation(state, "trip_destination") ) processed_trips = [] @@ -122,7 +129,6 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): i = 0 TRIP_RESULT_COLUMNS = ["purpose", "destination", "origin", "failed"] while True: - i += 1 for c in TRIP_RESULT_COLUMNS: @@ -130,16 +136,16 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): del trips_df[c] trips_df, save_sample_df = run_trip_purpose_and_destination( + state, trips_df, tours_merged_df, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, + chunk_size=state.settings.chunk_size, trace_label=tracing.extend_trace_label(trace_label, "i%s" % i), ) # # if testing, make sure at least one trip fails if ( - config.setting("testing_fail_trip_destination", False) + state.settings.testing_fail_trip_destination and (i == 1) and not trips_df.failed.any() ): @@ -162,7 +168,7 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): ) file_name = "%s_i%s_failed_trips" % (trace_label, i) logger.info("writing failed trips to %s" % file_name) - tracing.write_csv( + state.tracing.write_csv( trips_df[trips_df.failed], file_name=file_name, transpose=False ) @@ -202,34 +208,34 @@ def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): logger.info( "adding %s samples to %s" % (len(save_sample_df), sample_table_name) ) - pipeline.extend_table(sample_table_name, save_sample_df) + state.extend_table(sample_table_name, save_sample_df) logger.info( "%s %s failed trips after %s iterations" % (trace_label, processed_trips.failed.sum(), i) ) - trips_df = trips.to_frame() + trips_df = trips assign_in_place(trips_df, processed_trips) trips_df = cleanup_failed_trips(trips_df) - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) # check to make sure we wrote sample file if requested if want_sample_table and len(trips_df) > 0: - assert pipeline.is_table(sample_table_name) + assert state.is_table(sample_table_name) # since we have saved samples for all successful trips # once we discard failed trips, we should have samples for all trips - save_sample_df = pipeline.get_table(sample_table_name) + save_sample_df =
state.get_dataframe(sample_table_name) # expect samples only for intermediate trip destinations assert len(save_sample_df.index.get_level_values(0).unique()) == len( trips_df[trips_df.trip_num < trips_df.trip_count] ) del save_sample_df - if trace_hh_id: - tracing.trace_df( + if state.settings.trace_hh_id: + state.tracing.trace_df( trips_df, label=trace_label, slicer="trip_id", diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index 54c3eb201c..15fe0c6d9b 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import warnings from builtins import range @@ -7,14 +9,14 @@ import numpy as np import pandas as pd -from activitysim.abm.models.util import estimation +from activitysim.abm.models.util import probabilistic_scheduling as ps +from activitysim.abm.models.util.school_escort_tours_trips import ( + split_out_school_escorting_trips, +) from activitysim.abm.models.util.trip import cleanup_failed_trips, failed_trip_cohorts -from activitysim.core import chunk, config, expressions, inject, pipeline, tracing +from activitysim.core import chunk, config, estimation, expressions, tracing, workflow from activitysim.core.util import reindex -from .util import probabilistic_scheduling as ps -from .util.school_escort_tours_trips import split_out_school_escorting_trips - logger = logging.getLogger(__name__) """ @@ -189,24 +191,26 @@ def update_tour_earliest(trips, outbound_choices, logic_version: int): def schedule_trips_in_leg( + state: workflow.State, outbound, trips, probs_spec, model_settings, is_last_iteration, - trace_hh_id, trace_label, + *, + chunk_sizer: chunk.ChunkSizer, ): """ Parameters ---------- + state outbound trips probs_spec depart_alt_base is_last_iteration - trace_hh_id trace_label Returns @@ -237,6 +241,7 @@ def schedule_trips_in_leg( "Invalid scheduling mode specified: {0}.".format(scheduling_mode), "Please select one of ['departure', 'stop_duration', 'relative'] and try again.", ) + raise ValueError(f"Invalid scheduling mode specified: {scheduling_mode}") # logger.debug("%s scheduling %s trips" % (trace_label, trips.shape[0])) @@ -274,7 +279,7 @@ def schedule_trips_in_leg( ADJUST_NEXT_DEPART_COL = "latest" trips.next_trip_id = trips.next_trip_id.where(~is_final, NO_TRIP_ID) - network_los = inject.get_injectable("network_los") + network_los = state.get_injectable("network_los") locals_dict = {"network_los": network_los} locals_dict.update(config.get_model_constants(model_settings)) @@ -285,6 +290,7 @@ def schedule_trips_in_leg( # - annotate trips if preprocessor_settings: expressions.assign_columns( + state, df=trips, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -303,6 +309,7 @@ def schedule_trips_in_leg( nth_trips = trips[trips.trip_num == trips.trip_count - i] choices = ps.make_scheduling_choices( + state, nth_trips, scheduling_mode, probs_spec, @@ -310,8 +317,8 @@ def schedule_trips_in_leg( depart_alt_base, first_trip_in_leg=first_trip_in_leg, report_failed_trips=is_last_iteration, - trace_hh_id=trace_hh_id, trace_label=nth_trace_label, + chunk_sizer=chunk_sizer, ) # most initial departure (when no choice was made because all probs were zero) @@ -347,7 +354,7 @@ def schedule_trips_in_leg( result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list)
first_trip_in_leg = False @@ -358,15 +365,16 @@ def schedule_trips_in_leg( def run_trip_scheduling( + state: workflow.State, trips_chunk, tours, probs_spec, model_settings, estimator, is_last_iteration, - chunk_size, - trace_hh_id, trace_label, + *, + chunk_sizer: chunk.ChunkSizer, ): set_tour_hour(trips_chunk, tours) set_stop_num(trips_chunk) @@ -382,17 +390,18 @@ def run_trip_scheduling( leg_chunk = trips_chunk[trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(trace_label, "outbound") choices = schedule_trips_in_leg( + state, outbound=True, trips=leg_chunk, probs_spec=probs_spec, model_settings=model_settings, is_last_iteration=is_last_iteration, - trace_hh_id=trace_hh_id, trace_label=leg_trace_label, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) # departure time of last outbound trips must constrain # departure times for initial inbound trips @@ -402,25 +411,30 @@ def run_trip_scheduling( leg_chunk = trips_chunk[~trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(trace_label, "inbound") choices = schedule_trips_in_leg( + state, outbound=False, trips=leg_chunk, probs_spec=probs_spec, model_settings=model_settings, is_last_iteration=is_last_iteration, - trace_hh_id=trace_hh_id, trace_label=leg_trace_label, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) choices = pd.concat(result_list) return choices -@inject.step() -def trip_scheduling(trips, tours, chunk_size, trace_hh_id): +@workflow.step(copy_tables=False) +def trip_scheduling( + state: workflow.State, + trips: pd.DataFrame, + tours: pd.DataFrame, +) -> None: """ Trip scheduling assigns depart times for trips within the start, end limits of the tour. 
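The probabilistic mechanics used here (and in trip_purpose above) follow the usual cumulative-probability trick: one uniform draw per chooser selects the first alternative whose cumulative probability exceeds the draw. A self-contained sketch with made-up departure-hour probabilities; the real model draws its randoms from the pipeline's reproducible random-number generator rather than a local numpy generator:

import numpy as np
import pandas as pd

# toy probability spec: one row of departure-hour probabilities per trip
probs = pd.DataFrame(
    {"h8": [0.5, 0.2], "h9": [0.3, 0.5], "h10": [0.2, 0.3]},
    index=pd.Index([101, 102], name="trip_id"),
)

# one uniform draw per trip
rands = np.random.default_rng(0).random((len(probs), 1))

# the chosen column is the first one whose cumulative probability exceeds the draw
positions = (probs.cumsum(axis=1).to_numpy() < rands).sum(axis=1)
depart = pd.Series(positions, index=probs.index).map(pd.Series(probs.columns))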
@@ -468,13 +482,12 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): """ trace_label = "trip_scheduling" model_settings_file_name = "trip_scheduling.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) - trips_df = trips.to_frame() - tours = tours.to_frame() + trips_df = trips.copy() - if pipeline.is_table("school_escort_trips"): - school_escort_trips = pipeline.get_table("school_escort_trips") + if state.is_table("school_escort_trips"): + school_escort_trips = state.get_dataframe("school_escort_trips") # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips @@ -486,7 +499,7 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): # trip_scheduling is a probabilistic model and we don't support estimation, # but we do need to override choices in estimation mode - estimator = estimation.manager.begin_estimation("trip_scheduling") + estimator = estimation.manager.begin_estimation(state, "trip_scheduling") if estimator: estimator.write_spec(model_settings, tag="PROBS_SPEC") estimator.write_model_settings(model_settings, model_settings_file_name) @@ -505,11 +518,12 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): estimator.write_choosers(trips_df[chooser_cols_for_estimation]) probs_spec_file = model_settings.get("PROBS_SPEC", "trip_scheduling_probs.csv") - logger.debug(f"probs_spec_file: {config.config_file_path(probs_spec_file)}") - probs_spec = pd.read_csv(config.config_file_path(probs_spec_file), comment="#") + probs_spec = pd.read_csv( + state.filesystem.get_config_file_path(probs_spec_file), comment="#" + ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation - # coefficients_df = simulate.read_model_coefficients(model_settings) + # coefficients_df = state.filesystem.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) # add tour-based chunk_id so we can chunk all trips in tour together @@ -529,13 +543,16 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): chunk_i, trips_chunk, chunk_trace_label, + chunk_sizer, ) in chunk.adaptive_chunked_choosers_by_chunk_id( - trips_df, chunk_size, trace_label, trace_label + state, trips_df, trace_label, trace_label ): i = 0 while (i < max_iterations) and not trips_chunk.empty: # only chunk log first iteration since memory use declines with each iteration - with chunk.chunk_log(trace_label) if i == 0 else chunk.chunk_log_skip(): + with chunk.chunk_log( + state, trace_label + ) if i == 0 else chunk.chunk_log_skip(): i += 1 is_last_iteration = i == max_iterations @@ -548,15 +565,15 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): ) choices = run_trip_scheduling( + state, trips_chunk, tours, probs_spec, model_settings, estimator=estimator, is_last_iteration=is_last_iteration, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, trace_label=trace_label_i, + chunk_sizer=chunk_sizer, ) # boolean series of trips whose individual trip scheduling failed @@ -576,9 +593,9 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): choices_list.append(choices) - trips_df = trips.to_frame() + trips_df = trips.copy() - if
state.is_table("school_escort_trips"): # separate out school escorting trips to exclude them from the model and estimation data bundle trips_df, se_trips_df, full_trips_index = split_out_school_escorting_trips( trips_df, school_escort_trips @@ -615,7 +632,7 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): trips_df["depart"] = choices - if pipeline.is_table("school_escort_trips"): + if state.is_table("school_escort_trips"): # setting destination for school escort trips se_trips_df["depart"] = reindex(school_escort_trips.depart, se_trips_df.index) non_se_trips_df["depart"] = reindex(trips_df.depart, non_se_trips_df.index) @@ -631,4 +648,4 @@ def trip_scheduling(trips, tours, chunk_size, trace_hh_id): assert not trips_df.depart.isnull().any() - pipeline.replace_table("trips", trips_df) + state.add_table("trips", trips_df) diff --git a/activitysim/abm/models/trip_scheduling_choice.py b/activitysim/abm/models/trip_scheduling_choice.py index 5a7abe1f53..91630c340f 100644 --- a/activitysim/abm/models/trip_scheduling_choice.py +++ b/activitysim/abm/models/trip_scheduling_choice.py @@ -1,4 +1,9 @@ +# ActivitySim +# See full license in LICENSE.txt. +from __future__ import annotations + import logging +from typing import Mapping import numpy as np import pandas as pd @@ -7,15 +12,7 @@ generate_alternative_sizes, get_time_windows, ) -from activitysim.core import ( - chunk, - config, - expressions, - inject, - pipeline, - simulate, - tracing, -) +from activitysim.core import chunk, expressions, simulate, tracing, workflow from activitysim.core.interaction_sample_simulate import _interaction_sample_simulate logger = logging.getLogger(__name__) @@ -195,7 +192,7 @@ def get_pattern_index_and_arrays(tour_indexes, durations, one_way=True): return indexes, patterns, pattern_sizes -def get_spec_for_segment(model_settings, spec_name, segment): +def get_spec_for_segment(state: workflow.State, model_settings, spec_name, segment): """ Read in the model spec :param model_settings: model settings file @@ -204,7 +201,7 @@ def get_spec_for_segment(model_settings, spec_name, segment): :return: array of utility equations """ - omnibus_spec = simulate.read_model_spec(file_name=model_settings[spec_name]) + omnibus_spec = state.filesystem.read_model_spec(file_name=model_settings[spec_name]) spec = omnibus_spec[[segment]] @@ -216,9 +213,13 @@ def get_spec_for_segment(model_settings, spec_name, segment): def run_trip_scheduling_choice( - spec, tours, skims, locals_dict, chunk_size, trace_hh_id, trace_label + state: workflow.State, + spec: pd.DataFrame, + tours: pd.DataFrame, + skims, + locals_dict: Mapping, + trace_label: str, ): - NUM_TOUR_LEGS = 3 trace_label = tracing.extend_trace_label(trace_label, "interaction_sample_simulate") @@ -258,13 +259,14 @@ def run_trip_scheduling_choice( indirect_tours = tours.loc[tours[HAS_OB_STOPS] | tours[HAS_IB_STOPS]] if len(indirect_tours) > 0: - # Iterate through the chunks result_list = [] - for i, choosers, chunk_trace_label in chunk.adaptive_chunked_choosers( - indirect_tours, chunk_size, trace_label - ): - + for ( + i, + choosers, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, indirect_tours, trace_label): # Sort the choosers and get the schedule alternatives choosers = choosers.sort_index() schedules = generate_schedule_alternatives(choosers).sort_index() @@ -275,6 +277,7 @@ def run_trip_scheduling_choice( # Run the simulation choices = _interaction_sample_simulate( + state, choosers=choosers, alternatives=schedules, spec=spec, @@ 
-288,6 +291,7 @@ def run_trip_scheduling_choice( trace_label=chunk_trace_label, trace_choice_name="trip_schedule_stage_1", estimator=None, + chunk_sizer=chunk_sizer, ) assert len(choices.index) == len(choosers.index) @@ -296,7 +300,7 @@ def run_trip_scheduling_choice( result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -319,15 +323,19 @@ def run_trip_scheduling_choice( return tours -@inject.step() -def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): - +@workflow.step +def trip_scheduling_choice( + state: workflow.State, + trips: pd.DataFrame, + tours: pd.DataFrame, + skim_dict, +) -> None: trace_label = "trip_scheduling_choice" - model_settings = config.read_model_settings("trip_scheduling_choice.yaml") - spec = get_spec_for_segment(model_settings, "SPECIFICATION", "stage_one") + model_settings = state.filesystem.read_model_settings("trip_scheduling_choice.yaml") + spec = get_spec_for_segment(state, model_settings, "SPECIFICATION", "stage_one") - trips_df = trips.to_frame() - tours_df = tours.to_frame() + trips_df = trips + tours_df = tours outbound_trips = trips_df[trips_df[OUTBOUND_FLAG]] inbound_trips = trips_df[~trips_df[OUTBOUND_FLAG]] @@ -358,26 +366,28 @@ def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): preprocessor_settings = model_settings.get("PREPROCESSOR", None) + # hack: preprocessor adds origin column in place if it does not exist already + od_skim_stack_wrapper = skim_dict.wrap("origin", "destination") + do_skim_stack_wrapper = skim_dict.wrap("destination", "origin") + obib_skim_stack_wrapper = skim_dict.wrap(LAST_OB_STOP, FIRST_IB_STOP) + + skims = [od_skim_stack_wrapper, do_skim_stack_wrapper, obib_skim_stack_wrapper] + + locals_dict = { + "od_skims": od_skim_stack_wrapper, + "do_skims": do_skim_stack_wrapper, + "obib_skims": obib_skim_stack_wrapper, + "orig_col_name": "origin", + "dest_col_name": "destination", + "timeframe": "timeless_directional", + } + if preprocessor_settings: - # hack: preprocessor adds origin column in place if it does not exist already - od_skim_stack_wrapper = skim_dict.wrap("origin", "destination") - do_skim_stack_wrapper = skim_dict.wrap("destination", "origin") - obib_skim_stack_wrapper = skim_dict.wrap(LAST_OB_STOP, FIRST_IB_STOP) - - skims = [od_skim_stack_wrapper, do_skim_stack_wrapper, obib_skim_stack_wrapper] - - locals_dict = { - "od_skims": od_skim_stack_wrapper, - "do_skims": do_skim_stack_wrapper, - "obib_skims": obib_skim_stack_wrapper, - "orig_col_name": "origin", - "dest_col_name": "destination", - "timeframe": "timeless_directional", - } simulate.set_skim_wrapper_targets(tours_df, skims) expressions.assign_columns( + state, df=tours_df, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -385,7 +395,7 @@ def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): ) tours_df = run_trip_scheduling_choice( - spec, tours_df, skims, locals_dict, chunk_size, trace_hh_id, trace_label + state, spec, tours_df, skims, locals_dict, trace_label ) - pipeline.replace_table("tours", tours_df) + state.add_table("tours", tours_df) diff --git a/activitysim/abm/models/util/annotate.py b/activitysim/abm/models/util/annotate.py index e50519b38e..7726aac013 100644 --- a/activitysim/abm/models/util/annotate.py +++ b/activitysim/abm/models/util/annotate.py @@ -1,12 +1,11 @@ -# ActivitySim 
-# See full license in LICENSE.txt. -import pandas as pd +from __future__ import annotations + import logging -from activitysim.core import expressions -from activitysim.core import tracing -from activitysim.core import inject -from activitysim.core import pipeline +from activitysim.core import expressions, tracing, workflow + +# ActivitySim +# See full license in LICENSE.txt. """ Code for annotating tables @@ -15,7 +14,9 @@ logger = logging.getLogger(__name__) -def annotate_tours(model_settings, trace_label, locals_dict={}): +def annotate_tours( + state: workflow.State, model_settings, trace_label, locals_dict=None +): """ Add columns to the tours table in the pipeline according to spec. @@ -24,17 +25,22 @@ def annotate_tours(model_settings, trace_label, locals_dict={}): model_settings : dict trace_label : str """ - tours = inject.get_table("tours").to_frame() + if locals_dict is None: + locals_dict = {} + tours = state.get_dataframe("tours") expressions.assign_columns( + state, df=tours, model_settings=model_settings.get("annotate_tours"), locals_dict=locals_dict, trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), ) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) -def annotate_trips(model_settings, trace_label, locals_dict={}): +def annotate_trips( + state: workflow.State, model_settings, trace_label, locals_dict=None +): """ Add columns to the trips table in the pipeline according to spec. @@ -43,11 +49,14 @@ def annotate_trips(model_settings, trace_label, locals_dict={}): model_settings : dict trace_label : str """ - trips = inject.get_table("trips").to_frame() + if locals_dict is None: + locals_dict = {} + trips = state.get_dataframe("trips") expressions.assign_columns( + state, df=trips, model_settings=model_settings.get("annotate_trips"), locals_dict=locals_dict, trace_label=tracing.extend_trace_label(trace_label, "annotate_trips"), ) - pipeline.replace_table("trips", trips) + state.add_table("trips", trips) diff --git a/activitysim/abm/models/util/canonical_ids.py b/activitysim/abm/models/util/canonical_ids.py index 4e46e26aae..0ca1a1d3bc 100644 --- a/activitysim/abm/models/util/canonical_ids.py +++ b/activitysim/abm/models/util/canonical_ids.py @@ -1,15 +1,14 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import re import numpy as np import pandas as pd -import re -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate +from activitysim.core import simulate, workflow logger = logging.getLogger(__name__) @@ -58,9 +57,9 @@ def enumerate_tour_types(tour_flavors): return channels -def read_alts_file(file_name, set_index=None): +def read_alts_file(state: workflow.State, file_name, set_index=None): try: - alts = simulate.read_model_alts(file_name, set_index=set_index) + alts = simulate.read_model_alts(state, file_name, set_index=set_index) except (RuntimeError, FileNotFoundError): logger.warning(f"Could not find file {file_name} to determine tour flavors.") return pd.DataFrame() @@ -210,7 +209,7 @@ def determine_flavors_from_alts_file( return flavors -def canonical_tours(): +def canonical_tours(state: workflow.State): """ create labels for every the possible tour by combining tour_type/tour_num. 
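# Illustrative sketch (not part of the patch): canonical_tours() below builds the fixed
# set of tour labels (tour_type plus tour_type_num) used to assign stable tour ids and
# random-number channels.  The enumeration idea, conceptually, for a flavor dict that
# maps each tour type to its maximum count (e.g. {"work": 2, "school": 2} as in the
# defaults further down); this mirrors enumerate_tour_types() in spirit, not
# necessarily the library's exact implementation.
def enumerate_tour_labels(tour_flavors: dict) -> list:
    # {"work": 2, "school": 2} -> ["work1", "work2", "school1", "school2"]
    return [
        f"{tour_type}{tour_num}"
        for tour_type, max_count in tour_flavors.items()
        for tour_num in range(1, max_count + 1)
    ]


assert enumerate_tour_labels({"work": 2, "school": 2}) == [
    "work1",
    "work2",
    "school1",
    "school2",
]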
@@ -221,12 +220,14 @@ def canonical_tours(): # ---- non_mandatory_channels nm_model_settings_file_name = "non_mandatory_tour_frequency.yaml" - nm_model_settings = config.read_model_settings(nm_model_settings_file_name) - nm_alts = read_alts_file("non_mandatory_tour_frequency_alternatives.csv") + nm_model_settings = state.filesystem.read_model_settings( + nm_model_settings_file_name + ) + nm_alts = read_alts_file(state, "non_mandatory_tour_frequency_alternatives.csv") # first need to determine max extension try: - ext_probs_f = config.config_file_path( + ext_probs_f = state.filesystem.get_config_file_path( "non_mandatory_tour_frequency_extension_probs.csv" ) extension_probs = pd.read_csv(ext_probs_f, comment="#") @@ -258,9 +259,11 @@ def canonical_tours(): # ---- mandatory_channels mtf_model_settings_file_name = "mandatory_tour_frequency.yaml" - mtf_model_settings = config.read_model_settings(mtf_model_settings_file_name) + mtf_model_settings = state.filesystem.read_model_settings( + mtf_model_settings_file_name + ) mtf_spec = mtf_model_settings.get("SPEC", "mandatory_tour_frequency.csv") - mtf_model_spec = read_alts_file(file_name=mtf_spec) + mtf_model_spec = read_alts_file(state, file_name=mtf_spec) default_mandatory_tour_flavors = {"work": 2, "school": 2} mandatory_tour_flavors = determine_mandatory_tour_flavors( @@ -272,8 +275,10 @@ def canonical_tours(): # ---- atwork_subtour_channels atwork_model_settings_file_name = "atwork_subtour_frequency.yaml" - atwork_model_settings = config.read_model_settings(atwork_model_settings_file_name) - atwork_alts = read_alts_file("atwork_subtour_frequency_alternatives.csv") + atwork_model_settings = state.filesystem.read_model_settings( + atwork_model_settings_file_name + ) + atwork_alts = read_alts_file(state, "atwork_subtour_frequency_alternatives.csv") provided_atwork_flavors = atwork_model_settings.get("ATWORK_SUBTOUR_FLAVORS", None) default_atwork_flavors = {"eat": 1, "business": 2, "maint": 1} @@ -296,8 +301,10 @@ def canonical_tours(): # ---- joint_tour_channels jtf_model_settings_file_name = "joint_tour_frequency.yaml" - jtf_model_settings = config.read_model_settings(jtf_model_settings_file_name) - jtf_alts = read_alts_file("joint_tour_frequency_alternatives.csv") + jtf_model_settings = state.filesystem.read_model_settings( + jtf_model_settings_file_name + ) + jtf_alts = read_alts_file(state, "joint_tour_frequency_alternatives.csv") provided_joint_flavors = jtf_model_settings.get("JOINT_TOUR_FLAVORS", None) default_joint_flavors = { @@ -324,11 +331,14 @@ def canonical_tours(): # ---- school escort channels # only include if model is run - if pipeline.is_table("school_escort_tours") | ( - "school_escorting" in config.setting("models", default=[]) + if state.is_table("school_escort_tours") | ( + state.settings.models is not None + and "school_escorting" in state.settings.models ): se_model_settings_file_name = "school_escorting.yaml" - se_model_settings = config.read_model_settings(se_model_settings_file_name) + se_model_settings = state.filesystem.read_model_settings( + se_model_settings_file_name + ) num_escortees = se_model_settings.get("NUM_ESCORTEES", 3) school_escort_flavors = {"escort": 2 * num_escortees} school_escort_channels = enumerate_tour_types(school_escort_flavors) @@ -343,7 +353,11 @@ def canonical_tours(): def set_tour_index( - tours, parent_tour_num_col=None, is_joint=False, is_school_escorting=False + state: workflow.State, + tours, + parent_tour_num_col=None, + is_joint=False, + is_school_escorting=False, ): """ The new 
index values are stable based on the person_id, tour_type, and tour_num. @@ -363,7 +377,7 @@ def set_tour_index( """ tour_num_col = "tour_type_num" - possible_tours = canonical_tours() + possible_tours = canonical_tours(state) possible_tours_count = len(possible_tours) assert tour_num_col in tours.columns @@ -417,16 +431,16 @@ def set_tour_index( return tours -def determine_max_trips_per_leg(default_max_trips_per_leg=4): +def determine_max_trips_per_leg(state: workflow.State, default_max_trips_per_leg=4): model_settings_file_name = "stop_frequency.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) # first see if flavors given explicitly provided_max_trips_per_leg = model_settings.get("MAX_TRIPS_PER_LEG", None) # determine flavors from alternative file try: - alts = read_alts_file("stop_frequency_alternatives.csv") + alts = read_alts_file(state, "stop_frequency_alternatives.csv") trips_per_leg = [ int(alts[c].max()) for c in alts.columns @@ -452,10 +466,10 @@ def determine_max_trips_per_leg(default_max_trips_per_leg=4): return default_max_trips_per_leg -def set_trip_index(trips, tour_id_column="tour_id"): +def set_trip_index(state: workflow.State, trips, tour_id_column="tour_id"): # max number of trips per leg (inbound or outbound) of tour # = stops + 1 for primary half-tour destination - max_trips_per_leg = determine_max_trips_per_leg() + max_trips_per_leg = determine_max_trips_per_leg(state) # canonical_trip_num: 1st trip out = 1, 2nd trip out = 2, 1st in = 5, etc. canonical_trip_num = (~trips.outbound * max_trips_per_leg) + trips.trip_num diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index b71d5128a8..33f04bfd41 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -1,13 +1,14 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. import itertools import logging -import os import numpy as np import pandas as pd -from activitysim.core import chunk, config, inject, logit, pipeline, simulate, tracing +from activitysim.core import chunk, logit, simulate, tracing, workflow logger = logging.getLogger(__name__) @@ -29,7 +30,6 @@ def set_hh_index(df): - # index on household_id, not person_id df.set_index(_hh_id_, inplace=True) df.index.name = _hh_index_ @@ -50,7 +50,13 @@ def add_pn(col, pnum): raise RuntimeError("add_pn col not list or str") -def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=None): +def assign_cdap_rank( + state: workflow.State | None, + persons, + person_type_map, + trace_hh_id=None, + trace_label=None, +): """ Assign an integer index, cdap_rank, to each household member. 
(Starting with 1, not 0) @@ -130,7 +136,11 @@ def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=Non # choose up to MAX_HHSIZE, choosing randomly others = persons[[_hh_id_, "cdap_rank"]].copy() - others["random_order"] = pipeline.get_rn_generator().random_for_df(persons) + if state is None: + # typically in estimation, no state is available, just use stable but simple random + others["random_order"] = np.random.default_rng(seed=0).uniform(size=len(others)) + else: + others["random_order"] = state.get_rn_generator().random_for_df(persons) others = ( others.sort_values(by=[_hh_id_, "random_order"], ascending=[True, True]) .groupby(_hh_id_) @@ -156,17 +166,24 @@ def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=Non persons["cdap_rank"] = p["cdap_rank"] # assignment aligns on index values # if DUMP: - # tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label, + # state.tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label, # transpose=False, slicer='NONE') - if trace_hh_id: - tracing.trace_df(persons, "%s.cdap_rank" % trace_label) + if trace_hh_id and state is not None: + state.tracing.trace_df(persons, "%s.cdap_rank" % trace_label) return persons["cdap_rank"] def individual_utilities( - persons, cdap_indiv_spec, locals_d, trace_hh_id=None, trace_label=None + state: workflow.State, + persons, + cdap_indiv_spec, + locals_d, + trace_hh_id=None, + trace_label=None, + *, + chunk_sizer, ): """ Calculate CDAP utilities for all individuals. @@ -188,7 +205,12 @@ def individual_utilities( # calculate single person utilities indiv_utils = simulate.eval_utilities( - cdap_indiv_spec, persons, locals_d, trace_label=trace_label + state, + cdap_indiv_spec, + persons, + locals_d, + trace_label=trace_label, + chunk_sizer=chunk_sizer, ) # add columns from persons to facilitate building household interactions @@ -196,13 +218,13 @@ def individual_utilities( indiv_utils[useful_columns] = persons[useful_columns] # add attributes for joint tour utility - model_settings = config.read_model_settings("cdap.yaml") + model_settings = state.filesystem.read_model_settings("cdap.yaml") additional_useful_columns = model_settings.get("JOINT_TOUR_USEFUL_COLUMNS", None) if additional_useful_columns is not None: indiv_utils[additional_useful_columns] = persons[additional_useful_columns] if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( indiv_utils, "%s.indiv_utils" % trace_label, column_labels=["activity", "person"], @@ -269,11 +291,11 @@ def cached_joint_spec_name(hhsize): return "cdap_joint_spec_%s" % hhsize -def get_cached_spec(hhsize): +def get_cached_spec(state: workflow.State, hhsize): spec_name = cached_spec_name(hhsize) - spec = inject.get_injectable(spec_name, None) + spec = state.get_injectable(spec_name, None) if spec is not None: logger.debug("build_cdap_spec returning cached injectable spec %s", spec_name) return spec @@ -283,19 +305,19 @@ def get_cached_spec(hhsize): # cached spec will be available as an injectable to subsequent chunks # # try data dir - # if os.path.exists(config.output_file_path(spec_name)): - # spec_path = config.output_file_path(spec_name) + # if os.path.exists(state.get_output_file_path(spec_name)): + # spec_path = state.get_output_file_path(spec_name) # logger.info("build_cdap_spec reading cached spec %s from %s", spec_name, spec_path) # return pd.read_csv(spec_path, index_col='Expression') return None -def get_cached_joint_spec(hhsize): +def get_cached_joint_spec(state: workflow.State, hhsize): 
spec_name = cached_joint_spec_name(hhsize) - spec = inject.get_injectable(spec_name, None) + spec = state.get_injectable(spec_name, None) if spec is not None: logger.debug( "build_cdap_joint_spec returning cached injectable spec %s", spec_name @@ -305,19 +327,20 @@ def get_cached_joint_spec(hhsize): return None -def cache_spec(hhsize, spec): +def cache_spec(state: workflow.State, hhsize, spec): spec_name = cached_spec_name(hhsize) # cache as injectable - inject.add_injectable(spec_name, spec) + state.add_injectable(spec_name, spec) -def cache_joint_spec(hhsize, spec): +def cache_joint_spec(state: workflow.State, hhsize, spec): spec_name = cached_joint_spec_name(hhsize) # cache as injectable - inject.add_injectable(spec_name, spec) + state.add_injectable(spec_name, spec) def build_cdap_spec( + state: workflow.State, interaction_coefficients, hhsize, trace_spec=False, @@ -372,7 +395,7 @@ def build_cdap_spec( # if DUMP: # # dump the interaction_coefficients table because it has been preprocessed - # tracing.trace_df(interaction_coefficients, + # state.tracing.trace_df(interaction_coefficients, # '%s.hhsize%d_interaction_coefficients' % (trace_label, hhsize), # transpose=False, slicer='NONE') @@ -380,7 +403,7 @@ def build_cdap_spec( hhsize = min(hhsize, MAX_HHSIZE) if cache: - spec = get_cached_spec(hhsize) + spec = get_cached_spec(state, hhsize) if spec is not None: return spec @@ -412,7 +435,6 @@ def build_cdap_spec( # N_p1 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0 for pnum in range(1, hhsize + 1): for activity in ["M", "N", "H"]: - new_row_index = len(spec) spec.loc[new_row_index, expression_name] = add_pn(activity, pnum) @@ -428,10 +450,8 @@ def build_cdap_spec( # for each row in the interaction_coefficients table for row in interaction_coefficients[relevant_rows].itertuples(): - # if it is a wildcard all_people interaction if not row.interaction_ptypes: - # wildcard interactions only apply if the interaction includes all household members # this will be the case if the cardinality of the wildcard equals the hhsize # conveniently, the slug is given the name of the alternative column (e.g. HHHH) @@ -455,7 +475,6 @@ def build_cdap_spec( # possible combination of interacting persons # e.g. for (1, 2), (1,3), (2,3) for a coefficient with cardinality 2 in hhsize 3 for tup in itertools.combinations(list(range(1, hhsize + 1)), row.cardinality): - # determine the name of the chooser column with the ptypes for this interaction if row.cardinality == 1: interaction_column = "ptype_p%d" % tup[0] @@ -497,7 +516,7 @@ def build_cdap_spec( simulate.uniquify_spec_index(spec) if trace_spec: - tracing.trace_df( + state.tracing.trace_df( spec, "%s.hhsize%d_spec" % (trace_label, hhsize), transpose=False, @@ -510,7 +529,7 @@ def build_cdap_spec( spec[c] = spec[c].map(lambda x: d.get(x, x or 0.0)).fillna(0) if trace_spec: - tracing.trace_df( + state.tracing.trace_df( spec, "%s.hhsize%d_spec_patched" % (trace_label, hhsize), transpose=False, @@ -518,7 +537,7 @@ def build_cdap_spec( ) if cache: - cache_spec(hhsize, spec) + cache_spec(state, hhsize, spec) t0 = tracing.print_elapsed_time("build_cdap_spec hh_size %s" % hhsize, t0) @@ -526,7 +545,12 @@ def build_cdap_spec( def build_cdap_joint_spec( - joint_tour_coefficients, hhsize, trace_spec=False, trace_label=None, cache=True + state: workflow.State, + joint_tour_coefficients, + hhsize, + trace_spec=False, + trace_label=None, + cache=True, ): """ Build a spec file for computing joint tour utilities of alternative household member for households of specified size. 
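# Illustrative sketch (not part of the patch): build_cdap_spec and build_cdap_joint_spec
# memoize the specs they build, formerly through inject.add_injectable and now through
# the State object, so later chunks reuse the spec for a given household size.  A hedged
# sketch of that caching pattern, assuming an existing `state`; `builder` is a
# hypothetical stand-in for the expensive spec-building step.
from activitysim.core import workflow


def get_or_build_spec(state: workflow.State, hhsize: int, builder):
    spec_name = f"cdap_spec_hhsize_{hhsize}"
    # return the cached spec if an earlier chunk already built and stored it
    spec = state.get_injectable(spec_name, None)
    if spec is None:
        spec = builder(hhsize)  # expensive: only happens once per household size
        state.add_injectable(spec_name, spec)
    return spec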
@@ -565,7 +589,7 @@ def build_cdap_joint_spec( hhsize = min(hhsize, MAX_HHSIZE) if cache: - spec = get_cached_joint_spec(hhsize) + spec = get_cached_joint_spec(state, hhsize) if spec is not None: return spec @@ -624,7 +648,7 @@ def build_cdap_joint_spec( coefficient = row.coefficient if dependency_name in ["M_px", "N_px", "H_px"]: if "_pxprod" in expression: - prod_conds = row.Expression.split("|") + prod_conds = [j.strip() for j in row.Expression.split("|")] expanded_expressions = [ tup for tup in itertools.product( @@ -703,7 +727,7 @@ def build_cdap_joint_spec( spec[c] = 0 if trace_spec: - tracing.trace_df( + state.tracing.trace_df( spec, "%s.hhsize%d_joint_spec" % (trace_label, hhsize), transpose=False, @@ -711,7 +735,7 @@ def build_cdap_joint_spec( ) if trace_spec: - tracing.trace_df( + state.tracing.trace_df( spec, "%s.hhsize%d_joint_spec_patched" % (trace_label, hhsize), transpose=False, @@ -719,7 +743,7 @@ def build_cdap_joint_spec( ) if cache: - cache_joint_spec(hhsize, spec) + cache_joint_spec(state, hhsize, spec) t0 = tracing.print_elapsed_time("build_cdap_joint_spec hh_size %s" % hhsize, t0) @@ -784,7 +808,7 @@ def add_interaction_column(choosers, p_tup): ) -def hh_choosers(indiv_utils, hhsize): +def hh_choosers(state: workflow.State, indiv_utils, hhsize): """ Build a chooser table for calculating house utilities for all households of specified hhsize @@ -823,7 +847,7 @@ def hh_choosers(indiv_utils, hhsize): merge_cols = [_hh_id_, _ptype_, "M", "N", "H"] # add attributes for joint tour utility - model_settings = config.read_model_settings("cdap.yaml") + model_settings = state.filesystem.read_model_settings("cdap.yaml") additional_merge_cols = model_settings.get("JOINT_TOUR_USEFUL_COLUMNS", None) if additional_merge_cols is not None: merge_cols.extend(additional_merge_cols) @@ -846,7 +870,6 @@ def hh_choosers(indiv_utils, hhsize): # for each of the higher cdap_ranks for pnum in range(2, hhsize + 1): - # df with merge columns for indiv with cdap_rank of pnum rhs = indiv_utils.loc[ include_households & (indiv_utils["cdap_rank"] == pnum), merge_cols @@ -877,12 +900,15 @@ def hh_choosers(indiv_utils, hhsize): def household_activity_choices( + state: workflow.State, indiv_utils, interaction_coefficients, hhsize, trace_hh_id=None, trace_label=None, add_joint_tour_utility=False, + *, + chunk_sizer, ): """ Calculate household utilities for each activity pattern alternative for households of hhsize @@ -919,10 +945,10 @@ def household_activity_choices( # index on household_id, not person_id set_hh_index(utils) else: - - choosers = hh_choosers(indiv_utils, hhsize=hhsize) + choosers = hh_choosers(state, indiv_utils, hhsize=hhsize) spec = build_cdap_spec( + state, interaction_coefficients, hhsize, trace_spec=(trace_hh_id in choosers.index), @@ -930,15 +956,19 @@ def household_activity_choices( joint_tour_alt=add_joint_tour_utility, ) - utils = simulate.eval_utilities(spec, choosers, trace_label=trace_label) + utils = simulate.eval_utilities( + state, spec, choosers, trace_label=trace_label, chunk_sizer=chunk_sizer + ) if len(utils.index) == 0: return pd.Series(dtype="float64") + probs = logit.utils_to_probs(state, utils, trace_label=trace_label) # calculate joint tour utility if add_joint_tour_utility & (hhsize > 1): # calculate joint utils joint_tour_spec = build_cdap_joint_spec( + state, interaction_coefficients, hhsize, trace_spec=(trace_hh_id in choosers.index), @@ -946,46 +976,49 @@ def household_activity_choices( ) joint_tour_utils = simulate.eval_utilities( - joint_tour_spec, choosers, 
trace_label=trace_label + state, + joint_tour_spec, + choosers, + trace_label=trace_label, + chunk_sizer=chunk_sizer, ) # add joint util to util utils = utils.add(joint_tour_utils) - probs = logit.utils_to_probs(utils, trace_label=trace_label) + probs = logit.utils_to_probs(state, utils, trace_label=trace_label) # select an activity pattern alternative for each household based on probability # result is a series indexed on _hh_index_ with the (0 based) index of the column from probs - idx_choices, rands = logit.make_choices(probs, trace_label=trace_label) + idx_choices, rands = logit.make_choices(state, probs, trace_label=trace_label) # convert choice expressed as index into alternative name from util column label choices = pd.Series(utils.columns[idx_choices].values, index=utils.index) if trace_hh_id: - if hhsize > 1: - tracing.trace_df( + state.tracing.trace_df( choosers, "%s.hhsize%d_choosers" % (trace_label, hhsize), column_labels=["expression", "person"], ) - tracing.trace_df( + state.tracing.trace_df( utils, "%s.hhsize%d_utils" % (trace_label, hhsize), column_labels=["expression", "household"], ) - tracing.trace_df( + state.tracing.trace_df( probs, "%s.hhsize%d_probs" % (trace_label, hhsize), column_labels=["expression", "household"], ) - tracing.trace_df( + state.tracing.trace_df( choices, "%s.hhsize%d_activity_choices" % (trace_label, hhsize), column_labels=["expression", "household"], ) - tracing.trace_df( + state.tracing.trace_df( rands, "%s.hhsize%d_rands" % (trace_label, hhsize), columns=[None, "rand"] ) @@ -1034,7 +1067,7 @@ def unpack_cdap_indiv_activity_choices(persons, hh_choices, trace_hh_id, trace_l cdap_indiv_activity_choices = indiv_activity["cdap_activity"] # if DUMP: - # tracing.trace_df(cdap_indiv_activity_choices, + # state.tracing.trace_df(cdap_indiv_activity_choices, # '%s.DUMP.cdap_indiv_activity_choices' % trace_label, # transpose=False, slicer='NONE') @@ -1042,7 +1075,12 @@ def unpack_cdap_indiv_activity_choices(persons, hh_choices, trace_hh_id, trace_l def extra_hh_member_choices( - persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label + state: workflow.State, + persons, + cdap_fixed_relative_proportions: pd.DataFrame, + locals_d, + trace_hh_id, + trace_label, ): """ Generate the activity choices for the 'extra' household members who weren't handled by cdap @@ -1083,7 +1121,7 @@ def extra_hh_member_choices( # eval the expression file values = simulate.eval_variables( - cdap_fixed_relative_proportions.index, choosers, locals_d + state, cdap_fixed_relative_proportions.index, choosers, locals_d ) # cdap_fixed_relative_proportions computes relative proportions by ptype, not utilities @@ -1095,37 +1133,37 @@ def extra_hh_member_choices( # select an activity pattern alternative for each person based on probability # idx_choices is a series (indexed on _persons_index_ ) with the chosen alternative represented # as the integer (0 based) index of the chosen column from probs - idx_choices, rands = logit.make_choices(probs, trace_label=trace_label) + idx_choices, rands = logit.make_choices(state, probs, trace_label=trace_label) # convert choice from column index to activity name choices = pd.Series(probs.columns[idx_choices].values, index=probs.index) # if DUMP: - # tracing.trace_df(proportions, '%s.DUMP.extra_proportions' % trace_label, + # state.tracing.trace_df(proportions, '%s.DUMP.extra_proportions' % trace_label, # transpose=False, slicer='NONE') - # tracing.trace_df(probs, '%s.DUMP.extra_probs' % trace_label, + # 
state.tracing.trace_df(probs, '%s.DUMP.extra_probs' % trace_label, # transpose=False, slicer='NONE') - # tracing.trace_df(choices, '%s.DUMP.extra_choices' % trace_label, + # state.tracing.trace_df(choices, '%s.DUMP.extra_choices' % trace_label, # transpose=False, # slicer='NONE') if trace_hh_id: - tracing.trace_df( + state.tracing.trace_df( proportions, "%s.extra_hh_member_choices_proportions" % trace_label, column_labels=["expression", "person"], ) - tracing.trace_df( + state.tracing.trace_df( probs, "%s.extra_hh_member_choices_probs" % trace_label, column_labels=["expression", "person"], ) - tracing.trace_df( + state.tracing.trace_df( choices, "%s.extra_hh_member_choices_choices" % trace_label, column_labels=["expression", "person"], ) - tracing.trace_df( + state.tracing.trace_df( rands, "%s.extra_hh_member_choices_rands" % trace_label, columns=[None, "rand"], @@ -1135,6 +1173,7 @@ def extra_hh_member_choices( def _run_cdap( + state: workflow.State, persons, person_type_map, cdap_indiv_spec, @@ -1144,7 +1183,9 @@ def _run_cdap( trace_hh_id, trace_label, add_joint_tour_utility, -): + *, + chunk_sizer, +) -> pd.DataFrame | tuple: """ Implements core run_cdap functionality on persons df (or chunked subset thereof) Aside from chunking of persons df, params are passed through from run_cdap unchanged @@ -1160,43 +1201,46 @@ def _run_cdap( # assign integer cdap_rank to each household member # persons with cdap_rank 1..MAX_HHSIZE will be have their activities chose by CDAP model # extra household members, will have activities assigned by in fixed proportions - assign_cdap_rank(persons, person_type_map, trace_hh_id, trace_label) - chunk.log_df(trace_label, "persons", persons) + assign_cdap_rank(state, persons, person_type_map, trace_hh_id, trace_label) + chunk_sizer.log_df(trace_label, "persons", persons) # Calculate CDAP utilities for each individual, ignoring interactions # ind_utils has index of 'person_id' and a column for each alternative # i.e. 
three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home) indiv_utils = individual_utilities( + state, persons[persons.cdap_rank <= MAX_HHSIZE], cdap_indiv_spec, locals_d, trace_hh_id, trace_label, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "indiv_utils", indiv_utils) + chunk_sizer.log_df(trace_label, "indiv_utils", indiv_utils) # compute interaction utilities, probabilities, and hh activity pattern choices # for each size household separately in turn up to MAX_HHSIZE hh_choices_list = [] for hhsize in range(1, MAX_HHSIZE + 1): - choices = household_activity_choices( + state, indiv_utils, interaction_coefficients, hhsize=hhsize, trace_hh_id=trace_hh_id, trace_label=trace_label, add_joint_tour_utility=add_joint_tour_utility, + chunk_sizer=chunk_sizer, ) hh_choices_list.append(choices) del indiv_utils - chunk.log_df(trace_label, "indiv_utils", None) + chunk_sizer.log_df(trace_label, "indiv_utils", None) # concat all the household choices into a single series indexed on _hh_index_ hh_activity_choices = pd.concat(hh_choices_list) - chunk.log_df(trace_label, "hh_activity_choices", hh_activity_choices) + chunk_sizer.log_df(trace_label, "hh_activity_choices", hh_activity_choices) # unpack the household activity choice list into choices for each (non-extra) household member # resulting series contains one activity per individual hh member, indexed on _persons_index_ @@ -1207,7 +1251,12 @@ def _run_cdap( # assign activities to extra household members (with cdap_rank > MAX_HHSIZE) # resulting series contains one activity per individual hh member, indexed on _persons_index_ extra_person_choices = extra_hh_member_choices( - persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label + state, + persons, + cdap_fixed_relative_proportions, + locals_d, + trace_hh_id, + trace_label, ) # concat cdap and extra persoin choices into a single series @@ -1216,7 +1265,7 @@ def _run_cdap( person_choices = pd.concat([cdap_person_choices, extra_person_choices]) persons["cdap_activity"] = person_choices - chunk.log_df(trace_label, "persons", persons) + chunk_sizer.log_df(trace_label, "persons", persons) # return household joint tour flag if add_joint_tour_utility: @@ -1226,15 +1275,15 @@ def _run_cdap( ) # if DUMP: - # tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, + # state.tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, # transpose=False, slicer='NONE') - # tracing.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label, + # state.tracing.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label, # transpose=False, slicer='NONE') result = persons[["cdap_rank", "cdap_activity"]] del persons - chunk.log_df(trace_label, "persons", None) + chunk_sizer.log_df(trace_label, "persons", None) if add_joint_tour_utility: return result, hh_activity_choices["has_joint_tour"] @@ -1243,6 +1292,7 @@ def _run_cdap( def run_cdap( + state: workflow.State, persons, person_type_map, cdap_indiv_spec, @@ -1294,16 +1344,18 @@ def run_cdap( trace_label = tracing.extend_trace_label(trace_label, "cdap") + cdap_results = hh_choice_results = None result_list = [] # segment by person type and pick the right spec for each person type for ( i, persons_chunk, chunk_trace_label, - ) in chunk.adaptive_chunked_choosers_by_chunk_id(persons, chunk_size, trace_label): - + chunk_sizer, + ) in chunk.adaptive_chunked_choosers_by_chunk_id(state, persons, trace_label): if add_joint_tour_utility: cdap_results, hh_choice_results = _run_cdap( + state, 
persons_chunk, person_type_map, cdap_indiv_spec, @@ -1313,9 +1365,11 @@ def run_cdap( trace_hh_id, chunk_trace_label, add_joint_tour_utility, + chunk_sizer=chunk_sizer, ) else: cdap_results = _run_cdap( + state, persons_chunk, person_type_map, cdap_indiv_spec, @@ -1325,11 +1379,12 @@ def run_cdap( trace_hh_id, chunk_trace_label, add_joint_tour_utility, + chunk_sizer=chunk_sizer, ) result_list.append(cdap_results) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -1338,8 +1393,7 @@ def run_cdap( cdap_results = pd.concat(result_list) if trace_hh_id: - - tracing.trace_df( + state.tracing.trace_df( cdap_results, label="cdap", columns=["cdap_rank", "cdap_activity"], diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py index f42c15c161..c48586a86b 100644 --- a/activitysim/abm/models/util/logsums.py +++ b/activitysim/abm/models/util/logsums.py @@ -1,9 +1,10 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging from activitysim.core import config, expressions, los, simulate, tracing -from activitysim.core.pathbuilder import TransitVirtualPathBuilder logger = logging.getLogger(__name__) @@ -32,6 +33,7 @@ def filter_chooser_columns(choosers, logsum_settings, model_settings): def compute_logsums( + state, choosers, tour_purpose, logsum_settings, @@ -127,10 +129,14 @@ def compute_logsums( else: logger.error("Choosers table already has column 'duration'.") - logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) - coefficients = simulate.get_segment_coefficients(logsum_settings, tour_purpose) + logsum_spec = state.filesystem.read_model_spec(file_name=logsum_settings["SPEC"]) + coefficients = state.filesystem.get_segment_coefficients( + logsum_settings, tour_purpose + ) - logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) + logsum_spec = simulate.eval_coefficients( + state, logsum_spec, coefficients, estimator=None + ) nest_spec = config.get_logit_model_settings(logsum_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) @@ -211,6 +217,7 @@ def compute_logsums( simulate.set_skim_wrapper_targets(choosers, skims) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -218,6 +225,7 @@ def compute_logsums( ) logsums = simulate.simple_simulate_logsums( + state, choosers, logsum_spec, nest_spec, diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index f28713b2c1..8a75ae8b6f 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -1,10 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging +import warnings +from typing import Optional import pandas as pd -from activitysim.core import config, expressions, simulate, tracing +from activitysim.core import config, expressions, simulate, workflow +from activitysim.core.estimation import Estimator """ At this time, these utilities are mostly for transforming the mode choice @@ -16,18 +21,18 @@ def mode_choice_simulate( - choosers, - spec, + state: workflow.State, + choosers: pd.DataFrame, + spec: pd.DataFrame, nest_spec, skims, locals_d, - chunk_size, mode_column_name, logsum_column_name, - trace_label, + trace_label: str, trace_choice_name, trace_column_names=None, - estimator=None, + estimator: Optional[Estimator] = None, ): """ common method for both tour_mode_choice and trip_mode_choice @@ -53,12 +58,12 @@ def mode_choice_simulate( want_logsums = logsum_column_name is not None choices = simulate.simple_simulate( + state, choosers=choosers, spec=spec, nest_spec=nest_spec, skims=skims, locals_d=locals_d, - chunk_size=chunk_size, want_logsums=want_logsums, trace_label=trace_label, trace_choice_name=trace_choice_name, @@ -83,6 +88,7 @@ def mode_choice_simulate( def run_tour_mode_choice_simulate( + state: workflow.State, choosers, tour_purpose, model_settings, @@ -92,7 +98,6 @@ def run_tour_mode_choice_simulate( skims, constants, estimator, - chunk_size, trace_label=None, trace_choice_name=None, ): @@ -103,10 +108,12 @@ def run_tour_mode_choice_simulate( you want to use in the evaluation of variables. """ - spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients = simulate.get_segment_coefficients(model_settings, tour_purpose) + spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients = state.filesystem.get_segment_coefficients( + model_settings, tour_purpose + ) - spec = simulate.eval_coefficients(spec, coefficients, estimator) + spec = simulate.eval_coefficients(state, spec, coefficients, estimator) nest_spec = config.get_logit_model_settings(model_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) @@ -128,11 +135,14 @@ def run_tour_mode_choice_simulate( choosers["out_period"] = network_los.skim_time_period_label(choosers[out_time]) expressions.annotate_preprocessors( - choosers, locals_dict, skims, model_settings, trace_label + state, choosers, locals_dict, skims, model_settings, trace_label ) trace_column_names = choosers.index.name - assert trace_column_names == "tour_id" + if trace_column_names != "tour_id": + # TODO suppress this warning? It should not be relevant in regular + # activitysim models, but could be annoying in extensions. + warnings.warn(f"trace_column_names is {trace_column_names!r} not 'tour_id'") if trace_column_names not in choosers: choosers[trace_column_names] = choosers.index @@ -141,12 +151,12 @@ def run_tour_mode_choice_simulate( estimator.write_choosers(choosers) choices = mode_choice_simulate( + state, choosers=choosers, spec=spec, nest_spec=nest_spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=trace_label, diff --git a/activitysim/abm/models/util/overlap.py b/activitysim/abm/models/util/overlap.py index 70fadfbd43..914a25dbd1 100644 --- a/activitysim/abm/models/util/overlap.py +++ b/activitysim/abm/models/util/overlap.py @@ -1,11 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import inject, tracing +from activitysim.core import workflow logger = logging.getLogger(__name__) @@ -90,7 +92,7 @@ def rle(a): return row_id, start_pos, run_length, run_val -def p2p_time_window_overlap(p1_ids, p2_ids): +def p2p_time_window_overlap(state: workflow.State, p1_ids, p2_ids): """ Parameters @@ -103,7 +105,7 @@ def p2p_time_window_overlap(p1_ids, p2_ids): """ - timetable = inject.get_injectable("timetable") + timetable = state.get_injectable("timetable") assert len(p1_ids) == len(p2_ids) # if series, ought to have same index @@ -163,11 +165,11 @@ def person_pairs(persons): return p2p -def hh_time_window_overlap(households, persons): +def hh_time_window_overlap(state: workflow.State, households, persons): p2p = person_pairs(persons) - p2p["max_overlap"] = p2p_time_window_overlap(p2p.person1, p2p.person2) + p2p["max_overlap"] = p2p_time_window_overlap(state, p2p.person1, p2p.person2) hh_overlap = ( p2p.groupby(["household_id", "p2p_type"]) @@ -186,11 +188,11 @@ def hh_time_window_overlap(households, persons): return hh_overlap -def person_time_window_overlap(persons): +def person_time_window_overlap(state: workflow.State, persons): p2p = person_pairs(persons) - p2p["max_overlap"] = p2p_time_window_overlap(p2p.person1, p2p.person2) + p2p["max_overlap"] = p2p_time_window_overlap(state, p2p.person1, p2p.person2) p_overlap = ( pd.concat( @@ -221,9 +223,9 @@ def person_time_window_overlap(persons): return p_overlap -def person_max_window(persons): +def person_max_window(state: workflow.State, persons): - timetable = inject.get_injectable("timetable") + timetable = state.get_injectable("timetable") # ndarray with one row per person and one column per time period # array value of 1 where free periods and 0 elsewhere diff --git a/activitysim/abm/models/util/probabilistic_scheduling.py b/activitysim/abm/models/util/probabilistic_scheduling.py index 6fc7689a64..1b6b063baf 100644 --- a/activitysim/abm/models/util/probabilistic_scheduling.py +++ b/activitysim/abm/models/util/probabilistic_scheduling.py @@ -1,11 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import chunk, config, inject, logit, pipeline, simulate, tracing +from activitysim.core import chunk, logit, tracing, workflow logger = logging.getLogger(__name__) @@ -59,7 +61,9 @@ def _clip_probs(choosers_df, probs, depart_alt_base): return probs -def _report_bad_choices(bad_row_map, df, filename, trace_label, trace_choosers=None): +def _report_bad_choices( + state: workflow.State, bad_row_map, df, filename, trace_label, trace_choosers=None +): """ Parameters @@ -84,7 +88,7 @@ def _report_bad_choices(bad_row_map, df, filename, trace_label, trace_choosers=N filename = "%s.%s" % (trace_label, filename) logger.info("dumping %s" % filename) - tracing.write_csv(df, file_name=filename, transpose=False) + state.tracing.write_csv(df, file_name=filename, transpose=False) # log the indexes of the first MAX_PRINT offending rows MAX_PRINT = 0 @@ -240,6 +244,7 @@ def _postprocess_scheduling_choices( def make_scheduling_choices( + state: workflow.State, choosers_df, scheduling_mode, probs_spec, @@ -247,10 +252,11 @@ def make_scheduling_choices( depart_alt_base, first_trip_in_leg, report_failed_trips, - trace_hh_id, trace_label, trace_choice_col_name="depart", clip_earliest_latest=True, + *, + chunk_sizer: chunk.ChunkSizer, ): """ We join each trip with the appropriate row in probs_spec by joining on probs_join_cols, @@ -271,7 +277,6 @@ def make_scheduling_choices( int to add to probs column index to get time period it represents. e.g. depart_alt_base = 5 means first column (column 0) represents 5 am report_failed_trips : bool - trace_hh_id trace_label Returns @@ -279,14 +284,14 @@ def make_scheduling_choices( choices: pd.Series time periods depart choices, one per trip (except for trips with zero probs) """ - + trace_hh_id = state.settings.trace_hh_id choosers = pd.merge( choosers_df.reset_index(), probs_spec, on=probs_join_cols, how="left" ).set_index(choosers_df.index.name) - chunk.log_df(trace_label, "choosers", choosers) + chunk_sizer.log_df(trace_label, "choosers", choosers) - if trace_hh_id and tracing.has_trace_targets(choosers_df): - tracing.trace_df(choosers, "%s.choosers" % trace_label) + if trace_hh_id and state.tracing.has_trace_targets(choosers_df): + state.tracing.trace_df(choosers, "%s.choosers" % trace_label) # different pre-processing is required based on the scheduling mode chooser_probs = _preprocess_scheduling_probs( @@ -300,25 +305,25 @@ def make_scheduling_choices( first_trip_in_leg, ) - chunk.log_df(trace_label, "chooser_probs", chooser_probs) + chunk_sizer.log_df(trace_label, "chooser_probs", chooser_probs) - if trace_hh_id and tracing.has_trace_targets(choosers_df): - tracing.trace_df(chooser_probs, "%s.chooser_probs" % trace_label) + if trace_hh_id and state.tracing.has_trace_targets(choosers_df): + state.tracing.trace_df(chooser_probs, "%s.chooser_probs" % trace_label) raw_choices, rands = logit.make_choices( - chooser_probs, trace_label=trace_label, trace_choosers=choosers + state, chooser_probs, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "choices", raw_choices) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "choices", raw_choices) + chunk_sizer.log_df(trace_label, "rands", rands) - if trace_hh_id and tracing.has_trace_targets(choosers_df): - tracing.trace_df( + if trace_hh_id and state.tracing.has_trace_targets(choosers_df): + state.tracing.trace_df( raw_choices, "%s.choices" % trace_label, 
columns=[None, trace_choice_col_name], ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + state.tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) # different post-processing is required based on the scheduling mode choices, failed = _postprocess_scheduling_choices( @@ -329,11 +334,12 @@ def make_scheduling_choices( choosers_df, ) - chunk.log_df(trace_label, "failed", failed) + chunk_sizer.log_df(trace_label, "failed", failed) # report failed trips while we have the best diagnostic info if report_failed_trips and failed.any(): _report_bad_choices( + state, bad_row_map=failed, df=choosers, filename="failed_choosers", @@ -342,11 +348,11 @@ def make_scheduling_choices( ) # trace before removing failures - if trace_hh_id and tracing.has_trace_targets(choosers_df): - tracing.trace_df( + if trace_hh_id and state.tracing.has_trace_targets(choosers_df): + state.tracing.trace_df( choices, "%s.choices" % trace_label, columns=[None, trace_choice_col_name] ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + state.tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) # remove any failed choices if failed.any(): diff --git a/activitysim/abm/models/util/school_escort_tours_trips.py b/activitysim/abm/models/util/school_escort_tours_trips.py index 778fb86454..64101c45e2 100644 --- a/activitysim/abm/models/util/school_escort_tours_trips.py +++ b/activitysim/abm/models/util/school_escort_tours_trips.py @@ -1,15 +1,15 @@ +from __future__ import annotations + import logging -import pandas as pd + import numpy as np -import warnings +import pandas as pd +from activitysim.abm.models.school_escorting import NUM_ESCORTEES from activitysim.abm.models.util import canonical_ids -from activitysim.core import pipeline -from activitysim.core import inject +from activitysim.core import workflow from activitysim.core.util import reindex -from ..school_escorting import NUM_ESCORTEES - logger = logging.getLogger(__name__) @@ -77,7 +77,6 @@ def create_chauf_trip_table(row): def create_chauf_escort_trips(bundles): - chauf_trip_bundles = bundles.apply(lambda row: create_chauf_trip_table(row), axis=1) chauf_trip_bundles["tour_id"] = bundles["chauf_tour_id"].astype(int) @@ -215,7 +214,6 @@ def create_child_escorting_stops(row, escortee_num): def create_escortee_trips(bundles): - escortee_trips = [] for escortee_num in range(0, int(bundles.num_escortees.max()) + 1): escortee_bundles = bundles.apply( @@ -338,7 +336,7 @@ def add_school_escorting_type_to_tours_table(escort_bundles, tours): return tours -def process_tours_after_escorting_model(escort_bundles, tours): +def process_tours_after_escorting_model(state: workflow.State, escort_bundles, tours): # adding indicators to tours that include school escorting tours = add_school_escorting_type_to_tours_table(escort_bundles, tours) @@ -376,7 +374,7 @@ def process_tours_after_escorting_model(escort_bundles, tours): tours.loc[bad_end_times, "end"] = tours.loc[bad_end_times, "start"] # updating tdd to match start and end times - tdd_alts = inject.get_injectable("tdd_alts") + tdd_alts = state.get_injectable("tdd_alts") tdd_alts["tdd"] = tdd_alts.index tours.drop(columns="tdd", inplace=True) @@ -394,10 +392,10 @@ def process_tours_after_escorting_model(escort_bundles, tours): return tours -def merge_school_escort_trips_into_pipeline(): - school_escort_trips = pipeline.get_table("school_escort_trips") - tours = pipeline.get_table("tours") - trips = pipeline.get_table("trips") +def 
merge_school_escort_trips_into_pipeline(state: workflow.State): + school_escort_trips = state.get_dataframe("school_escort_trips") + tours = state.get_dataframe("tours") + trips = state.get_dataframe("trips") # want to remove stops if school escorting takes place on that half tour so we can replace them with the actual stops out_se_tours = tours[ @@ -469,7 +467,7 @@ def merge_school_escort_trips_into_pipeline(): trips["destination"] = trips["destination"].astype(int) # updating trip_id now that we have all trips - trips = canonical_ids.set_trip_index(trips) + trips = canonical_ids.set_trip_index(state, trips) school_escort_trip_id_map = { v: k for k, v in trips.loc[ @@ -492,10 +490,10 @@ def merge_school_escort_trips_into_pipeline(): trips.drop(columns="school_escort_trip_id", inplace=True) # replace trip table and pipeline and register with the random number generator - pipeline.replace_table("trips", trips) - pipeline.get_rn_generator().drop_channel("trips") - pipeline.get_rn_generator().add_channel("trips", trips) - pipeline.replace_table("school_escort_trips", school_escort_trips) + state.add_table("trips", trips) + state.get_rn_generator().drop_channel("trips") + state.get_rn_generator().add_channel("trips", trips) + state.add_table("school_escort_trips", school_escort_trips) # updating stop frequency in tours tabel to be consistent num_outbound_stops = ( @@ -510,13 +508,13 @@ def merge_school_escort_trips_into_pipeline(): tours.loc[stop_freq.index, "stop_frequency"] = stop_freq # no need to reset random number generator since no tours added - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) return trips -def recompute_tour_count_statistics(): - tours = pipeline.get_table("tours") +def recompute_tour_count_statistics(state: workflow.State): + tours = state.get_dataframe("tours") grouped = tours.groupby(["person_id", "tour_type"]) tours["tour_type_num"] = grouped.cumcount() + 1 @@ -528,10 +526,10 @@ def recompute_tour_count_statistics(): tours["tour_num"] = grouped.cumcount() + 1 tours["tour_count"] = tours["tour_num"] + grouped.cumcount(ascending=False) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) -def create_pure_school_escort_tours(bundles): +def create_pure_school_escort_tours(state: workflow.State, bundles): # creating home to school tour for chauffers making pure escort tours # ride share tours are already created since they go off the mandatory tour pe_tours = bundles[bundles["escort_type"] == "pure_escort"] @@ -582,7 +580,7 @@ def create_pure_school_escort_tours(bundles): pe_tours["tour_num"] = grouped.cumcount() + 1 pe_tours["tour_count"] = pe_tours["tour_num"] + grouped.cumcount(ascending=False) - pe_tours = canonical_ids.set_tour_index(pe_tours, is_school_escorting=True) + pe_tours = canonical_ids.set_tour_index(state, pe_tours, is_school_escorting=True) return pe_tours @@ -597,11 +595,11 @@ def split_out_school_escorting_trips(trips, school_escort_trips): return trips, se_trips, full_trips_index -def force_escortee_tour_modes_to_match_chauffeur(tours): +def force_escortee_tour_modes_to_match_chauffeur(state: workflow.State, tours): # FIXME: escortee tour can have different chauffeur in outbound vs inbound direction # which tour mode should it be set to? Currently it's whatever comes last. # Does it even matter if trip modes are getting matched later? 
- escort_bundles = inject.get_table("escort_bundles").to_frame() + escort_bundles = state.get_dataframe("escort_bundles") # grabbing the school tour ids for each school escort bundle se_tours = escort_bundles[["school_tour_ids", "chauf_tour_id"]].copy() @@ -628,8 +626,8 @@ def force_escortee_tour_modes_to_match_chauffeur(tours): return tours -def force_escortee_trip_modes_to_match_chauffeur(trips): - school_escort_trips = inject.get_table("school_escort_trips").to_frame() +def force_escortee_trip_modes_to_match_chauffeur(state: workflow.State, trips): + school_escort_trips = state.get_dataframe("school_escort_trips") # starting with only trips that are created as part of the school escorting model se_trips = trips[trips.index.isin(school_escort_trips.index)].copy() diff --git a/activitysim/abm/models/util/test/test_cdap.py b/activitysim/abm/models/util/test/test_cdap.py index 0e4bd68392..20dc6b2410 100644 --- a/activitysim/abm/models/util/test/test_cdap.py +++ b/activitysim/abm/models/util/test/test_cdap.py @@ -1,6 +1,8 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import os.path import pandas as pd @@ -8,9 +10,8 @@ import pytest import yaml -from activitysim.core import chunk, config, inject, simulate - -from .. import cdap +from activitysim.abm.models.util import cdap +from activitysim.core import chunk, simulate, workflow @pytest.fixture(scope="module") @@ -23,11 +24,6 @@ def people(data_dir): return pd.read_csv(os.path.join(data_dir, "people.csv"), index_col="id") -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() - - @pytest.fixture(scope="module") def model_settings(configs_dir): yml_file = os.path.join(configs_dir, "cdap.yaml") @@ -41,17 +37,11 @@ def configs_dir(): return os.path.join(os.path.dirname(__file__), "configs") -def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) - - def test_bad_coefficients(): - + state = workflow.State.make_default(__file__) coefficients = pd.read_csv( - config.config_file_path("cdap_interaction_coefficients.csv"), comment="#" + state.filesystem.get_config_file_path("cdap_interaction_coefficients.csv"), + comment="#", ) coefficients = cdap.preprocess_interaction_coefficients(coefficients) @@ -63,11 +53,11 @@ def test_bad_coefficients(): def test_assign_cdap_rank(people, model_settings): - + state = workflow.State.make_default(__file__) person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log("test_assign_cdap_rank", base=True): - cdap.assign_cdap_rank(people, person_type_map) + with chunk.chunk_log(state, "test_assign_cdap_rank", base=True): + cdap.assign_cdap_rank(state, people, person_type_map) expected = pd.Series( [1, 1, 1, 2, 2, 1, 3, 1, 2, 1, 3, 2, 1, 3, 2, 4, 1, 3, 4, 2], index=people.index @@ -79,17 +69,21 @@ def test_assign_cdap_rank(people, model_settings): def test_individual_utilities(people, model_settings): - - cdap_indiv_and_hhsize1 = simulate.read_model_spec( + state = workflow.State.make_default(__file__) + cdap_indiv_and_hhsize1 = state.filesystem.read_model_spec( file_name="cdap_indiv_and_hhsize1.csv" ) person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log("test_individual_utilities", base=True): - cdap.assign_cdap_rank(people, person_type_map) + with chunk.chunk_log(state, 
"test_individual_utilities", base=True) as chunk_sizer: + cdap.assign_cdap_rank(state, people, person_type_map) individual_utils = cdap.individual_utilities( - people, cdap_indiv_and_hhsize1, locals_d=None + state, + people, + cdap_indiv_and_hhsize1, + locals_d=None, + chunk_sizer=chunk_sizer, ) individual_utils = individual_utils[["M", "N", "H"]] @@ -127,14 +121,15 @@ def test_individual_utilities(people, model_settings): def test_build_cdap_spec_hhsize2(people, model_settings): - + state = workflow.State.make_default(__file__) hhsize = 2 - cdap_indiv_and_hhsize1 = simulate.read_model_spec( + cdap_indiv_and_hhsize1 = state.filesystem.read_model_spec( file_name="cdap_indiv_and_hhsize1.csv" ) interaction_coefficients = pd.read_csv( - config.config_file_path("cdap_interaction_coefficients.csv"), comment="#" + state.filesystem.get_config_file_path("cdap_interaction_coefficients.csv"), + comment="#", ) interaction_coefficients = cdap.preprocess_interaction_coefficients( interaction_coefficients @@ -142,23 +137,29 @@ def test_build_cdap_spec_hhsize2(people, model_settings): person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log("test_build_cdap_spec_hhsize2", base=True): - cdap.assign_cdap_rank(people, person_type_map) + with chunk.chunk_log( + state, "test_build_cdap_spec_hhsize2", base=True + ) as chunk_sizer: + cdap.assign_cdap_rank(state, people, person_type_map) indiv_utils = cdap.individual_utilities( - people, cdap_indiv_and_hhsize1, locals_d=None + state, + people, + cdap_indiv_and_hhsize1, + locals_d=None, + chunk_sizer=chunk_sizer, ) - choosers = cdap.hh_choosers(indiv_utils, hhsize=hhsize) + choosers = cdap.hh_choosers(state, indiv_utils, hhsize=hhsize) spec = cdap.build_cdap_spec( - interaction_coefficients, hhsize=hhsize, cache=False + state, interaction_coefficients, hhsize=hhsize, cache=False ) # pandas.dot depends on column names of expression_values matching spec index values # expressions should have been uniquified when spec was read assert spec.index.is_unique - vars = simulate.eval_variables(spec.index, choosers) + vars = simulate.eval_variables(state, spec.index, choosers) assert (spec.index.values == vars.columns.values).all() # spec = spec.astype(np.float64) diff --git a/activitysim/abm/models/util/test/test_flexible_tour_trip_ids.py b/activitysim/abm/models/util/test/test_flexible_tour_trip_ids.py index 178025160a..288e11cbc5 100644 --- a/activitysim/abm/models/util/test/test_flexible_tour_trip_ids.py +++ b/activitysim/abm/models/util/test/test_flexible_tour_trip_ids.py @@ -3,7 +3,7 @@ import pandas as pd -from ..canonical_ids import ( +from activitysim.abm.models.util.canonical_ids import ( determine_flavors_from_alts_file, determine_mandatory_tour_flavors, ) diff --git a/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py b/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py index 3f01446087..0fdb5e1e66 100644 --- a/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py +++ b/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py @@ -8,9 +8,8 @@ import pandas.testing as pdt import pytest -from activitysim.core import inject - -from ..tour_frequency import process_mandatory_tours +from activitysim.abm.models.util.tour_frequency import process_mandatory_tours +from activitysim.core import workflow @pytest.fixture(scope="module") @@ -18,13 +17,6 @@ def configs_dir(): return os.path.join(os.path.dirname(__file__), "configs") -def setup_function(): - configs_dir = 
os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) - - def mandatory_tour_frequency_alternatives(): configs_dir = os.path.join(os.path.dirname(__file__), "configs") f = os.path.join(configs_dir, "mandatory_tour_frequency_alternatives.csv") @@ -34,6 +26,8 @@ def mandatory_tour_frequency_alternatives(): def test_mtf(): + state = workflow.State.make_default(__file__) + persons = pd.DataFrame( { "is_worker": [True, True, False, False], @@ -53,7 +47,9 @@ def test_mtf(): tour_frequency_alternatives = mandatory_tour_frequency_alternatives() - mandatory_tours = process_mandatory_tours(persons, tour_frequency_alternatives) + mandatory_tours = process_mandatory_tours( + state, persons, tour_frequency_alternatives + ) idx = mandatory_tours.index diff --git a/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py b/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py index b7fac80442..6f25a87353 100644 --- a/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py @@ -2,25 +2,15 @@ # See full license in LICENSE.txt. -import os - import pandas as pd import pandas.testing as pdt -import pytest - -from activitysim.core import inject - -from ..tour_frequency import process_non_mandatory_tours - -def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) +from activitysim.abm.models.util.tour_frequency import process_non_mandatory_tours +from activitysim.core import workflow def test_nmtf(): + state = workflow.State.make_default(__file__) persons = pd.DataFrame( { @@ -42,7 +32,7 @@ def test_nmtf(): tour_counts.index = persons.index # assign person ids to the index # - create the non_mandatory tours - nmt = process_non_mandatory_tours(persons, tour_counts) + nmt = process_non_mandatory_tours(state, persons, tour_counts) idx = nmt.index diff --git a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py index 4e4325b056..3c3cc5c90f 100644 --- a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py @@ -1,39 +1,23 @@ # ActivitySim # See full license in LICENSE.txt. 
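In the unit tests above, the deleted setup_function/teardown_function pairs that seeded and cleared global injectables are replaced by building a throwaway State inside each test. A minimal sketch of that pattern, with a hypothetical test name (the State and filesystem calls are the ones used in the diff):

    from activitysim.core import workflow

    def test_example():
        # each test constructs its own isolated state instead of registering
        # configs_dir / output_dir as global injectables
        state = workflow.State.make_default(__file__)

        # config files and specs are resolved through the state's filesystem
        # (was config.config_file_path / simulate.read_model_spec)
        spec = state.filesystem.read_model_spec(file_name="cdap_indiv_and_hhsize1.csv")

        # run-time settings are plain attributes rather than injectables
        state.settings.check_for_variability = True
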
-import os - -import numpy as np import pandas as pd import pandas.testing as pdt -import pytest - -from activitysim.core import inject -from ..vectorize_tour_scheduling import ( +from activitysim.abm.models.util.vectorize_tour_scheduling import ( get_previous_tour_by_tourid, vectorize_tour_scheduling, ) - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() - - -def setup_function(): - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) +from activitysim.core import workflow def test_vts(): - - inject.add_injectable("settings", {}) + state = workflow.State.make_default(__file__) # note: need 0 duration tour on one end of day to guarantee at least one available tour alts = pd.DataFrame({"start": [1, 1, 2, 3], "end": [1, 4, 5, 6]}) alts["duration"] = alts.end - alts.start - inject.add_injectable("tdd_alts", alts) + state.add_injectable("tdd_alts", alts) current_tour_person_ids = pd.Series(["b", "c"], index=["d", "e"]) @@ -63,16 +47,17 @@ def test_vts(): persons = pd.DataFrame({"income": [20, 30, 25]}, index=[1, 2, 3]) - inject.add_table("persons", persons) + state.add_table("persons", persons) spec = pd.DataFrame({"Coefficient": [1.2]}, index=["income"]) spec.index.name = "Expression" - inject.add_injectable("check_for_variability", True) + state.settings.check_for_variability = True - timetable = inject.get_injectable("timetable") + timetable = state.get_injectable("timetable") tdd_choices = vectorize_tour_scheduling( + state, tours, persons, alts, diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index fed9c15579..be07c33aa0 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -1,34 +1,34 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd +from activitysim.abm.models.util import logsums as logsum from activitysim.abm.tables.size_terms import tour_destination_size_terms -from activitysim.core import config, inject, los, pipeline, simulate, tracing +from activitysim.core import config, los, simulate, tracing, workflow from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import reindex -from . import logsums as logsum - logger = logging.getLogger(__name__) DUMP = False -class SizeTermCalculator(object): +class SizeTermCalculator: """ convenience object to provide size_terms for a selector (e.g. non_mandatory) for various segments (e.g. 
tour_type or purpose) returns size terms for specified segment in df or series form """ - def __init__(self, size_term_selector): - + def __init__(self, state: workflow.State, size_term_selector): # do this once so they can request size_terms for various segments (tour_type or purpose) - land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") + land_use = state.get_dataframe("land_use") + size_terms = state.get_injectable("size_terms") self.destination_size_terms = tour_destination_size_terms( land_use, size_terms, size_term_selector ) @@ -59,27 +59,22 @@ def dest_size_terms_df(self, segment_name, trace_label): return size_terms - # def dest_size_terms_series(self, segment_name): - # # return size terms as as series - # # convenient (and no copy overhead) if reindexing and assigning into alts column - # return self.destination_size_terms[segment_name] - def _destination_sample( - spec_segment_name, - choosers, + state: workflow.State, + spec_segment_name: str, + choosers: pd.DataFrame, destination_size_terms, skims, estimator, model_settings, alt_dest_col_name, - chunk_size, chunk_tag, - trace_label, + trace_label: str, zone_layer=None, ): - model_spec = simulate.spec_for_segment( + state, model_settings, spec_id="SAMPLE_SPEC", segment_name=spec_segment_name, @@ -89,7 +84,7 @@ def _destination_sample( logger.info("running %s with %d tours", trace_label, len(choosers)) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if state.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count @@ -109,9 +104,10 @@ def _destination_sample( if constants is not None: locals_d.update(constants) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_sample( + state, choosers, alternatives=destination_size_terms, sample_size=sample_size, @@ -120,7 +116,7 @@ def _destination_sample( spec=model_spec, skims=skims, locals_d=locals_d, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer=zone_layer, @@ -137,6 +133,7 @@ def _destination_sample( def destination_sample( + state: workflow.State, spec_segment_name, choosers, model_settings, @@ -146,7 +143,6 @@ def destination_sample( chunk_size, trace_label, ): - chunk_tag = "tour_destination.sample" # create wrapper with keys for this lookup @@ -164,6 +160,7 @@ def destination_sample( alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _destination_sample( + state, spec_segment_name, choosers, destination_size_terms, @@ -171,7 +168,6 @@ def destination_sample( estimator, model_settings, alt_dest_col_name, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, ) @@ -229,7 +225,7 @@ def aggregate_size_terms(dest_size_terms, network_los): return MAZ_size_terms, TAZ_size_terms -def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): +def choose_MAZ_for_TAZ(state: workflow.State, taz_sample, MAZ_size_terms, trace_label): """ Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ @@ -251,8 +247,8 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): # 542963 53 0.004224 2 13243 # 542963 59 0.008628 1 13243 - trace_hh_id = 
inject.get_injectable("trace_hh_id", None) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) + trace_hh_id = state.settings.trace_hh_id + have_trace_targets = trace_hh_id and state.tracing.has_trace_targets(taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -262,8 +258,8 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): assert CHOOSER_ID is not None # write taz choices, pick_counts, probs - trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df( + trace_targets = state.tracing.trace_targets(taz_sample) + state.tracing.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), transpose=False, @@ -331,9 +327,11 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): if have_trace_targets: # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term] - maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID) + maz_sizes_trace_targets = state.tracing.trace_targets( + maz_sizes, slicer=CHOOSER_ID + ) trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df( + state.tracing.trace_df( trace_maz_sizes, label=tracing.extend_trace_label(trace_label, "maz_sizes"), transpose=False, @@ -364,7 +362,7 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1)) assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) - rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) + rands = state.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) rands = rands.reshape(-1, 1) assert len(rands) == num_choosers * taz_sample_size assert len(rands) == maz_probs.shape[0] @@ -382,12 +380,11 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - - taz_choices_trace_targets = tracing.trace_targets( + taz_choices_trace_targets = state.tracing.trace_targets( taz_choices, slicer=CHOOSER_ID ) trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df( + state.tracing.trace_df( trace_taz_choices_df, label=tracing.extend_trace_label(trace_label, "taz_choices"), transpose=False, @@ -413,7 +410,7 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), transpose=False, @@ -429,7 +426,7 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), transpose=False, @@ -443,7 +440,7 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): ) df = pd.concat([lhs_df, df], axis=1) df["rand"] = rands[taz_choices_trace_targets] - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), transpose=False, @@ -460,16 +457,15 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): def destination_presample( + state: workflow.State, spec_segment_name, choosers, model_settings, network_los, destination_size_terms, estimator, - chunk_size, trace_label, ): - trace_label = 
tracing.extend_trace_label(trace_label, "presample") chunk_tag = "tour_destination.presample" @@ -494,6 +490,7 @@ def destination_presample( skims = skim_dict.wrap(ORIG_TAZ, DEST_TAZ) taz_sample = _destination_sample( + state, spec_segment_name, choosers, TAZ_size_terms, @@ -501,14 +498,13 @@ def destination_presample( estimator, model_settings, DEST_TAZ, - chunk_size, chunk_tag=chunk_tag, trace_label=trace_label, zone_layer="taz", ) # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total - maz_choices = choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label) + maz_choices = choose_MAZ_for_TAZ(state, taz_sample, MAZ_size_terms, trace_label) assert DEST_MAZ in maz_choices maz_choices = maz_choices.rename(columns={DEST_MAZ: alt_dest_col_name}) @@ -517,6 +513,7 @@ def destination_presample( def run_destination_sample( + state, spec_segment_name, tours, persons_merged, @@ -527,7 +524,6 @@ def run_destination_sample( chunk_size, trace_label, ): - # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge) chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] @@ -553,7 +549,7 @@ def run_destination_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not state.settings.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -561,24 +557,24 @@ def run_destination_sample( ) if pre_sample_taz: - logger.info( "Running %s destination_presample with %d tours" % (trace_label, len(tours)) ) choices = destination_presample( + state, spec_segment_name, choosers, model_settings, network_los, destination_size_terms, estimator, - chunk_size, trace_label, ) else: choices = destination_sample( + state, spec_segment_name, choosers, model_settings, @@ -597,6 +593,7 @@ def run_destination_sample( def run_destination_logsums( + state: workflow.State, tour_purpose, persons_merged, destination_sample, @@ -626,7 +623,9 @@ def run_destination_logsums( +-----------+--------------+----------------+------------+----------------+ """ - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) # if special person id is passed chooser_id_column = model_settings.get("CHOOSER_ID_COLUMN", "person_id") @@ -648,10 +647,11 @@ def run_destination_logsums( logger.info("Running %s with %s rows", trace_label, len(choosers)) - tracing.dump_df(DUMP, persons_merged, trace_label, "persons_merged") - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + state.tracing.dump_df(DUMP, persons_merged, trace_label, "persons_merged") + state.tracing.dump_df(DUMP, choosers, trace_label, "choosers") logsums = logsum.compute_logsums( + state, choosers, tour_purpose, logsum_settings, @@ -668,6 +668,7 @@ def run_destination_logsums( def run_destination_simulate( + state: workflow.State, spec_segment_name, tours, persons_merged, @@ -688,6 +689,7 @@ def run_destination_simulate( chunk_tag = "tour_destination.simulate" model_spec = simulate.spec_for_segment( + state, model_settings, spec_id="SPEC", segment_name=spec_segment_name, @@ -729,7 +731,7 @@ def run_destination_simulate( destination_size_terms.size_term, destination_sample[alt_dest_col_name] ) - 
tracing.dump_df(DUMP, destination_sample, trace_label, "alternatives") + state.tracing.dump_df(DUMP, destination_sample, trace_label, "alternatives") constants = config.get_model_constants(model_settings) @@ -750,11 +752,12 @@ def run_destination_simulate( if constants is not None: locals_d.update(constants) - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + state.tracing.dump_df(DUMP, choosers, trace_label, "choosers") - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_sample_simulate( + state, choosers, destination_sample, spec=model_spec, @@ -780,20 +783,20 @@ def run_destination_simulate( def run_tour_destination( - tours, - persons_merged, - want_logsums, - want_sample_table, + state: workflow.State, + tours: pd.DataFrame, + persons_merged: pd.DataFrame, + want_logsums: bool, + want_sample_table: bool, model_settings, - network_los, + network_los: los.Network_LOS, estimator, - chunk_size, - trace_hh_id, trace_label, skip_choice=False, ): - - size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) + size_term_calculator = SizeTermCalculator( + state, model_settings["SIZE_TERM_SELECTOR"] + ) # maps segment names to compact (integer) ids segments = model_settings["SEGMENTS"] @@ -807,7 +810,6 @@ def run_tour_destination( choices_list = [] sample_list = [] for segment_name in segments: - segment_trace_label = tracing.extend_trace_label(trace_label, segment_name) if chooser_segment_column is not None: @@ -829,6 +831,7 @@ def run_tour_destination( # - destination_sample spec_segment_name = segment_name # spec_segment_name is segment_name location_sample_df = run_destination_sample( + state, spec_segment_name, choosers, persons_merged, @@ -836,25 +839,27 @@ def run_tour_destination( network_los, segment_destination_size_terms, estimator, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=tracing.extend_trace_label(segment_trace_label, "sample"), ) # - destination_logsums tour_purpose = segment_name # tour_purpose is segment_name location_sample_df = run_destination_logsums( + state, tour_purpose, persons_merged, location_sample_df, model_settings, network_los, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=tracing.extend_trace_label(segment_trace_label, "logsums"), ) # - destination_simulate spec_segment_name = segment_name # spec_segment_name is segment_name choices = run_destination_simulate( + state, spec_segment_name, choosers, persons_merged, @@ -864,7 +869,7 @@ def run_tour_destination( network_los=network_los, destination_size_terms=segment_destination_size_terms, estimator=estimator, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=tracing.extend_trace_label(segment_trace_label, "simulate"), skip_choice=skip_choice, ) diff --git a/activitysim/abm/models/util/tour_frequency.py b/activitysim/abm/models/util/tour_frequency.py index a73482fe5d..9c9d8a567e 100644 --- a/activitysim/abm/models/util/tour_frequency.py +++ b/activitysim/abm/models/util/tour_frequency.py @@ -1,12 +1,14 @@ # ActivitySim # See full license in LICENSE.txt. 
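The tour destination hunks above also show how run-time options move off the global config module: values previously fetched with config.setting(name, default) become attributes of state.settings, and tracing helpers hang off state.tracing. A minimal sketch under those assumptions (the wrapper function itself is hypothetical):

    import pandas as pd
    from activitysim.core import workflow

    def destination_options(state: workflow.State, choosers: pd.DataFrame, trace_label: str):
        # was config.setting("disable_destination_sampling", False), etc.
        disable_sampling = state.settings.disable_destination_sampling
        log_alt_losers = state.settings.log_alt_losers
        chunk_size = state.settings.chunk_size

        # tracing is scoped to the state rather than the tracing module
        state.tracing.dump_df(True, choosers, trace_label, "choosers")
        return disable_sampling, log_alt_losers, chunk_size
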
+from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import config from activitysim.abm.models.util.canonical_ids import set_tour_index +from activitysim.core import config, workflow from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -93,7 +95,9 @@ def create_tours(tour_counts, tour_category, parent_col="person_id"): """ # set these here to ensure consistency across different tour categories - assert tour_category in ["mandatory", "non_mandatory", "atwork", "joint"] + + # do not enforce this here, other categories are possible + # assert tour_category in ["mandatory", "non_mandatory", "atwork", "joint"] tours["tour_category"] = tour_category # for joint tours, the correct number will be filled in after participation step @@ -168,7 +172,9 @@ def process_tours( return tours -def process_mandatory_tours(persons, mandatory_tour_frequency_alts): +def process_mandatory_tours( + state: workflow.State, persons, mandatory_tour_frequency_alts +): """ This method processes the mandatory_tour_frequency column that comes out of the model of the same name and turns into a DataFrame that represents the @@ -239,7 +245,7 @@ def process_mandatory_tours(persons, mandatory_tour_frequency_alts): tours["household_id"] = tours_merged.household_id # assign stable (predictable) tour_id - set_tour_index(tours) + set_tour_index(state, tours) """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -258,7 +264,7 @@ def process_mandatory_tours(persons, mandatory_tour_frequency_alts): return tours -def process_non_mandatory_tours(persons, tour_counts): +def process_non_mandatory_tours(state: workflow.State, persons, tour_counts): """ This method processes the non_mandatory_tour_frequency column that comes out of the model of the same name and turns into a DataFrame that @@ -292,7 +298,7 @@ def process_non_mandatory_tours(persons, tour_counts): tours["origin"] = reindex(persons.home_zone_id, tours.person_id) # assign stable (predictable) tour_id - set_tour_index(tours) + set_tour_index(state, tours) """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -311,7 +317,11 @@ def process_non_mandatory_tours(persons, tour_counts): return tours -def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): +def process_atwork_subtours( + state: workflow.State, + work_tours: pd.DataFrame, + atwork_subtour_frequency_alts: pd.DataFrame, +): """ This method processes the atwork_subtour_frequency column that comes @@ -378,7 +388,7 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): tours = pd.merge(tours, work_tours, left_on=parent_col, right_index=True) # assign stable (predictable) tour_id - set_tour_index(tours, parent_tour_num_col="parent_tour_num") + set_tour_index(state, tours, parent_tour_num_col="parent_tour_num") """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -399,7 +409,12 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): return tours -def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_persons): +def process_joint_tours( + state: workflow.State, + joint_tour_frequency, + joint_tour_frequency_alts, + point_persons, +): """ This method processes the joint_tour_frequency column that comes out of the model of the same name and turns into a DataFrame that represents the @@ -443,7 +458,7 @@ def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_p 
tours["origin"] = reindex(point_persons.home_zone_id, tours.household_id) # assign stable (predictable) tour_id - set_tour_index(tours, is_joint=True) + set_tour_index(state, tours, is_joint=True) """ household_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -463,6 +478,7 @@ def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_p def process_joint_tours_frequency_composition( + state: workflow.State, joint_tour_frequency_composition, joint_tour_frequency_composition_alts, point_persons, @@ -496,6 +512,7 @@ def process_joint_tours_frequency_composition( assert not joint_tour_frequency_composition.isnull().any() tours = process_tours_frequency_composition( + state, joint_tour_frequency_composition.dropna(), joint_tour_frequency_composition_alts, tour_category="joint", @@ -510,7 +527,7 @@ def process_joint_tours_frequency_composition( tours["origin"] = reindex(point_persons.home_zone_id, tours.household_id) # assign stable (predictable) tour_id - set_tour_index(tours, is_joint=True) + set_tour_index(state, tours, is_joint=True) """ household_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -530,6 +547,7 @@ def process_joint_tours_frequency_composition( def process_tours_frequency_composition( + state: workflow.State, joint_tour_frequency_composition, joint_tour_frequency_composition_alts, tour_category, @@ -592,12 +610,14 @@ def process_tours_frequency_composition( 2588677 1 1 0 """ - tours = create_joint_tours(tour_counts, tour_category, parent_col) + tours = create_joint_tours(state, tour_counts, tour_category, parent_col) return tours -def create_joint_tours(tour_counts, tour_category, parent_col="person_id"): +def create_joint_tours( + state: workflow.State, tour_counts, tour_category, parent_col="person_id" +): """ This method processes the tour_frequency column that comes out of the model of the same name and turns into a DataFrame that @@ -639,7 +659,7 @@ def create_joint_tours(tour_counts, tour_category, parent_col="person_id"): """ model_settings_file_name = "joint_tour_frequency_composition.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) alts_table_structure = model_settings.get("ALTS_TABLE_STRUCTURE", None) assert ( diff --git a/activitysim/abm/models/util/tour_od.py b/activitysim/abm/models/util/tour_od.py index 91377bd6db..26eefd7352 100644 --- a/activitysim/abm/models/util/tour_od.py +++ b/activitysim/abm/models/util/tour_od.py @@ -1,29 +1,30 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging +from typing import Optional import numpy as np import pandas as pd from orca import orca -from activitysim.abm.tables.size_terms import tour_destination_size_terms +from activitysim.abm.models.util import logsums as logsum +from activitysim.abm.models.util import trip +from activitysim.abm.models.util.tour_destination import SizeTermCalculator from activitysim.core import ( config, expressions, - inject, logit, los, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import reindex -from . import logsums as logsum -from . 
import trip - logger = logging.getLogger(__name__) DUMP = False @@ -50,7 +51,6 @@ def create_od_id_col(df, origin_col, destination_col): def _get_od_cols_from_od_id( df, orig_col_name=None, dest_col_name=None, od_id_col="choice" ): - df[orig_col_name] = df[od_id_col].str.split("_").str[0].astype(int) df[dest_col_name] = df[od_id_col].str.split("_").str[1].astype(int) @@ -58,13 +58,14 @@ def _get_od_cols_from_od_id( def _create_od_alts_from_dest_size_terms( + state: workflow.State, size_terms_df, segment_name, od_id_col=None, origin_id_col="origin", dest_id_col="destination", origin_filter=None, - origin_attr_cols=None, + origin_attr_cols: Optional[list[str]] = None, ): """ Extend destination size terms to create dataframe representing the @@ -73,7 +74,9 @@ def _create_od_alts_from_dest_size_terms( attributes of the origins can be preserved. """ - land_use = inject.get_table("land_use").to_frame(columns=origin_attr_cols) + land_use = state.get_dataframe("land_use") + if origin_attr_cols is not None: + land_use = land_use[origin_attr_cols] if origin_filter: origins = land_use.query(origin_filter) @@ -113,6 +116,7 @@ def _create_od_alts_from_dest_size_terms( def _od_sample( + state: workflow.State, spec_segment_name, choosers, network_los, @@ -127,8 +131,8 @@ def _od_sample( chunk_tag, trace_label, ): - model_spec = simulate.spec_for_segment( + state, model_settings, spec_id="SAMPLE_SPEC", segment_name=spec_segment_name, @@ -142,7 +146,7 @@ def _od_sample( logger.info("running %s with %d tours", trace_label, len(choosers)) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( + if state.settings.disable_destination_sampling or ( estimator and estimator.want_unsampled_alternatives ): # FIXME interaction_sample will return unsampled complete alternatives @@ -170,6 +174,7 @@ def _od_sample( origin_attr_cols = model_settings["ORIGIN_ATTR_COLS_TO_USE"] od_alts_df = _create_od_alts_from_dest_size_terms( + state, destination_size_terms, spec_segment_name, od_id_col=alt_col_name, @@ -186,6 +191,7 @@ def _od_sample( logger.error("Alts df is missing origin skim key column.") choices = interaction_sample( + state, choosers, alternatives=od_alts_df, sample_size=sample_size, @@ -212,7 +218,6 @@ def od_sample( chunk_size, trace_label, ): - chunk_tag = "tour_od.sample" origin_col_name = model_settings["ORIG_COL_NAME"] @@ -253,29 +258,22 @@ def map_maz_to_taz(s, network_los): return s.map(maz_to_taz) -def map_maz_to_ext_taz(s): - land_use = ( - inject.get_table("land_use").to_frame(columns=["external_TAZ"]).external_TAZ - ) +def map_maz_to_ext_taz(state: workflow.State, s): + land_use = state.get_dataframe("land_use", columns=["external_TAZ"]).external_TAZ return s.map(land_use).astype(int) -def map_maz_to_ext_maz(s): - land_use = ( - inject.get_table("land_use").to_frame(columns=["external_MAZ"]).external_MAZ - ) +def map_maz_to_ext_maz(state: workflow.State, s): + land_use = state.get_dataframe("land_use", columns=["external_MAZ"]).external_MAZ return s.map(land_use).astype(int) -def map_ext_maz_to_maz(s): - land_use = ( - inject.get_table("land_use").to_frame(columns=["original_MAZ"]).original_MAZ - ) +def map_ext_maz_to_maz(state: workflow.State, s): + land_use = state.get_dataframe("land_use", columns=["original_MAZ"]).original_MAZ return s.map(land_use).astype(int) def aggregate_size_terms(dest_size_terms, network_los): - # aggregate MAZ_size_terms to TAZ_size_terms MAZ_size_terms = dest_size_terms.copy() @@ -310,7 +308,9 @@ def 
aggregate_size_terms(dest_size_terms, network_los): return MAZ_size_terms, TAZ_size_terms +@workflow.func def choose_MAZ_for_TAZ( + state: workflow.State, taz_sample, MAZ_size_terms, trace_label, @@ -341,8 +341,8 @@ def choose_MAZ_for_TAZ( # 542963 53 0.004224 2 13243 # 542963 59 0.008628 1 13243 - trace_hh_id = inject.get_injectable("trace_hh_id", None) - have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) + trace_hh_id = state.settings.trace_hh_id + have_trace_targets = trace_hh_id and state.tracing.has_trace_targets(taz_sample) if have_trace_targets: trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") @@ -352,8 +352,8 @@ def choose_MAZ_for_TAZ( assert CHOOSER_ID is not None # write taz choices, pick_counts, probs - trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df( + trace_targets = state.tracing.trace_targets(taz_sample) + state.tracing.trace_df( taz_sample[trace_targets], label=tracing.extend_trace_label(trace_label, "taz_sample"), transpose=False, @@ -431,9 +431,11 @@ def choose_MAZ_for_TAZ( if have_trace_targets: # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term] - maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID) + maz_sizes_trace_targets = state.tracing.trace_targets( + maz_sizes, slicer=CHOOSER_ID + ) trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df( + state.tracing.trace_df( trace_maz_sizes, label=tracing.extend_trace_label(trace_label, "maz_sizes"), transpose=False, @@ -464,7 +466,7 @@ def choose_MAZ_for_TAZ( maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1)) assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) - rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) + rands = state.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size) rands = rands.reshape(-1, 1) assert len(rands) == num_choosers * taz_sample_size assert len(rands) == maz_probs.shape[0] @@ -484,12 +486,11 @@ def choose_MAZ_for_TAZ( taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - - taz_choices_trace_targets = tracing.trace_targets( + taz_choices_trace_targets = state.tracing.trace_targets( taz_choices, slicer=CHOOSER_ID ) trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df( + state.tracing.trace_df( trace_taz_choices_df, label=tracing.extend_trace_label(trace_label, "taz_choices"), transpose=False, @@ -515,7 +516,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), transpose=False, @@ -531,7 +532,7 @@ def choose_MAZ_for_TAZ( index=trace_taz_choices_df.index, ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), transpose=False, @@ -545,7 +546,7 @@ def choose_MAZ_for_TAZ( ) df = pd.concat([lhs_df, df], axis=1) df["rand"] = rands[taz_choices_trace_targets] - tracing.trace_df( + state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), transpose=False, @@ -562,7 +563,9 @@ def choose_MAZ_for_TAZ( return taz_choices_w_maz +@workflow.func def od_presample( + state: workflow.State, spec_segment_name, choosers, model_settings, @@ -572,7 +575,6 @@ def od_presample( chunk_size, trace_label, ): - trace_label = 
tracing.extend_trace_label(trace_label, "presample") chunk_tag = "tour_od.presample" @@ -591,6 +593,7 @@ def od_presample( skims = skim_dict.wrap(ORIG_TAZ, DEST_TAZ) orig_MAZ_dest_TAZ_sample = _od_sample( + state, spec_segment_name, choosers, network_los, @@ -617,6 +620,7 @@ def od_presample( # MAZ size_term fraction of TAZ total maz_choices = choose_MAZ_for_TAZ( + state, orig_MAZ_dest_TAZ_sample, MAZ_size_terms, trace_label, @@ -635,56 +639,56 @@ def od_presample( return maz_choices -class SizeTermCalculator(object): - """ - convenience object to provide size_terms for a selector (e.g. - non_mandatory) for various segments (e.g. tour_type or purpose) - returns size terms for specified segment in df or series form. - """ - - def __init__(self, size_term_selector): - - # do this once so they can request size_terms for various segments (tour_type or purpose) - land_use = inject.get_table("land_use") - self.land_use = land_use - size_terms = inject.get_injectable("size_terms") - self.destination_size_terms = tour_destination_size_terms( - self.land_use, size_terms, size_term_selector - ) - - assert not self.destination_size_terms.isna().any(axis=None) - - def omnibus_size_terms_df(self): - return self.destination_size_terms - - def dest_size_terms_df(self, segment_name, trace_label): - # return size terms as df with one column named 'size_term' - # convenient if creating or merging with alts - - size_terms = self.destination_size_terms[[segment_name]].copy() - size_terms.columns = ["size_term"] - - # FIXME - no point in considering impossible alternatives (where dest size term is zero) - logger.debug( - f"SizeTermCalculator dropping {(~(size_terms.size_term > 0)).sum()} " - f"of {len(size_terms)} rows where size_term is zero for {segment_name}" - ) - size_terms = size_terms[size_terms.size_term > 0] - - if len(size_terms) == 0: - logger.warning( - f"SizeTermCalculator: no zones with non-zero size terms for {segment_name} in {trace_label}" - ) - - return size_terms - - def dest_size_terms_series(self, segment_name): - # return size terms as as series - # convenient (and no copy overhead) if reindexing and assigning into alts column - return self.destination_size_terms[segment_name] +# class SizeTermCalculatorOD: # class SizeTermCalculator +# """ +# convenience object to provide size_terms for a selector (e.g. +# non_mandatory) for various segments (e.g. tour_type or purpose) +# returns size terms for specified segment in df or series form. 
+# """ +# +# def __init__(self, size_term_selector): +# # do this once so they can request size_terms for various segments (tour_type or purpose) +# land_use = state.checkpoint.load_dataframe("land_use") +# self.land_use = land_use +# size_terms = state.get_injectable("size_terms") +# self.destination_size_terms = tour_destination_size_terms( +# self.land_use, size_terms, size_term_selector +# ) +# +# assert not self.destination_size_terms.isna().any(axis=None) +# +# def omnibus_size_terms_df(self): +# return self.destination_size_terms +# +# def dest_size_terms_df(self, segment_name, trace_label): +# # return size terms as df with one column named 'size_term' +# # convenient if creating or merging with alts +# +# size_terms = self.destination_size_terms[[segment_name]].copy() +# size_terms.columns = ["size_term"] +# +# # FIXME - no point in considering impossible alternatives (where dest size term is zero) +# logger.debug( +# f"SizeTermCalculator dropping {(~(size_terms.size_term > 0)).sum()} " +# f"of {len(size_terms)} rows where size_term is zero for {segment_name}" +# ) +# size_terms = size_terms[size_terms.size_term > 0] +# +# if len(size_terms) == 0: +# logger.warning( +# f"SizeTermCalculator: no zones with non-zero size terms for {segment_name} in {trace_label}" +# ) +# +# return size_terms +# +# def dest_size_terms_series(self, segment_name): +# # return size terms as as series +# # convenient (and no copy overhead) if reindexing and assigning into alts column +# return self.destination_size_terms[segment_name] def run_od_sample( + state, spec_segment_name, tours, model_settings, @@ -694,8 +698,8 @@ def run_od_sample( chunk_size, trace_label, ): - model_spec = simulate.spec_for_segment( + state, model_settings, spec_id="SAMPLE_SPEC", segment_name=spec_segment_name, @@ -716,7 +720,7 @@ def run_od_sample( # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): + if pre_sample_taz and not state.settings.want_dest_choice_presampling: pre_sample_taz = False logger.info( f"Disabled destination zone presampling for {trace_label} " @@ -724,12 +728,12 @@ def run_od_sample( ) if pre_sample_taz: - logger.info( "Running %s destination_presample with %d tours" % (trace_label, len(tours)) ) choices = od_presample( + state, spec_segment_name, choosers, model_settings, @@ -756,6 +760,7 @@ def run_od_sample( def run_od_logsums( + state: workflow.State, spec_segment_name, tours_merged_df, od_sample, @@ -773,7 +778,9 @@ def run_od_logsums( (person, OD_id) pair in od_sample, and computing the logsum of all the utilities """ chunk_tag = "tour_od.logsums" - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) origin_id_col = model_settings["ORIG_COL_NAME"] dest_id_col = model_settings["DEST_COL_NAME"] tour_od_id_col = get_od_id_col(origin_id_col, dest_id_col) @@ -791,13 +798,14 @@ def run_od_logsums( logger.info("Running %s with %s rows", trace_label, len(choosers)) - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + state.tracing.dump_df(DUMP, choosers, trace_label, "choosers") # run trip mode choice to compute tour mode choice logsums if logsum_settings.get("COMPUTE_TRIP_MODE_CHOICE_LOGSUMS", False): - pseudo_tours = choosers.copy() - trip_mode_choice_settings = 
config.read_model_settings("trip_mode_choice") + trip_mode_choice_settings = state.filesystem.read_model_settings( + "trip_mode_choice" + ) # tours_merged table doesn't yet have all the cols it needs to be called (e.g. # home_zone_id), so in order to compute tour mode choice/trip mode choice logsums @@ -826,9 +834,10 @@ def run_od_logsums( # tour dest as separate column in the trips table bc the trip mode choice # preprocessor isn't able to get the tour dest from the tours table bc the # tours don't yet have ODs. - stop_frequency_alts = inject.get_injectable("stop_frequency_alts") + stop_frequency_alts = state.get_injectable("stop_frequency_alts") pseudo_tours["tour_destination"] = pseudo_tours[dest_id_col] trips = trip.initialize_from_tours( + state, pseudo_tours, stop_frequency_alts, [origin_id_col, dest_id_col, "tour_destination", "unique_id"], @@ -843,7 +852,7 @@ def run_od_logsums( nest_spec = config.get_logit_model_settings(logsum_settings) # actual coeffs dont matter here, just need them to load the nest structure - coefficients = simulate.get_segment_coefficients( + coefficients = state.filesystem.get_segment_coefficients( logsum_settings, pseudo_tours.iloc[0]["tour_purpose"] ) nest_spec = simulate.eval_nest_coefficients( @@ -865,17 +874,19 @@ def run_od_logsums( if col not in trips: logsum_trips[col] = reindex(pseudo_tours[col], logsum_trips.unique_id) - pipeline.replace_table("trips", logsum_trips) - tracing.register_traceable_table("trips", logsum_trips) - pipeline.get_rn_generator().add_channel("trips", logsum_trips) + state.add_table("trips", logsum_trips) + state.tracing.register_traceable_table("trips", logsum_trips) + state.get_rn_generator().add_channel("trips", logsum_trips) - # run trip mode choice on pseudo-trips. use orca instead of pipeline to + # run trip mode choice on pseudo-trips. 
use a direct call instead of pipeline to # execute the step because pipeline can only handle one open step at a time - orca.run(["trip_mode_choice"]) + from activitysim.abm.models.trip_mode_choice import trip_mode_choice + + trip_mode_choice(state, logsum_trips, state.get("network_los")) # grab trip mode choice logsums and pivot by tour mode and direction, index # on tour_id to enable merge back to choosers table - trips = inject.get_table("trips").to_frame() + trips = state.get_dataframe("trips") trip_dir_mode_logsums = trips.pivot( index=["tour_id", tour_od_id_col], columns=["tour_mode", "outbound"], @@ -895,14 +906,15 @@ def run_od_logsums( choosers.reset_index(inplace=True) choosers.set_index(choosers_og_index, inplace=True) - pipeline.get_rn_generator().drop_channel("trips") - tracing.deregister_traceable_table("trips") + state.get_rn_generator().drop_channel("trips") + state.tracing.deregister_traceable_table("trips") assert (od_sample.index == choosers.index).all() for col in new_cols: od_sample[col] = choosers[col] logsums = logsum.compute_logsums( + state, choosers, spec_segment_name, logsum_settings, @@ -923,6 +935,7 @@ def run_od_logsums( def run_od_simulate( + state: workflow.State, spec_segment_name, tours, od_sample, @@ -940,6 +953,7 @@ def run_od_simulate( """ model_spec = simulate.spec_for_segment( + state, model_settings, spec_id="SPEC", segment_name=spec_segment_name, @@ -980,12 +994,12 @@ def run_od_simulate( ) # also have to add origin attribute columns - lu = inject.get_table("land_use").to_frame(columns=origin_attr_cols) + lu = state.get_dataframe("land_use", columns=origin_attr_cols) od_sample = pd.merge( od_sample, lu, left_on=origin_col_name, right_index=True, how="left" ) - tracing.dump_df(DUMP, od_sample, trace_label, "alternatives") + state.tracing.dump_df(DUMP, od_sample, trace_label, "alternatives") constants = config.get_model_constants(model_settings) @@ -1006,8 +1020,9 @@ def run_od_simulate( if constants is not None: locals_d.update(constants) - tracing.dump_df(DUMP, choosers, trace_label, "choosers") + state.tracing.dump_df(DUMP, choosers, trace_label, "choosers") choices = interaction_sample_simulate( + state, choosers, od_sample, spec=model_spec, @@ -1022,6 +1037,7 @@ def run_od_simulate( ) if not want_logsums: + # expand pd.Series to a one-column DataFrame choices = choices.to_frame("choice") choices = _get_od_cols_from_od_id(choices, origin_col_name, dest_col_name) @@ -1030,6 +1046,7 @@ def run_od_simulate( def run_tour_od( + state, tours, persons, want_logsums, @@ -1041,8 +1058,9 @@ def run_tour_od( trace_hh_id, trace_label, ): - - size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) + size_term_calculator = SizeTermCalculator( + state, model_settings["SIZE_TERM_SELECTOR"] + ) preprocessor_settings = model_settings.get("preprocessor", None) origin_col_name = model_settings["ORIG_COL_NAME"] @@ -1057,12 +1075,11 @@ def run_tour_od( choices_list = [] sample_list = [] for segment_name in segments: - choosers = tours[tours[chooser_segment_column] == segment_name] choosers = pd.merge( choosers, - persons.to_frame(columns=["is_university", "demographic_segment"]), + persons[["is_university", "demographic_segment"]], left_on="person_id", right_index=True, ) @@ -1070,6 +1087,7 @@ def run_tour_od( # - annotate choosers if preprocessor_settings: expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, trace_label=trace_label, @@ -1090,6 +1108,7 @@ def run_tour_od( spec_segment_name = segment_name # 
spec_segment_name is segment_name od_sample_df = run_od_sample( + state, spec_segment_name, choosers, model_settings, @@ -1108,7 +1127,7 @@ def run_tour_od( # sampled alts using internal mazs, so now we # have to convert to using the external tazs od_sample_df[origin_col_name] = map_maz_to_ext_maz( - od_sample_df[origin_col_name] + state, od_sample_df[origin_col_name] ) else: raise ValueError( @@ -1118,6 +1137,7 @@ def run_tour_od( # - destination_logsums od_sample_df = run_od_logsums( + state, spec_segment_name, choosers, od_sample_df, @@ -1133,6 +1153,7 @@ def run_tour_od( # - od_simulate choices = run_od_simulate( + state, spec_segment_name, choosers, od_sample_df, diff --git a/activitysim/abm/models/util/tour_scheduling.py b/activitysim/abm/models/util/tour_scheduling.py index 1d6de73161..bf69c62349 100644 --- a/activitysim/abm/models/util/tour_scheduling.py +++ b/activitysim/abm/models/util/tour_scheduling.py @@ -1,34 +1,34 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import pandas as pd -from activitysim.core import config, expressions, inject, simulate - -from . import estimation -from . import vectorize_tour_scheduling as vts +from activitysim.abm.models.util import vectorize_tour_scheduling as vts +from activitysim.core import config, estimation, expressions, simulate, workflow logger = logging.getLogger(__name__) def run_tour_scheduling( + state: workflow.State, model_name, chooser_tours, persons_merged, tdd_alts, tour_segment_col, - chunk_size, - trace_hh_id, ): - trace_label = model_name model_settings_file_name = f"{model_name}.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) if "LOGSUM_SETTINGS" in model_settings: - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) logsum_columns = logsum_settings.get("LOGSUM_CHOOSER_COLUMNS", []) else: logsum_columns = [] @@ -41,15 +41,16 @@ def run_tour_scheduling( persons_merged = expressions.filter_chooser_columns(persons_merged, chooser_columns) - timetable = inject.get_injectable("timetable") + timetable = state.get_injectable("timetable") # - run preprocessor to annotate choosers preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {"tt": timetable} + locals_d = {"tt": timetable.attach_state(state)} locals_d.update(config.get_model_constants(model_settings)) expressions.assign_columns( + state, df=chooser_tours, model_settings=preprocessor_settings, locals_dict=locals_d, @@ -63,19 +64,18 @@ def run_tour_scheduling( specs = {} sharrow_skips = {} for spec_segment_name, spec_settings in spec_segment_settings.items(): - bundle_name = f"{model_name}_{spec_segment_name}" # estimator for this tour_segment estimator = estimation.manager.begin_estimation( - model_name=bundle_name, bundle_name=bundle_name + state, model_name=bundle_name, bundle_name=bundle_name ) spec_file_name = spec_settings["SPEC"] - model_spec = simulate.read_model_spec(file_name=spec_file_name) - coefficients_df = simulate.read_model_coefficients(spec_settings) + model_spec = state.filesystem.read_model_spec(file_name=spec_file_name) + coefficients_df = state.filesystem.read_model_coefficients(spec_settings) specs[spec_segment_name] = simulate.eval_coefficients( - model_spec, coefficients_df, estimator + state, model_spec, 
coefficients_df, estimator ) sharrow_skips[spec_segment_name] = spec_settings.get("sharrow_skip", False) @@ -109,13 +109,15 @@ def run_tour_scheduling( assert "TOUR_SPEC_SEGMENTS" not in model_settings assert tour_segment_col is None - estimator = estimation.manager.begin_estimation(model_name) + estimator = estimation.manager.begin_estimation(state, model_name) spec_file_name = model_settings["SPEC"] - model_spec = simulate.read_model_spec(file_name=spec_file_name) + model_spec = state.filesystem.read_model_spec(file_name=spec_file_name) sharrow_skip = model_settings.get("sharrow_skip", False) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) if estimator: estimators[None] = estimator # add to local list @@ -135,6 +137,7 @@ def run_tour_scheduling( logger.info(f"Running {model_name} with %d tours", len(chooser_tours)) choices = vts.vectorize_tour_scheduling( + state, chooser_tours, persons_merged, tdd_alts, @@ -142,7 +145,7 @@ def run_tour_scheduling( tour_segments=tour_segments, tour_segment_col=tour_segment_col, model_settings=model_settings, - chunk_size=chunk_size, + chunk_size=state.settings.chunk_size, trace_label=trace_label, ) @@ -173,7 +176,7 @@ def run_tour_scheduling( tdds=choices.reindex(nth_tours.index), ) - timetable.replace_table() + timetable.replace_table(state) # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table diff --git a/activitysim/abm/models/util/trip.py b/activitysim/abm/models/util/trip.py index 870801e276..36a676e170 100644 --- a/activitysim/abm/models/util/trip.py +++ b/activitysim/abm/models/util/trip.py @@ -1,12 +1,14 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd from activitysim.abm.models.util.canonical_ids import set_trip_index -from activitysim.core import config, inject +from activitysim.core import workflow from activitysim.core.util import assign_in_place, reindex logger = logging.getLogger(__name__) @@ -148,16 +150,18 @@ def get_time_windows(residual, level): return np.concatenate(ranges, axis=1) -@inject.injectable() -def stop_frequency_alts(): +@workflow.cached_object +def stop_frequency_alts(state: workflow.State): # alt file for building trips even though simulation is simple_simulate not interaction_simulate - file_path = config.config_file_path("stop_frequency_alternatives.csv") + file_path = state.filesystem.get_config_file_path("stop_frequency_alternatives.csv") df = pd.read_csv(file_path, comment="#") df.set_index("alt", inplace=True) return df -def initialize_from_tours(tours, stop_frequency_alts, addtl_tour_cols_to_preserve=None): +def initialize_from_tours( + state: workflow.State, tours, stop_frequency_alts, addtl_tour_cols_to_preserve=None +): """ Instantiates a trips table based on tour-level attributes: stop frequency, tour origin, tour destination. 
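The stop_frequency_alts hunk above converts an inject.injectable into a workflow.cached_object, which is computed lazily from the state and then cached on it. A minimal sketch of defining and consuming such an object, assuming an illustrative object and file name:

    import pandas as pd
    from activitysim.core import workflow

    @workflow.cached_object
    def example_alts(state: workflow.State) -> pd.DataFrame:
        # evaluated on first access, then cached; the csv name is illustrative
        file_path = state.filesystem.get_config_file_path("example_alternatives.csv")
        return pd.read_csv(file_path, comment="#").set_index("alt")

    # consumers retrieve the cached object by name, e.g.
    # alts = state.get_injectable("example_alts")
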
@@ -278,7 +282,7 @@ def initialize_from_tours(tours, stop_frequency_alts, addtl_tour_cols_to_preserv else: trip_index_tour_id = "tour_id" - set_trip_index(trips, trip_index_tour_id) + set_trip_index(state, trips, trip_index_tour_id) del trips["tour_temp_index"] return trips diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 0882a6f741..775d84b7b8 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -1,13 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import chunk, config, expressions, inject, los, simulate +from activitysim.core import chunk, config, expressions, los, simulate from activitysim.core import timetable as tt -from activitysim.core import tracing +from activitysim.core import tracing, workflow from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import reindex @@ -19,11 +21,10 @@ RUN_ALTS_PREPROCESSOR_BEFORE_MERGE = True # see FIXME below before changing this -def skims_for_logsums(tour_purpose, model_settings, trace_label): - +def skims_for_logsums(state: workflow.State, tour_purpose, model_settings, trace_label): assert "LOGSUM_SETTINGS" in model_settings - network_los = inject.get_injectable("network_los") + network_los = state.get_injectable("network_los") skim_dict = network_los.get_default_skim_dict() @@ -92,7 +93,14 @@ def skims_for_logsums(tour_purpose, model_settings, trace_label): def _compute_logsums( - alt_tdd, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label + state: workflow.State, + alt_tdd, + tours_merged, + tour_purpose, + model_settings, + network_los, + skims, + trace_label, ): """ compute logsums for tours using skims for alt_tdd out_period and in_period @@ -100,8 +108,10 @@ def _compute_logsums( trace_label = tracing.extend_trace_label(trace_label, "logsums") - with chunk.chunk_log(trace_label): - logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + with chunk.chunk_log(state, trace_label): + logsum_settings = state.filesystem.read_model_settings( + model_settings["LOGSUM_SETTINGS"] + ) choosers = alt_tdd.join(tours_merged, how="left", rsuffix="_chooser") logger.info( f"{trace_label} compute_logsums for {choosers.shape[0]} choosers {alt_tdd.shape[0]} alts" @@ -121,7 +131,9 @@ def _compute_logsums( locals_dict.update(skims) # constrained coefficients can appear in expressions - coefficients = simulate.get_segment_coefficients(logsum_settings, tour_purpose) + coefficients = state.filesystem.get_segment_coefficients( + logsum_settings, tour_purpose + ) locals_dict.update(coefficients) # - run preprocessor to annotate choosers @@ -130,10 +142,10 @@ def _compute_logsums( preprocessor_settings = logsum_settings[preprocessor] if preprocessor_settings: - simulate.set_skim_wrapper_targets(choosers, skims) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -141,9 +153,11 @@ def _compute_logsums( ) # - compute logsums - logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) + logsum_spec = state.filesystem.read_model_spec( + file_name=logsum_settings["SPEC"] + ) logsum_spec = simulate.eval_coefficients( - logsum_spec, coefficients, estimator=None + state, logsum_spec, coefficients, 
estimator=None ) nest_spec = config.get_logit_model_settings(logsum_settings) @@ -152,6 +166,7 @@ def _compute_logsums( ) logsums = simulate.simple_simulate_logsums( + state, choosers, logsum_spec, nest_spec, @@ -164,22 +179,20 @@ def _compute_logsums( return logsums -def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): - - tdd_segments = inject.get_injectable("tdd_alt_segments", None) +def dedupe_alt_tdd(state: workflow.State, alt_tdd, tour_purpose, trace_label): + tdd_segments = state.get_injectable("tdd_alt_segments", None) alt_tdd_periods = None logger.info("tdd_alt_segments specified for representative logsums") - with chunk.chunk_log(tracing.extend_trace_label(trace_label, "dedupe_alt_tdd")): - + with chunk.chunk_log( + state, tracing.extend_trace_label(trace_label, "dedupe_alt_tdd") + ) as chunk_sizer: if tdd_segments is not None: - dedupe_columns = ["out_period", "in_period"] # tdd_alt_segments is optionally segmented by tour purpose if "tour_purpose" in tdd_segments: - is_tdd_for_tour_purpose = tdd_segments.tour_purpose == tour_purpose if not is_tdd_for_tour_purpose.any(): is_tdd_for_tour_purpose = tdd_segments.tour_purpose.isnull() @@ -203,7 +216,7 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): how="left", on="out_period", ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) # left join representative end on in_period alt_tdd_periods = pd.merge( @@ -214,7 +227,7 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): how="left", on=["in_period"], ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) if tdd_segments.start.isnull().any(): missing_periods = tdd_segments.out_period[ @@ -239,13 +252,13 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): alt_tdd_periods = alt_tdd_periods.drop_duplicates().set_index( alt_tdd.index.name ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) # representative duration alt_tdd_periods["duration"] = ( alt_tdd_periods["end"] - alt_tdd_periods["start"] ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) logger.debug( f"{trace_label} " @@ -256,7 +269,6 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): # if there is no tdd_alt_segments file, we can at least dedupe on 'out_period', 'in_period', 'duration' if alt_tdd_periods is None: - # FIXME This won't work if they reference start or end in logsum calculations # for MTC only duration is used (to calculate all_day parking cost) dedupe_columns = ["out_period", "in_period", "duration"] @@ -274,7 +286,7 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): .drop_duplicates() .set_index(alt_tdd.index.name) ) - chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) + chunk_sizer.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) logger.debug( f"{trace_label} " @@ -286,8 +298,16 @@ def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): return alt_tdd_periods, dedupe_columns -def compute_logsums( - alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label +def compute_tour_scheduling_logsums( + state: workflow.State, + alt_tdd, + tours_merged, + tour_purpose, + model_settings, + skims, + trace_label, + *, + chunk_sizer: chunk.ChunkSizer, ): """ Compute logsums for the tour alt_tdds, which will differ based on their 
different start, stop @@ -302,7 +322,7 @@ def compute_logsums( """ trace_label = tracing.extend_trace_label(trace_label, "compute_logsums") - network_los = inject.get_injectable("network_los") + network_los = state.get_injectable("network_los") # - in_period and out_period assert "out_period" not in alt_tdd @@ -317,13 +337,13 @@ def compute_logsums( alt_tdd["duration"] = alt_tdd["end"] - alt_tdd["start"] # outside chunk_log context because we extend log_df call for alt_tdd made by our only caller _schedule_tours - chunk.log_df(trace_label, "alt_tdd", alt_tdd) - - with chunk.chunk_log(trace_label): + chunk_sizer.log_df(trace_label, "alt_tdd", alt_tdd) + with chunk.chunk_log(state, trace_label) as chunk_sizer: if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS: # compute logsums for all the tour alt_tdds (inefficient) logsums = _compute_logsums( + state, alt_tdd, tours_merged, tour_purpose, @@ -336,9 +356,9 @@ def compute_logsums( index_name = alt_tdd.index.name deduped_alt_tdds, redupe_columns = dedupe_alt_tdd( - alt_tdd, tour_purpose, trace_label + state, alt_tdd, tour_purpose, trace_label ) - chunk.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) + chunk_sizer.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) logger.info( f"{trace_label} compute_logsums " @@ -351,6 +371,7 @@ def compute_logsums( # - compute logsums for the alt_tdd_periods deduped_alt_tdds["logsums"] = _compute_logsums( + state, deduped_alt_tdds, tours_merged, tour_purpose, @@ -373,16 +394,17 @@ def compute_logsums( .set_index(index_name) .logsums ) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) del deduped_alt_tdds - chunk.log_df(trace_label, "deduped_alt_tdds", None) + chunk_sizer.log_df(trace_label, "deduped_alt_tdds", None) # this is really expensive TRACE = False if TRACE: trace_logsums_df = logsums.to_frame("representative_logsum") trace_logsums_df["brute_force_logsum"] = _compute_logsums( + state, alt_tdd, tours_merged, tour_purpose, @@ -391,14 +413,14 @@ def compute_logsums( skims, trace_label, ) - tracing.trace_df( + state.tracing.trace_df( trace_logsums_df, label=tracing.extend_trace_label(trace_label, "representative_logsums"), slicer="NONE", transpose=False, ) - # leave it to our caller to pick up logsums with call to chunk.log_df + # leave it to our caller to pick up logsums with call to chunk_sizer.log_df return logsums @@ -444,7 +466,13 @@ def get_previous_tour_by_tourid( def tdd_interaction_dataset( - tours, alts, timetable, choice_column, window_id_col, trace_label + state: workflow.State, + tours, + alts, + timetable, + choice_column, + window_id_col, + trace_label, ): """ interaction_sample_simulate expects @@ -473,13 +501,13 @@ def tdd_interaction_dataset( trace_label = tracing.extend_trace_label(trace_label, "tdd_interaction_dataset") - with chunk.chunk_log(trace_label): + with chunk.chunk_log(state, trace_label) as chunk_sizer: alts_ids = np.tile(alts.index, len(tours.index)) - chunk.log_df(trace_label, "alts_ids", alts_ids) + chunk_sizer.log_df(trace_label, "alts_ids", alts_ids) tour_ids = np.repeat(tours.index, len(alts.index)) window_row_ids = np.repeat(tours[window_id_col], len(alts.index)) - chunk.log_df(trace_label, "window_row_ids", window_row_ids) + chunk_sizer.log_df(trace_label, "window_row_ids", window_row_ids) alt_tdd = alts.take(alts_ids) @@ -502,20 +530,20 @@ def tdd_interaction_dataset( available = timetable.tour_available(window_row_ids, alts_ids) del window_row_ids - chunk.log_df(trace_label, "window_row_ids", None) + 
chunk_sizer.log_df(trace_label, "window_row_ids", None) logger.debug( f"tdd_interaction_dataset keeping {available.sum()} of ({len(available)}) available alt_tdds" ) assert available.any() - chunk.log_df( + chunk_sizer.log_df( trace_label, "alt_tdd_", alt_tdd_ ) # catch this before we slice on available alt_tdd = alt_tdd_.isel({dimname: available}).to_dataframe() - chunk.log_df(trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(trace_label, "alt_tdd", alt_tdd) # FIXME - don't need this any more after slicing # del alt_tdd[window_id_col] @@ -523,7 +551,9 @@ def tdd_interaction_dataset( return alt_tdd -def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_label): +def run_alts_preprocessor( + state: workflow.State, model_settings, alts, segment, locals_dict, trace_label +): """ run preprocessor on alts, as specified by ALTS_PREPROCESSOR in model_settings @@ -567,13 +597,13 @@ def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_labe preprocessor_settings = None if preprocessor_settings: - logger.debug( f"run_alts_preprocessor calling assign_columns for {segment} preprocessor_settings" ) alts = alts.copy() expressions.assign_columns( + state, df=alts, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -584,6 +614,7 @@ def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_labe def _schedule_tours( + state: workflow.State, tours, persons_merged, alts, @@ -598,6 +629,8 @@ def _schedule_tours( estimator, tour_trace_label, sharrow_skip=False, + *, + chunk_sizer: chunk.ChunkSizer, ): """ previous_tour stores values used to add columns that can be used in the spec @@ -656,7 +689,7 @@ def _schedule_tours( right_index=True, suffixes=("", "_y"), ) - chunk.log_df(tour_trace_label, "tours", tours) + chunk_sizer.log_df(tour_trace_label, "tours", tours) # - add explicit window_id_col for timetable owner if it is index # if no timetable window_id_col specified, then add index as an explicit column @@ -673,33 +706,40 @@ def _schedule_tours( # indexed (not unique) on tour_id choice_column = TDD_CHOICE_COLUMN alt_tdd = tdd_interaction_dataset( - tours, alts, timetable, choice_column, window_id_col, tour_trace_label + state, tours, alts, timetable, choice_column, window_id_col, tour_trace_label ) # print(f"tours {tours.shape} alts {alts.shape}") - chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - add logsums if logsum_tour_purpose: - logsums = compute_logsums( - alt_tdd, tours, logsum_tour_purpose, model_settings, skims, tour_trace_label + logsums = compute_tour_scheduling_logsums( + state, + alt_tdd, + tours, + logsum_tour_purpose, + model_settings, + skims, + tour_trace_label, + chunk_sizer=chunk_sizer, ) else: logsums = 0 alt_tdd["mode_choice_logsum"] = logsums del logsums - chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - merge in previous tour columns # adds start_previous and end_previous, joins on index tours = tours.join( get_previous_tour_by_tourid(tours[tour_owner_id_col], previous_tour, alts) ) - chunk.log_df(tour_trace_label, "tours", tours) + chunk_sizer.log_df(tour_trace_label, "tours", tours) # - make choices - locals_d = {"tt": timetable} + locals_d = {"tt": timetable.attach_state(state)} constants = config.get_model_constants(model_settings) if constants is not None: locals_d.update(constants) @@ -721,9 +761,9 @@ def _schedule_tours( logsum_tour_purpose # FIXME this is not always 
right - see note above ) alt_tdd = run_alts_preprocessor( - model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label + state, model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label ) - chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) + chunk_sizer.log_df(tour_trace_label, "alt_tdd", alt_tdd) if estimator: # write choosers after annotation @@ -731,9 +771,10 @@ def _schedule_tours( estimator.set_alt_id(choice_column) estimator.write_interaction_sample_alternatives(alt_tdd) - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers choices = interaction_sample_simulate( + state, tours, alt_tdd, spec, @@ -744,7 +785,7 @@ def _schedule_tours( trace_label=tour_trace_label, estimator=estimator, ) - chunk.log_df(tour_trace_label, "choices", choices) + chunk_sizer.log_df(tour_trace_label, "choices", choices) # - update previous_tour and timetable parameters @@ -758,6 +799,7 @@ def _schedule_tours( def schedule_tours( + state: workflow.State, tours, persons_merged, alts, @@ -800,16 +842,23 @@ def schedule_tours( if "LOGSUM_SETTINGS" in model_settings: # we need skims to calculate tvpb skim overhead in 3_ZONE systems for use by calc_rows_per_chunk - skims = skims_for_logsums(logsum_tour_purpose, model_settings, tour_trace_label) + skims = skims_for_logsums( + state, logsum_tour_purpose, model_settings, tour_trace_label + ) else: skims = None result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - tours, chunk_size, tour_trace_label, tour_chunk_tag + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( + state, tours, tour_trace_label, tour_chunk_tag ): - choices = _schedule_tours( + state, chooser_chunk, persons_merged, alts, @@ -824,11 +873,12 @@ def schedule_tours( estimator, tour_trace_label=chunk_trace_label, sharrow_skip=sharrow_skip, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(tour_trace_label, "result_list", result_list) + chunk_sizer.log_df(tour_trace_label, "result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -842,6 +892,7 @@ def schedule_tours( def vectorize_tour_scheduling( + state: workflow.State, tours, persons_merged, alts, @@ -909,7 +960,7 @@ def vectorize_tour_scheduling( timetable_window_id_col = "person_id" tour_owner_id_col = "person_id" - compute_logsums = "LOGSUM_SETTINGS" in model_settings + should_compute_logsums = "LOGSUM_SETTINGS" in model_settings assert isinstance(tour_segments, dict) @@ -919,16 +970,13 @@ def vectorize_tour_scheduling( # segregate scheduling by tour_type if multiple specs passed in dict keyed by tour_type for tour_num, nth_tours in tours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") tour_chunk_tag = tracing.extend_trace_label( trace_label, f"tour_{1 if tour_num == 1 else 'n'}" ) if tour_segment_col is not None: - for tour_segment_name, tour_segment_info in tour_segments.items(): - segment_trace_label = tracing.extend_trace_label( tour_trace_label, tour_segment_name ) @@ -939,7 +987,9 @@ def vectorize_tour_scheduling( # assume segmentation of spec and coefficients are aligned spec_segment_name = tour_segment_info.get("spec_segment_name") # assume logsum segmentation is same as tours - logsum_tour_purpose = tour_segment_name if compute_logsums else None + logsum_tour_purpose = ( + tour_segment_name if should_compute_logsums else None + ) 
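Note: the hunks above move `schedule_tours` onto the state-aware chunking API: `chunk.adaptive_chunked_choosers` now takes `state` instead of an explicit `chunk_size`, yields a per-chunk `ChunkSizer`, and memory logging goes through `chunk_sizer.log_df` rather than the module-level `chunk.log_df`. A minimal sketch of that loop shape, with a hypothetical helper name and a stand-in for the real per-chunk model call:

    import pandas as pd

    from activitysim.core import chunk, workflow


    def choose_in_chunks(
        state: workflow.State, choosers: pd.DataFrame, trace_label: str
    ) -> pd.Series:
        result_list = []
        # yields (index, chunk of choosers, chunk trace label, ChunkSizer);
        # the last positional argument is the chunk tag, reused here for simplicity
        for i, chooser_chunk, chunk_trace_label, chunk_sizer in chunk.adaptive_chunked_choosers(
            state, choosers, trace_label, trace_label
        ):
            choices = pd.Series(0, index=chooser_chunk.index)  # stand-in for a model call
            chunk_sizer.log_df(chunk_trace_label, "choices", choices)
            result_list.append(choices)
            chunk_sizer.log_df(trace_label, "result_list", result_list)
        return pd.concat(result_list)
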
nth_tours_in_segment = nth_tours[ nth_tours[tour_segment_col] == tour_segment_name @@ -951,6 +1001,7 @@ def vectorize_tour_scheduling( if RUN_ALTS_PREPROCESSOR_BEFORE_MERGE: locals_dict = {} alts = run_alts_preprocessor( + state, model_settings, alts, spec_segment_name, @@ -959,6 +1010,7 @@ def vectorize_tour_scheduling( ) choices = schedule_tours( + state, nth_tours_in_segment, persons_merged, alts, @@ -979,16 +1031,16 @@ def vectorize_tour_scheduling( choice_list.append(choices) else: - # MTC non_mandatory_tours are not segmented by tour_purpose and do not require logsums # FIXME should support logsums? assert ( - not compute_logsums + not should_compute_logsums ), "logsums for unsegmented spec not implemented because not currently needed" assert tour_segments.get("spec_segment_name") is None choices = schedule_tours( + state, nth_tours, persons_merged, alts, @@ -1013,6 +1065,7 @@ def vectorize_tour_scheduling( def vectorize_subtour_scheduling( + state: workflow.State, parent_tours, subtours, persons_merged, @@ -1097,7 +1150,6 @@ def vectorize_subtour_scheduling( # this ought to have been ensured when tours are created (tour_frequency.process_tours) for tour_num, nth_tours in subtours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") tour_chunk_tag = tracing.extend_trace_label( trace_label, f"tour_{1 if tour_num == 1 else 'n'}" @@ -1107,6 +1159,7 @@ def vectorize_subtour_scheduling( assert not nth_tours.parent_tour_id.duplicated().any() choices = schedule_tours( + state, nth_tours, persons_merged, alts, @@ -1118,7 +1171,7 @@ def vectorize_subtour_scheduling( previous_tour_by_parent_tour_id, tour_owner_id_col, estimator, - chunk_size, + state.settings.chunk_size, tour_trace_label, tour_chunk_tag, sharrow_skip=sharrow_skip, @@ -1144,7 +1197,6 @@ def vectorize_subtour_scheduling( def build_joint_tour_timetables( joint_tours, joint_tour_participants, persons_timetable, alts ): - # timetable with a window for each joint tour joint_tour_windows_df = tt.create_timetable_windows(joint_tours, alts) joint_tour_timetable = tt.TimeTable(joint_tour_windows_df, alts) @@ -1152,7 +1204,6 @@ def build_joint_tour_timetables( for participant_num, nth_participants in joint_tour_participants.groupby( "participant_num", sort=True ): - # nth_participant windows from persons_timetable participant_windows = persons_timetable.slice_windows_by_row_id( nth_participants.person_id @@ -1167,6 +1218,7 @@ def build_joint_tour_timetables( def vectorize_joint_tour_scheduling( + state: workflow.State, joint_tours, joint_tour_participants, persons_merged, @@ -1238,7 +1290,6 @@ def vectorize_joint_tour_scheduling( # persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id) for tour_num, nth_tours in joint_tours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") tour_chunk_tag = tracing.extend_trace_label( trace_label, f"tour_{1 if tour_num == 1 else 'n'}" @@ -1256,6 +1307,7 @@ def vectorize_joint_tour_scheduling( ) choices = schedule_tours( + state, nth_tours, persons_merged, alts, diff --git a/activitysim/abm/models/vehicle_allocation.py b/activitysim/abm/models/vehicle_allocation.py index e68abc386d..372ed464d7 100644 --- a/activitysim/abm/models/vehicle_allocation.py +++ b/activitysim/abm/models/vehicle_allocation.py @@ -1,34 +1,25 @@ # ActivitySim # See full license in LICENSE.txt. 
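Note: in the scheduling wrappers above, run-time options that used to arrive as injected arguments or via `config.setting()` are now read from `state.settings` (for example `state.settings.chunk_size` and `state.settings.log_alt_losers`). A small sketch of that access pattern, with a hypothetical helper name:

    from activitysim.core import workflow


    def runtime_options(state: workflow.State) -> dict:
        return {
            "chunk_size": state.settings.chunk_size,          # was an injected chunk_size argument
            "log_alt_losers": state.settings.log_alt_losers,  # was config.setting("log_alt_losers", False)
            "trace_hh_id": state.settings.trace_hh_id,        # was an injected trace_hh_id argument
        }
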
+from __future__ import annotations -import itertools import logging -import os -import numpy as np import pandas as pd from activitysim.core import ( - assign, config, + estimation, expressions, - inject, - logit, los, - pipeline, simulate, tracing, + workflow, ) -from activitysim.core.interaction_simulate import interaction_simulate -from activitysim.core.util import assign_in_place - -from .util import estimation -from .util.mode import mode_choice_simulate logger = logging.getLogger(__name__) -def annotate_vehicle_allocation(model_settings, trace_label): +def annotate_vehicle_allocation(state: workflow.State, model_settings, trace_label): """ Add columns to the tours table in the pipeline according to spec. @@ -37,13 +28,14 @@ def annotate_vehicle_allocation(model_settings, trace_label): model_settings : dict trace_label : str """ - tours = inject.get_table("tours").to_frame() + tours = state.get_dataframe("tours") expressions.assign_columns( + state, df=tours, model_settings=model_settings.get("annotate_tours"), trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), ) - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) def get_skim_dict(network_los, choosers): @@ -89,17 +81,16 @@ def get_skim_dict(network_los, choosers): return skims -@inject.step() +@workflow.step def vehicle_allocation( - persons, - households, - vehicles, - tours, - tours_merged, - network_los, - chunk_size, - trace_hh_id, -): + state: workflow.State, + persons: pd.DataFrame, + households: pd.DataFrame, + vehicles: pd.DataFrame, + tours: pd.DataFrame, + tours_merged: pd.DataFrame, + network_los: los.Network_LOS, +) -> None: """Selects a vehicle for each occupancy level for each tour. Alternatives consist of the up to the number of household vehicles plus one @@ -113,6 +104,7 @@ def vehicle_allocation( Parameters ---------- + state : workflow.State persons : orca.DataFrameWrapper households : orca.DataFrameWrapper vehicles : orca.DataFrameWrapper @@ -120,19 +112,20 @@ def vehicle_allocation( tours : orca.DataFrameWrapper tours_merged : orca.DataFrameWrapper chunk_size : orca.injectable - trace_hh_id : orca.injectable """ trace_label = "vehicle_allocation" model_settings_file_name = "vehicle_allocation.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") - estimator = estimation.manager.begin_estimation("vehicle_allocation") + estimator = estimation.manager.begin_estimation(state, "vehicle_allocation") - model_spec_raw = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec_raw, coefficients_df, estimator) + model_spec_raw = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec_raw, coefficients_df, estimator + ) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) @@ -142,7 +135,7 @@ def vehicle_allocation( locals_dict.update(coefficients_df) # ------ constructing alternatives from model spec and joining to choosers - vehicles_wide = vehicles.to_frame().pivot_table( + vehicles_wide = vehicles.pivot_table( index="household_id", columns="vehicle_num", 
values="vehicle_type", @@ -170,7 +163,7 @@ def vehicle_allocation( vehicles_wide[alts_from_spec[-1]] = "" # merging vehicle alternatives to choosers - choosers = tours_merged.to_frame().reset_index() + choosers = tours_merged.reset_index() choosers = pd.merge(choosers, vehicles_wide, how="left", on="household_id") choosers.set_index("tour_id", inplace=True) @@ -182,6 +175,7 @@ def vehicle_allocation( preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -196,8 +190,6 @@ def vehicle_allocation( estimator.write_coefficients(coefficients_df, model_settings) estimator.write_choosers(choosers) - tours = tours.to_frame() - # ------ running for each occupancy level selected tours_veh_occup_cols = [] for occup in model_settings.get("OCCUPANCY_LEVELS", [1]): @@ -206,12 +198,12 @@ def vehicle_allocation( locals_dict.update({"occup": occup}) choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="vehicle_allocation", estimator=estimator, @@ -243,7 +235,7 @@ def vehicle_allocation( estimator.write_override_choices(choices) estimator.end_estimation() - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) tracing.print_summary( "vehicle_allocation", tours[tours_veh_occup_cols], value_counts=True @@ -251,7 +243,7 @@ def vehicle_allocation( annotate_settings = model_settings.get("annotate_tours", None) if annotate_settings: - annotate_vehicle_allocation(model_settings, trace_label) + annotate_vehicle_allocation(state, model_settings, trace_label) - if trace_hh_id: - tracing.trace_df(tours, label="vehicle_allocation", warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(tours, label="vehicle_allocation", warn_if_empty=True) diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index 324e505dda..c674ed62b7 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -1,33 +1,31 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import itertools import logging import os -import numpy as np import pandas as pd from activitysim.core import ( - assign, config, + estimation, expressions, - inject, logit, - los, - pipeline, simulate, tracing, + workflow, ) from activitysim.core.interaction_simulate import interaction_simulate -from activitysim.core.util import assign_in_place - -from .util import estimation logger = logging.getLogger(__name__) -def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_label): +def append_probabilistic_vehtype_type_choices( + state: workflow.State, choices, model_settings, trace_label +): """ Select a fuel type for the provided body type and age of the vehicle. 
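Note: the `vehicle_allocation` hunks above show the converted step signature: `@workflow.step` replaces `@inject.step()`, the tables arrive as plain `pandas.DataFrame` objects rather than orca wrappers, `chunk_size`/`trace_hh_id` drop out of the argument list, and spec and coefficient reads go through `state.filesystem`. A compact sketch of that shape for a hypothetical step (the step name and settings file are illustrative only):

    import pandas as pd

    from activitysim.core import estimation, simulate, workflow


    @workflow.step
    def example_model(state: workflow.State, persons: pd.DataFrame) -> None:
        model_settings = state.filesystem.read_model_settings("example_model.yaml")
        estimator = estimation.manager.begin_estimation(state, "example_model")

        spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"])
        coefficients_df = state.filesystem.read_model_coefficients(model_settings)
        spec = simulate.eval_coefficients(state, spec, coefficients_df, estimator)

        # ... run the model with the evaluated spec, then write results back
        state.add_table("persons", persons)
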
@@ -37,7 +35,6 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab ---------- choices : pandas.DataFrame selection of {body_type}_{age} to append vehicle type to - probs_spec_file : str trace_label : str Returns @@ -46,7 +43,9 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab table of chosen vehicle types """ probs_spec_file = model_settings.get("PROBS_SPEC", None) - probs_spec = pd.read_csv(config.config_file_path(probs_spec_file), comment="#") + probs_spec = pd.read_csv( + state.filesystem.get_config_file_path(probs_spec_file), comment="#" + ) fleet_year = model_settings.get("FLEET_YEAR") probs_spec["age"] = (1 + fleet_year - probs_spec["vehicle_year"]).astype(int) @@ -75,7 +74,7 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab # make probabilistic choices prob_choices, rands = logit.make_choices( - chooser_probs, trace_label=trace_label, trace_choosers=choosers + state, chooser_probs, trace_label=trace_label, trace_choosers=choosers ) # convert alt choice index to vehicle type attribute @@ -91,7 +90,9 @@ def append_probabilistic_vehtype_type_choices(choices, model_settings, trace_lab return choices -def annotate_vehicle_type_choice_households(model_settings, trace_label): +def annotate_vehicle_type_choice_households( + state: workflow.State, model_settings, trace_label +): """ Add columns to the households table in the pipeline according to spec. @@ -100,16 +101,19 @@ def annotate_vehicle_type_choice_households(model_settings, trace_label): model_settings : dict trace_label : str """ - households = inject.get_table("households").to_frame() + households = state.get_dataframe("households") expressions.assign_columns( + state, df=households, model_settings=model_settings.get("annotate_households"), trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), ) - pipeline.replace_table("households", households) + state.add_table("households", households) -def annotate_vehicle_type_choice_persons(model_settings, trace_label): +def annotate_vehicle_type_choice_persons( + state: workflow.State, model_settings, trace_label +): """ Add columns to the persons table in the pipeline according to spec. @@ -118,16 +122,19 @@ def annotate_vehicle_type_choice_persons(model_settings, trace_label): model_settings : dict trace_label : str """ - persons = inject.get_table("persons").to_frame() + persons = state.get_dataframe("persons") expressions.assign_columns( + state, df=persons, model_settings=model_settings.get("annotate_persons"), trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) -def annotate_vehicle_type_choice_vehicles(model_settings, trace_label): +def annotate_vehicle_type_choice_vehicles( + state: workflow.State, model_settings, trace_label +): """ Add columns to the vehicles table in the pipeline according to spec. 
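Note: `append_probabilistic_vehtype_type_choices` above now passes `state` as the first argument to `logit.make_choices`, in line with the other core helpers where the state travels explicitly instead of being looked up globally. A minimal usage sketch, assuming `chooser_probs` is a DataFrame of per-chooser probabilities with one column per alternative (the wrapper name is hypothetical):

    import pandas as pd

    from activitysim.core import logit, workflow


    def pick_alternatives(
        state: workflow.State,
        chooser_probs: pd.DataFrame,
        choosers: pd.DataFrame,
        trace_label: str,
    ) -> pd.Series:
        prob_choices, rands = logit.make_choices(
            state, chooser_probs, trace_label=trace_label, trace_choosers=choosers
        )
        return prob_choices
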
@@ -136,13 +143,14 @@ def annotate_vehicle_type_choice_vehicles(model_settings, trace_label): model_settings : dict trace_label : str """ - vehicles = inject.get_table("vehicles").to_frame() + vehicles = state.get_dataframe("vehicles") expressions.assign_columns( + state, df=vehicles, model_settings=model_settings.get("annotate_vehicles"), trace_label=tracing.extend_trace_label(trace_label, "annotate_vehicles"), ) - pipeline.replace_table("vehicles", vehicles) + state.add_table("vehicles", vehicles) def get_combinatorial_vehicle_alternatives(alts_cats_dict): @@ -155,7 +163,6 @@ def get_combinatorial_vehicle_alternatives(alts_cats_dict): Parameters ---------- alts_cats_dict : dict - model_settings : dict Returns ------- @@ -172,7 +179,9 @@ def get_combinatorial_vehicle_alternatives(alts_cats_dict): return alts_wide, alts_long -def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_data): +def construct_model_alternatives( + state: workflow.State, model_settings, alts_cats_dict, vehicle_type_data +): """ Construct the table of vehicle type alternatives. @@ -180,6 +189,7 @@ def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_da Parameters ---------- + state : workflow.State model_settings : dict alts_cats_dict : dict nested dictionary of vehicle body, age, and fuel options @@ -200,7 +210,6 @@ def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_da # merge vehicle type data to alternatives if data is provided if (vehicle_type_data is not None) and (probs_spec_file is None): - alts_wide = pd.merge( alts_wide, vehicle_type_data, @@ -228,7 +237,7 @@ def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_da alts_wide["age"] = alts_wide["age"].astype(int) # store alts in primary configs dir for inspection - configs_dirs = inject.get_injectable("configs_dir") + configs_dirs = state.filesystem.get_configs_dir() configs_dirs = configs_dirs if isinstance(configs_dirs, list) else [configs_dirs] if model_settings.get("WRITE_OUT_ALTS_FILE", False): @@ -239,7 +248,9 @@ def construct_model_alternatives(model_settings, alts_cats_dict, vehicle_type_da return alts_wide, alts_long -def get_vehicle_type_data(model_settings, vehicle_type_data_file): +def get_vehicle_type_data( + state: workflow.State, model_settings, vehicle_type_data_file +): """ Read in the vehicle type data and computes the vehicle age. 
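Note: the `annotate_vehicle_type_choice_*` functions above all follow the same state-based round trip: pull the table with `state.get_dataframe`, run the annotation preprocessor through `expressions.assign_columns(state, ...)`, and write it back with `state.add_table`. A generic sketch of that pattern (the helper itself is hypothetical, not part of this patch):

    from activitysim.core import expressions, tracing, workflow


    def annotate_table(
        state: workflow.State, table_name: str, model_settings: dict, trace_label: str
    ) -> None:
        df = state.get_dataframe(table_name)
        expressions.assign_columns(
            state,
            df=df,
            model_settings=model_settings.get(f"annotate_{table_name}"),
            trace_label=tracing.extend_trace_label(trace_label, f"annotate_{table_name}"),
        )
        state.add_table(table_name, df)
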
@@ -255,7 +266,7 @@ def get_vehicle_type_data(model_settings, vehicle_type_data_file): table of vehicle type data with required body_type, age, and fuel_type columns """ vehicle_type_data = pd.read_csv( - config.config_file_path(vehicle_type_data_file), comment="#" + state.filesystem.get_config_file_path(vehicle_type_data_file), comment="#" ) fleet_year = model_settings.get("FLEET_YEAR") @@ -272,7 +283,8 @@ def get_vehicle_type_data(model_settings, vehicle_type_data_file): def iterate_vehicle_type_choice( - vehicles_merged, + state: workflow.State, + vehicles_merged: pd.DataFrame, model_settings, model_spec, locals_dict, @@ -295,7 +307,7 @@ def iterate_vehicle_type_choice( Parameters ---------- - vehicles_merged : orca.DataFrameWrapper + vehicles_merged : DataFrame vehicle list owned by each household merged with households table model_settings : dict yaml model settings file as dict @@ -323,20 +335,23 @@ def iterate_vehicle_type_choice( # adding vehicle type data to be available to locals_dict regardless of option if vehicle_type_data_file: vehicle_type_data = get_vehicle_type_data( - model_settings, vehicle_type_data_file + state, model_settings, vehicle_type_data_file ) locals_dict.update({"vehicle_type_data": vehicle_type_data}) + else: + vehicle_type_data = None # - Preparing alternatives # create alts on-the-fly as cartesian product of categorical values if alts_cats_dict: # do not include fuel types as alternatives if probability file is supplied alts_wide, alts_long = construct_model_alternatives( - model_settings, alts_cats_dict, vehicle_type_data + state, model_settings, alts_cats_dict, vehicle_type_data ) + else: + alts_wide = alts_long = None # - preparing choosers for iterating - vehicles_merged = vehicles_merged.to_frame() vehicles_merged["already_owned_veh"] = "" logger.info("Running %s with %d vehicles", trace_label, len(vehicles_merged)) all_choosers = [] @@ -353,6 +368,7 @@ def iterate_vehicle_type_choice( preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -376,7 +392,7 @@ def iterate_vehicle_type_choice( simulation_type == "simple_simulate" ), "SIMULATION_TYPE needs to be interaction_simulate or simple_simulate" - log_alt_losers = config.setting("log_alt_losers", False) + log_alt_losers = state.settings.log_alt_losers if simulation_type == "interaction_simulate": assert ( @@ -384,6 +400,7 @@ def iterate_vehicle_type_choice( ), "Need to supply combinatorial_alts in yaml" choices = interaction_simulate( + state, choosers=choosers, alternatives=alts_wide, spec=model_spec, @@ -399,16 +416,18 @@ def iterate_vehicle_type_choice( # each alternative as a distinct column in the .csv elif simulation_type == "simple_simulate": choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, log_alt_losers=log_alt_losers, nest_spec=nest_spec, locals_d=locals_dict, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="vehicle_type", estimator=estimator, ) + else: + raise NotImplementedError(simulation_type) if isinstance(choices, pd.Series): choices = choices.to_frame("choice") @@ -428,7 +447,7 @@ def iterate_vehicle_type_choice( # STEP II: append probabilistic vehicle type attributes if probs_spec_file is not None: choices = append_probabilistic_vehtype_type_choices( - choices, model_settings, trace_label + state, choices, model_settings, trace_label ) vehicles_merged.loc[choices.index, 
"already_owned_veh"] = choices[ @@ -454,10 +473,14 @@ def iterate_vehicle_type_choice( return all_choices, all_choosers -@inject.step() +@workflow.step def vehicle_type_choice( - persons, households, vehicles, vehicles_merged, chunk_size, trace_hh_id -): + state: workflow.State, + persons: pd.DataFrame, + households: pd.DataFrame, + vehicles: pd.DataFrame, + vehicles_merged: pd.DataFrame, +) -> None: """Assign a vehicle type to each vehicle in the `vehicles` table. If a "SIMULATION_TYPE" is set to simple_simulate in the @@ -496,19 +519,19 @@ def vehicle_type_choice( persons : orca.DataFrameWrapper households : orca.DataFrameWrapper vehicles : orca.DataFrameWrapper - vehicles_merged : orca.DataFrameWrapper - chunk_size : orca.injectable - trace_hh_id : orca.injectable + vehicles_merged : DataFrame """ trace_label = "vehicle_type_choice" model_settings_file_name = "vehicle_type_choice.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation("vehicle_type") + estimator = estimation.manager.begin_estimation(state, "vehicle_type") - model_spec_raw = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) - model_spec = simulate.eval_coefficients(model_spec_raw, coefficients_df, estimator) + model_spec_raw = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + model_spec = simulate.eval_coefficients( + state, model_spec_raw, coefficients_df, estimator + ) constants = config.get_model_constants(model_settings) @@ -517,12 +540,13 @@ def vehicle_type_choice( locals_dict.update(coefficients_df) choices, choosers = iterate_vehicle_type_choice( + state, vehicles_merged, model_settings, model_spec, locals_dict, estimator, - chunk_size, + state.settings.chunk_size, trace_label, ) @@ -553,21 +577,22 @@ def vehicle_type_choice( estimator.end_estimation() # update vehicles table - # vehicles = pd.merge(vehicles.to_frame(), choices, left_index=True, right_index=True) - vehicles = pd.concat([vehicles.to_frame(), choices], axis=1) - pipeline.replace_table("vehicles", vehicles) + vehicles = pd.concat([vehicles, choices], axis=1) + state.add_table("vehicles", vehicles) # - annotate tables if model_settings.get("annotate_households"): - annotate_vehicle_type_choice_households(model_settings, trace_label) + annotate_vehicle_type_choice_households(state, model_settings, trace_label) if model_settings.get("annotate_persons"): - annotate_vehicle_type_choice_persons(model_settings, trace_label) + annotate_vehicle_type_choice_persons(state, model_settings, trace_label) if model_settings.get("annotate_vehicles"): - annotate_vehicle_type_choice_vehicles(model_settings, trace_label) + annotate_vehicle_type_choice_vehicles(state, model_settings, trace_label) tracing.print_summary( "vehicle_type_choice", vehicles.vehicle_type, value_counts=True ) - if trace_hh_id: - tracing.trace_df(vehicles, label="vehicle_type_choice", warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df( + vehicles, label="vehicle_type_choice", warn_if_empty=True + ) diff --git a/activitysim/abm/models/work_from_home.py b/activitysim/abm/models/work_from_home.py index 0432640002..1b9bc9049b 100755 --- a/activitysim/abm/models/work_from_home.py +++ b/activitysim/abm/models/work_from_home.py @@ -1,17 +1,30 @@ # 
ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np +import pandas as pd -from activitysim.abm.models.util import estimation -from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core import ( + config, + estimation, + expressions, + simulate, + tracing, + workflow, +) logger = logging.getLogger("activitysim") -@inject.step() -def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): +@workflow.step +def work_from_home( + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, +) -> None: """ This model predicts whether a person (worker) works from home. The output from this model is TRUE (if works from home) or FALSE (works away from home). @@ -22,15 +35,15 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): trace_label = "work_from_home" model_settings_file_name = "work_from_home.yaml" - choosers = persons_merged.to_frame() - model_settings = config.read_model_settings(model_settings_file_name) + choosers = persons_merged + model_settings = state.filesystem.read_model_settings(model_settings_file_name) chooser_filter_column_name = model_settings.get( "CHOOSER_FILTER_COLUMN_NAME", "is_worker" ) choosers = choosers[choosers[chooser_filter_column_name]] logger.info("Running %s with %d persons", trace_label, len(choosers)) - estimator = estimation.manager.begin_estimation("work_from_home") + estimator = estimation.manager.begin_estimation(state, "work_from_home") constants = config.get_model_constants(model_settings) work_from_home_alt = model_settings["WORK_FROM_HOME_ALT"] @@ -38,20 +51,20 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): # - preprocessor preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = {} if constants is not None: locals_d.update(constants) expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, trace_label=trace_label, ) - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - coefficients_df = simulate.read_model_coefficients(model_settings) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) nest_spec = config.get_logit_model_settings(model_settings) @@ -77,7 +90,6 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): ) for iteration in range(iterations): - logger.info( "Running %s with %d persons iteration %d", trace_label, @@ -86,15 +98,20 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): ) # re-read spec to reset substitution - model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) - model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + model_spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) + model_spec = simulate.eval_coefficients( + state, model_spec, coefficients_df, estimator + ) + + if model_settings.get("sharrow_skip", False): + constants["disable_sharrow"] = True choices = simulate.simple_simulate( + state, choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="work_from_home", estimator=estimator, @@ -152,7 +169,6 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): estimator.write_override_choices(choices) 
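Note: in `work_from_home` above, the per-model sharrow opt-out is now expressed by setting `disable_sharrow` in the locals passed to `simulate.simple_simulate`, and the explicit `chunk_size` keyword is gone (chunking is governed by `state.settings`). A small sketch of that call shape, with a hypothetical wrapper name:

    from activitysim.core import simulate, workflow


    def run_simple_choice(
        state: workflow.State, choosers, model_spec, nest_spec, constants, estimator, trace_label
    ):
        locals_d = dict(constants) if constants is not None else {}
        locals_d["disable_sharrow"] = True  # equivalent of `sharrow_skip: true` in the model yaml
        return simulate.simple_simulate(
            state,
            choosers=choosers,
            spec=model_spec,
            nest_spec=nest_spec,
            locals_d=locals_d,
            trace_label=trace_label,
            trace_choice_name=trace_label,
            estimator=estimator,
        )
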
estimator.end_estimation() - persons = persons.to_frame() persons["work_from_home"] = choices.reindex(persons.index).fillna(0).astype(bool) persons["is_out_of_home_worker"] = ( persons[chooser_filter_column_name] & ~persons["work_from_home"] @@ -169,9 +185,9 @@ def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): persons.work_from_home == True, -1, persons[dest_choice_column_name] ) - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary("work_from_home", persons.work_from_home, value_counts=True) - if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/tables/accessibility.py b/activitysim/abm/tables/accessibility.py index 6869da736c..b37904c16d 100644 --- a/activitysim/abm/tables/accessibility.py +++ b/activitysim/abm/tables/accessibility.py @@ -1,17 +1,19 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import pandas as pd -from activitysim.core import inject +from activitysim.core import workflow from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) -@inject.table() -def accessibility(land_use): +@workflow.table +def accessibility(state: workflow.State): """ If 'accessibility' is in input_tables list, then read it in, otherwise create skeleton table with same index as landuse. @@ -23,7 +25,8 @@ def accessibility(land_use): otherwise it will simply be replaced when accessibility model is run """ - accessibility_df = read_input_table("accessibility", required=False) + land_use = state.get_dataframe("land_use") + accessibility_df = read_input_table(state, "accessibility", required=False) if accessibility_df is None: accessibility_df = pd.DataFrame(index=land_use.index) @@ -33,24 +36,22 @@ def accessibility(land_use): else: try: assert accessibility_df.sort_index().index.equals( - land_use.to_frame().sort_index().index + land_use.sort_index().index ), f"loaded accessibility table index does not match index of land_use table" except AssertionError: - land_use_index = land_use.to_frame().index - if f"_original_{land_use_index.name}" in land_use.to_frame(): - land_use_zone_ids = land_use.to_frame()[ - f"_original_{land_use_index.name}" - ] + land_use_index = land_use.index + if f"_original_{land_use_index.name}" in land_use: + land_use_zone_ids = land_use[f"_original_{land_use_index.name}"] remapper = dict(zip(land_use_zone_ids, land_use_zone_ids.index)) accessibility_df.index = accessibility_df.index.map(remapper.get) assert accessibility_df.sort_index().index.equals( - land_use.to_frame().sort_index().index + land_use.sort_index().index ), f"loaded accessibility table index does not match index of land_use table" else: raise logger.info("loaded land_use %s" % (accessibility_df.shape,)) # replace table function with dataframe - inject.add_table("accessibility", accessibility_df) + state.add_table("accessibility", accessibility_df) return accessibility_df diff --git a/activitysim/abm/tables/disaggregate_accessibility.py b/activitysim/abm/tables/disaggregate_accessibility.py index 4c4eb9ad40..7858245f2b 100644 --- a/activitysim/abm/tables/disaggregate_accessibility.py +++ b/activitysim/abm/tables/disaggregate_accessibility.py @@ -1,19 +1,23 @@ # ActivitySim # See full license in LICENSE.txt. 
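Note: the `accessibility` table above illustrates the `@workflow.table` provider pattern: read the optional input table with `read_input_table(state, ..., required=False)`, fall back to a skeleton keyed on the `land_use` index, and register the result with `state.add_table`. A minimal sketch for a hypothetical zone-level table:

    import pandas as pd

    from activitysim.core import workflow
    from activitysim.core.input import read_input_table


    @workflow.table
    def example_zone_data(state: workflow.State) -> pd.DataFrame:
        df = read_input_table(state, "example_zone_data", required=False)
        if df is None:
            # skeleton with the same index as land_use, to be filled in later
            df = pd.DataFrame(index=state.get_dataframe("land_use").index)
        state.add_table("example_zone_data", df)
        return df
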
+from __future__ import annotations + import logging -import os +from typing import Any import numpy as np import pandas as pd import pandas.api.types as ptypes - from sklearn.naive_bayes import CategoricalNB -from activitysim.core import inject, config, pipeline, util, input + +from activitysim.core import input, util, workflow logger = logging.getLogger(__name__) -def find_nearest_accessibility_zone(choosers, accessibility_df, method="skims"): +def find_nearest_accessibility_zone( + state: workflow.State, choosers, accessibility_df, method="skims" +): """ Matches choosers zone to the nearest accessibility zones. Can be achieved by querying the skims or by nearest neighbor of centroids @@ -47,7 +51,7 @@ def nearest_node(oz, zones_df): if method == "centroids": # Extract and vectorize TAZ centroids - centroids = inject.get_table("maz_centroids").to_frame() + centroids = state.get_dataframe("maz_centroids") # TODO.NF This is a bit hacky, needs some work for variable zone names if "TAZ" in centroids.columns: @@ -65,7 +69,7 @@ def nearest_node(oz, zones_df): nearest = [nearest_node(Oz, _centroids.XY) for Oz in unmatched_zones] else: - skim_dict = inject.get_injectable("skim_dict") + skim_dict = state.get_injectable("skim_dict") nearest = [nearest_skim(Oz, accessibility_zones) for Oz in unmatched_zones] # Add the nearest zones to the matched zones @@ -82,14 +86,14 @@ def nearest_node(oz, zones_df): return matched_df.loc[_idx] -@inject.injectable() -def disaggregate_suffixes(): +@workflow.cached_object +def disaggregate_suffixes(state: workflow.State) -> dict[str, Any]: return {"SUFFIX": None, "ROOTS": []} -@inject.table() -def maz_centroids(): - df = input.read_input_table("maz_centroids") +@workflow.table +def maz_centroids(state: workflow.State): + df = input.read_input_table(state, "maz_centroids") if not df.index.is_monotonic_increasing: df = df.sort_index() @@ -97,16 +101,17 @@ def maz_centroids(): logger.info("loaded maz_centroids %s" % (df.shape,)) # replace table function with dataframe - inject.add_table("maz_centroids", df) + state.add_table("maz_centroids", df) return df -@inject.table() -def proto_disaggregate_accessibility(): - +@workflow.table +def proto_disaggregate_accessibility(state: workflow.State): # Read existing accessibilities, but is not required to enable model compatibility - df = input.read_input_table("proto_disaggregate_accessibility", required=False) + df = input.read_input_table( + state, "proto_disaggregate_accessibility", required=False + ) # If no df, return empty dataframe to skip this model if not df: @@ -119,33 +124,47 @@ def proto_disaggregate_accessibility(): logger.info("loaded proto_disaggregate_accessibility %s" % (df.shape,)) # replace table function with dataframe - inject.add_table("proto_disaggregate_accessibility", df) + state.add_table("proto_disaggregate_accessibility", df) return df -@inject.table() -def disaggregate_accessibility(persons, households, land_use, accessibility): +@workflow.table +def disaggregate_accessibility(state: workflow.State): """ This step initializes pre-computed disaggregate accessibility and merges it onto the full synthetic population. Function adds merged all disaggregate accessibility tables to the pipeline but returns nothing. 
""" + persons = state.get_dataframe("persons") + households = state.get_dataframe("households") + land_use = state.get_dataframe("land_use") + accessibility = state.get_dataframe("accessibility") + # If disaggregate_accessibilities do not exist in the pipeline, it will try loading csv of that name - proto_accessibility_df = pipeline.get_table("proto_disaggregate_accessibility") + proto_accessibility_df = state.get_dataframe("proto_disaggregate_accessibility") # If there is no table, skip. We do this first to skip as fast as possible if proto_accessibility_df.empty: return pd.DataFrame() # Get persons merged manually - persons_merged_df = inject.merge_tables( - persons.name, tables=[persons, households, land_use, accessibility] + from activitysim.abm.tables.persons import persons_merged + + persons_merged_df = persons_merged( + state, + persons, + land_use, + households, + accessibility, + disaggregate_accessibility=None, ) # Extract model settings - model_settings = config.read_model_settings("disaggregate_accessibility.yaml") + model_settings = state.filesystem.read_model_settings( + "disaggregate_accessibility.yaml" + ) merging_params = model_settings.get("MERGE_ON") nearest_method = model_settings.get("NEAREST_METHOD", "skims") accessibility_cols = [ @@ -165,7 +184,7 @@ def disaggregate_accessibility(persons, households, land_use, accessibility): # Note that from here on the 'home_zone_id' is the matched name if "nearest_accessibility_zone_id" not in persons_merged_df.columns: persons_merged_df = find_nearest_accessibility_zone( - persons_merged_df, proto_accessibility_df, nearest_method + state, persons_merged_df, proto_accessibility_df, nearest_method ) # Copy home_zone_id in proto-table to match the temporary 'nearest_zone_id' @@ -260,11 +279,6 @@ def disaggregate_accessibility(persons, households, land_use, accessibility): assert any(merge_df[accessibility_cols].isnull()) # Inject merged accessibilities so that it can be included in persons_merged function - inject.add_table("disaggregate_accessibility", merge_df[accessibility_cols]) + state.add_table("disaggregate_accessibility", merge_df[accessibility_cols]) return merge_df[accessibility_cols] - - -inject.broadcast( - "disaggregate_accessibility", "persons", cast_index=True, onto_on="person_id" -) diff --git a/activitysim/abm/tables/households.py b/activitysim/abm/tables/households.py index e0a42f63bc..142da7a015 100644 --- a/activitysim/abm/tables/households.py +++ b/activitysim/abm/tables/households.py @@ -1,21 +1,27 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import io import logging -from builtins import range import pandas as pd -from activitysim.core import inject, mem, pipeline, tracing +from activitysim.abm.misc import override_hh_ids +from activitysim.abm.tables.util import simple_table_join +from activitysim.core import tracing, workflow from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) -@inject.table() -def households(households_sample_size, override_hh_ids, trace_hh_id): +@workflow.table +def households(state: workflow.State) -> pd.DataFrame: + households_sample_size = state.settings.households_sample_size + _override_hh_ids = override_hh_ids(state) + _trace_hh_id = state.settings.trace_hh_id - df_full = read_input_table("households") + df_full = read_input_table(state, "households") tot_households = df_full.shape[0] logger.info("full household list contains %s households" % tot_households) @@ -23,35 +29,32 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): households_sliced = False # only using households listed in override_hh_ids - if override_hh_ids is not None: - + if _override_hh_ids is not None: # trace_hh_id will not used if it is not in list of override_hh_ids logger.info( - "override household list containing %s households" % len(override_hh_ids) + "override household list containing %s households" % len(_override_hh_ids) ) - df = df_full[df_full.index.isin(override_hh_ids)] + df = df_full[df_full.index.isin(_override_hh_ids)] households_sliced = True - if df.shape[0] < len(override_hh_ids): + if df.shape[0] < len(_override_hh_ids): logger.info( "found %s of %s households in override household list" - % (df.shape[0], len(override_hh_ids)) + % (df.shape[0], len(_override_hh_ids)) ) if df.shape[0] == 0: raise RuntimeError("No override households found in store") # if we are tracing hh exclusively - elif trace_hh_id and households_sample_size == 1: - + elif _trace_hh_id and households_sample_size == 1: # df contains only trace_hh (or empty if not in full store) - df = tracing.slice_ids(df_full, trace_hh_id) + df = tracing.slice_ids(df_full, _trace_hh_id) households_sliced = True # if we need a subset of full store elif tot_households > households_sample_size > 0: - logger.info( "sampling %s of %s households" % (households_sample_size, tot_households) ) @@ -66,27 +69,30 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): if the pipeline rng's base_seed is changed """ - prng = pipeline.get_rn_generator().get_external_rng("sample_households") + prng = state.get_rn_generator().get_external_rng("sample_households") df = df_full.take( prng.choice(len(df_full), size=households_sample_size, replace=False) ) households_sliced = True # if tracing and we missed trace_hh in sample, but it is in full store - if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index: + if ( + _trace_hh_id + and _trace_hh_id not in df.index + and _trace_hh_id in df_full.index + ): # replace first hh in sample with trace_hh logger.debug( "replacing household %s with %s in household sample" - % (df.index[0], trace_hh_id) + % (df.index[0], _trace_hh_id) ) - df_hh = df_full.loc[[trace_hh_id]] + df_hh = df_full.loc[[_trace_hh_id]] df = pd.concat([df_hh, df[1:]]) else: df = df_full - # persons table - inject.add_injectable("households_sliced", households_sliced) + state.set("households_sliced", households_sliced) if "sample_rate" not in df.columns: if households_sample_size == 0: @@ -102,27 +108,34 @@ def 
households(households_sample_size, override_hh_ids, trace_hh_id): logger.debug("households.info:\n" + buffer.getvalue()) # replace table function with dataframe - inject.add_table("households", df) + state.add_table("households", df) - pipeline.get_rn_generator().add_channel("households", df) + state.get_rn_generator().add_channel("households", df) - tracing.register_traceable_table("households", df) - if trace_hh_id: - tracing.trace_df(df, "raw.households", warn_if_empty=True) + state.tracing.register_traceable_table("households", df) + if _trace_hh_id: + state.tracing.trace_df(df, "raw.households", warn_if_empty=True) return df # this is a common merge so might as well define it once here and use it -@inject.table() -def households_merged(households, land_use, accessibility): - return inject.merge_tables( - households.name, tables=[households, land_use, accessibility] +@workflow.temp_table +def households_merged( + state: workflow.State, + households: pd.DataFrame, + land_use: pd.DataFrame, + accessibility: pd.DataFrame, +) -> pd.DataFrame: + + households = simple_table_join( + households, + land_use, + left_on="home_zone_id", ) - - -inject.broadcast("households", "persons", cast_index=True, onto_on="household_id") - -# this would be accessibility around the household location - be careful with -# this one as accessibility at some other location can also matter -inject.broadcast("accessibility", "households", cast_index=True, onto_on="home_zone_id") + households = simple_table_join( + households, + accessibility, + left_on="home_zone_id", + ) + return households diff --git a/activitysim/abm/tables/landuse.py b/activitysim/abm/tables/landuse.py index 1161bfe5dc..8d9376b757 100644 --- a/activitysim/abm/tables/landuse.py +++ b/activitysim/abm/tables/landuse.py @@ -1,20 +1,27 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import io import logging -from activitysim.core import config, inject +import numpy as np +import pandas as pd + +from activitysim.core import workflow +from activitysim.core.exceptions import MissingInputTableDefinition from activitysim.core.input import read_input_table +from activitysim.core.los import Network_LOS +from activitysim.core.skim_dictionary import SkimDict logger = logging.getLogger(__name__) -@inject.table() -def land_use(): - - df = read_input_table("land_use") +@workflow.table +def land_use(state: workflow.State): + df = read_input_table(state, "land_use") - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = state.settings.sharrow if sharrow_enabled: # when using sharrow, the land use file must be organized (either in raw # form or via recoding) so that the index is zero-based and contiguous @@ -33,20 +40,38 @@ def land_use(): buffer = io.StringIO() df.info(buf=buffer) logger.debug("land_use.info:\n" + buffer.getvalue()) - - # replace table function with dataframe - inject.add_table("land_use", df) - return df -inject.broadcast("land_use", "households", cast_index=True, onto_on="home_zone_id") - - -@inject.table() -def land_use_taz(): - - df = read_input_table("land_use_taz") +@workflow.table +def land_use_taz(state: workflow.State): + try: + df = read_input_table(state, "land_use_taz") + except MissingInputTableDefinition: + # if the land_use_taz table is not given explicitly in the settings, + # we will construct our best approximation of the table by collecting + # a sorted list of unique TAZ ids found in the land_use table of MAZs. 
+ # In nearly all cases this should be good enough, unless the model + # includes TAZs without MAZs (e.g. external stations) or for some + # reason wants TAZs in some not-sorted ordering. + land_use = state.get_dataframe("land_use") + if "TAZ" not in land_use: + raise + logger.warning( + "no land_use_taz defined in input_table_list, constructing " + "from discovered TAZ values in land_use" + ) + unique_tazs = np.unique(land_use["TAZ"]) + if state.settings.recode_pipeline_columns: + df = pd.Series( + unique_tazs, + name="_original_TAZ", + index=pd.RangeIndex(unique_tazs.size, name="TAZ"), + ).to_frame() + else: + df = pd.DataFrame( + index=pd.Index(unique_tazs, name="TAZ"), + ) if not df.index.is_monotonic_increasing: df = df.sort_index() @@ -57,6 +82,6 @@ def land_use_taz(): logger.debug("land_use_taz.info:\n" + buffer.getvalue()) # replace table function with dataframe - inject.add_table("land_use_taz", df) + state.add_table("land_use_taz", df) return df diff --git a/activitysim/abm/tables/persons.py b/activitysim/abm/tables/persons.py index a3d3804cf4..8825ebe848 100644 --- a/activitysim/abm/tables/persons.py +++ b/activitysim/abm/tables/persons.py @@ -1,31 +1,34 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import io import logging import pandas as pd -from activitysim.core import inject, pipeline, tracing +from activitysim.abm.tables.util import simple_table_join +from activitysim.core import workflow from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) -def read_raw_persons(households): - - df = read_input_table("persons") +def read_raw_persons(state, households): + df = read_input_table(state, "persons") - if inject.get_injectable("households_sliced", False): + if state.get_injectable("households_sliced", False): # keep only persons in the sampled households df = df[df.household_id.isin(households.index)] return df -@inject.table() -def persons(households, trace_hh_id): - - df = read_raw_persons(households) +@workflow.table +def persons(state: workflow.State) -> pd.DataFrame: + households = state.get_dataframe("households") + trace_hh_id = state.settings.trace_hh_id + df = read_raw_persons(state, households) logger.info("loaded persons %s" % (df.shape,)) buffer = io.StringIO() @@ -33,13 +36,13 @@ def persons(households, trace_hh_id): logger.debug("persons.info:\n" + buffer.getvalue()) # replace table function with dataframe - inject.add_table("persons", df) + state.add_table("persons", df) - pipeline.get_rn_generator().add_channel("persons", df) + state.get_rn_generator().add_channel("persons", df) - tracing.register_traceable_table("persons", df) + state.tracing.register_traceable_table("persons", df) if trace_hh_id: - tracing.trace_df(df, "raw.persons", warn_if_empty=True) + state.tracing.trace_df(df, "raw.persons", warn_if_empty=True) logger.debug(f"{len(df.household_id.unique())} unique household_ids in persons") logger.debug(f"{len(households.index.unique())} unique household_ids in households") @@ -49,7 +52,7 @@ def persons(households, trace_hh_id): persons_without_households = ~df.household_id.isin(households.index) if persons_without_households.any(): logger.error( - f"{persons_without_households.sum()} persons out of {len(persons)} without households\n" + f"{persons_without_households.sum()} persons out of {len(df)} without households\n" f"{pd.Series({'person_id': persons_without_households.index.values})}" ) raise RuntimeError( @@ -71,21 +74,37 @@ def persons(households, trace_hh_id): return df 
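Note: the `households` and `persons` providers above show how table registration now goes through the state object instead of the `pipeline` and `tracing` modules: add the table, register it as a random-number channel, and register it as a traceable table. A condensed sketch of that lifecycle for a hypothetical agent table (whether a table can serve as an rng channel depends on the model's canonical ids, so treat this strictly as a pattern sketch):

    import pandas as pd

    from activitysim.core import workflow
    from activitysim.core.input import read_input_table


    @workflow.table
    def example_agents(state: workflow.State) -> pd.DataFrame:
        df = read_input_table(state, "example_agents")
        state.add_table("example_agents", df)
        state.get_rn_generator().add_channel("example_agents", df)
        state.tracing.register_traceable_table("example_agents", df)
        if state.settings.trace_hh_id:
            state.tracing.trace_df(df, "raw.example_agents", warn_if_empty=True)
        return df
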
-# another common merge for persons -@inject.table() +@workflow.temp_table def persons_merged( - persons, households, land_use, accessibility, disaggregate_accessibility + state: workflow.State, + persons: pd.DataFrame, + land_use: pd.DataFrame, + households: pd.DataFrame, + accessibility: pd.DataFrame, + disaggregate_accessibility: pd.DataFrame = None, ): - - if not disaggregate_accessibility.to_frame().empty: - tables = [ + n_persons = len(persons) + households = simple_table_join( + households, + land_use, + left_on="home_zone_id", + ) + households = simple_table_join( + households, + accessibility, + left_on="home_zone_id", + ) + persons = simple_table_join( + persons, + households, + left_on="household_id", + ) + if disaggregate_accessibility is not None and not disaggregate_accessibility.empty: + persons = simple_table_join( persons, - households, - land_use, - accessibility, disaggregate_accessibility, - ] - else: - tables = [persons, households, land_use, accessibility] - - return inject.merge_tables(persons.name, tables=tables) + left_on="person_id", + ) + if n_persons != len(persons): + raise RuntimeError("number of persons changed") + return persons diff --git a/activitysim/abm/tables/shadow_pricing.py b/activitysim/abm/tables/shadow_pricing.py index 396cf7c567..87870f5b68 100644 --- a/activitysim/abm/tables/shadow_pricing.py +++ b/activitysim/abm/tables/shadow_pricing.py @@ -1,16 +1,20 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import ctypes import logging import multiprocessing import time from collections import OrderedDict +from typing import Any import numpy as np import pandas as pd +from activitysim.abm.tables.size_terms import size_terms as get_size_terms from activitysim.abm.tables.size_terms import tour_destination_size_terms -from activitysim.core import config, inject, logit, tracing, util +from activitysim.core import logit, tracing, util, workflow from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -84,9 +88,10 @@ def size_table_name(model_selector): return "%s_destination_size" % model_selector -class ShadowPriceCalculator(object): +class ShadowPriceCalculator: def __init__( self, + state: workflow.State, model_settings, num_processes, shared_data=None, @@ -112,14 +117,14 @@ def __init__( """ self.num_processes = num_processes - self.use_shadow_pricing = bool(config.setting("use_shadow_pricing")) + self.use_shadow_pricing = bool(state.settings.use_shadow_pricing) self.saved_shadow_price_file_path = ( None # set by read_saved_shadow_prices if loaded ) self.model_selector = model_settings["MODEL_SELECTOR"] - if (self.num_processes > 1) and not config.setting("fail_fast"): + if (self.num_processes > 1) and not state.settings.fail_fast: # if we are multiprocessing, then fail_fast should be true or we will wait forever for failed processes logger.warning( "deprecated combination of multiprocessing and not fail_fast" @@ -134,14 +139,16 @@ def __init__( self.modeled_size = None if self.use_shadow_pricing: - self.shadow_settings = config.read_model_settings("shadow_pricing.yaml") + self.shadow_settings = state.filesystem.read_model_settings( + "shadow_pricing.yaml" + ) for k in self.shadow_settings: logger.debug( "shadow_settings %s: %s" % (k, self.shadow_settings.get(k)) ) - full_model_run = config.setting("households_sample_size") == 0 + full_model_run = state.settings.households_sample_size == 0 if ( self.use_shadow_pricing and not full_model_run @@ -167,9 +174,7 @@ def __init__( 
self.use_shadow_pricing = False # - destination_size_table (desired_size) - self.desired_size = inject.get_table( - size_table_name(self.model_selector) - ).to_frame() + self.desired_size = state.get_dataframe(size_table_name(self.model_selector)) self.desired_size = self.desired_size.sort_index() assert ( @@ -205,7 +210,9 @@ def __init__( if self.shadow_settings["LOAD_SAVED_SHADOW_PRICES"]: # read_saved_shadow_prices logs error and returns None if file not found - self.shadow_prices = self.read_saved_shadow_prices(model_settings) + self.shadow_prices = self.read_saved_shadow_prices( + state, model_settings + ) if self.shadow_prices is None: self.max_iterations = self.shadow_settings.get("MAX_ITERATIONS", 5) @@ -239,10 +246,9 @@ def __init__( self.use_shadow_pricing and self.shadow_settings["SHADOW_PRICE_METHOD"] == "simulation" ): - assert self.model_selector in ["workplace", "school"] self.target = {} - land_use = inject.get_table("land_use").to_frame() + land_use = state.get_dataframe("land_use") if self.model_selector == "workplace": employment_targets = self.shadow_settings[ @@ -276,7 +282,7 @@ def __init__( ), f"{target} is not in landuse columns: {land_use.columns}" self.target[segment] = land_use[target] - def read_saved_shadow_prices(self, model_settings): + def read_saved_shadow_prices(self, state, model_settings): """ Read saved shadow_prices from csv file in data_dir (so-called warm start) returns None if no saved shadow price file name specified or named file not found @@ -298,7 +304,7 @@ def read_saved_shadow_prices(self, model_settings): ) if saved_shadow_price_file_name: # FIXME - where should we look for this file? - file_path = config.data_file_path( + file_path = state.filesystem.get_data_file_path( saved_shadow_price_file_name, mandatory=False ) if file_path: @@ -479,7 +485,6 @@ def set_choices(self, choices, segment_ids): modeled_size = pd.DataFrame(index=self.desired_size.index) for seg_name in self.desired_size: - segment_choices = choices[(segment_ids == self.segment_ids[seg_name])] modeled_size[seg_name] = segment_choices.value_counts() @@ -512,7 +517,7 @@ def set_choices(self, choices, segment_ids): self.choices_synced = self.synchronize_choices(choice_merged) - def check_fit(self, iteration): + def check_fit(self, state: workflow.State, iteration): """ Check convergence criteria fit of modeled_size to target desired_size (For multiprocessing, this is global modeled_size summed across processes, @@ -550,7 +555,6 @@ def check_fit(self, iteration): self.choices_by_iteration[iteration] = self.choices_synced if self.shadow_settings["SHADOW_PRICE_METHOD"] != "simulation": - modeled_size = self.modeled_size desired_size = self.desired_size @@ -638,7 +642,7 @@ def check_fit(self, iteration): logger.info("\nshadow_pricing num_fail\n%s" % self.num_fail) if write_choices: - tracing.write_csv( + state.tracing.write_csv( self.choices_by_iteration, "%s_choices_by_shadow_price_iteration" % self.model_selector, transpose=False, @@ -646,7 +650,7 @@ def check_fit(self, iteration): return converged - def update_shadow_prices(self): + def update_shadow_prices(self, state): """ Adjust shadow_prices based on relative values of modeled_size and desired_size. 
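# --- Illustrative sketch, not part of the patch ---------------------------
# A simplified version of the tolerance test that check_fit() applies when
# comparing modeled_size against desired_size.  PERCENT_TOLERANCE appears in
# the hunks above; FAIL_THRESHOLD is assumed here as an illustrative knob,
# and the toy frames are hypothetical.  The real method also handles the
# simulation-based pricing path and multiprocessing synchronization.
import pandas as pd

desired_size = pd.DataFrame({"work_low": [100.0, 50.0, 25.0]})
modeled_size = pd.DataFrame({"work_low": [104.0, 70.0, 25.0]})
PERCENT_TOLERANCE = 5   # zones within 5% of target are considered converged
FAIL_THRESHOLD = 10     # percent of zones allowed to miss the tolerance

rel_error = 100 * (modeled_size - desired_size).abs() / desired_size.clip(lower=1)
num_fail = int((rel_error > PERCENT_TOLERANCE).values.sum())
converged = num_fail <= (FAIL_THRESHOLD / 100) * desired_size.size
# here one of three zones misses the tolerance, so converged is False
# ---------------------------------------------------------------------------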
@@ -771,7 +775,7 @@ def update_shadow_prices(self): """ percent_tolerance = self.shadow_settings["PERCENT_TOLERANCE"] sampled_persons = pd.DataFrame() - persons_merged = inject.get_table("persons_merged").to_frame() + persons_merged = state.get_dataframe("persons_merged") # need to join the segment to the choices to sample correct persons segment_to_name_dict = self.shadow_settings.get( @@ -843,7 +847,7 @@ def update_shadow_prices(self): index=choices.index, ) # using ActivitySim's RNG to make choices for repeatability - current_sample, rands = logit.make_choices(probs) + current_sample, rands = logit.make_choices(state, probs) current_sample = current_sample[current_sample == 1] if len(sampled_persons) == 0: @@ -857,14 +861,12 @@ def update_shadow_prices(self): raise RuntimeError("unknown SHADOW_PRICE_METHOD %s" % shadow_price_method) def dest_size_terms(self, segment): - assert segment in self.segment_ids size_term_adjustment = 1 utility_adjustment = 0 if self.use_shadow_pricing: - shadow_price_method = self.shadow_settings["SHADOW_PRICE_METHOD"] if shadow_price_method == "ctramp": @@ -891,7 +893,7 @@ def dest_size_terms(self, segment): return size_terms - def write_trace_files(self, iteration): + def write_trace_files(self, state: workflow.State, iteration): """ Write trace files for this iteration Writes desired_size, modeled_size, and shadow_prices tables @@ -907,20 +909,20 @@ def write_trace_files(self, iteration): logger.info("write_trace_files iteration %s" % iteration) if iteration == 1: # write desired_size only on first iteration, as it doesn't change - tracing.write_csv( + state.tracing.write_csv( self.desired_size, "shadow_price_%s_desired_size" % self.model_selector, transpose=False, ) - tracing.write_csv( + state.tracing.write_csv( self.modeled_size, "shadow_price_%s_modeled_size_%s" % (self.model_selector, iteration), transpose=False, ) if self.use_shadow_pricing: - tracing.write_csv( + state.tracing.write_csv( self.shadow_prices, "shadow_price_%s_shadow_prices_%s" % (self.model_selector, iteration), transpose=False, @@ -976,7 +978,6 @@ def buffers_for_shadow_pricing(shadow_pricing_info): data_buffers = {} for block_key, block_shape in block_shapes.items(): - # buffer_size must be int, not np.int64 buffer_size = util.iprod(block_shape) @@ -1002,7 +1003,7 @@ def buffers_for_shadow_pricing(shadow_pricing_info): return data_buffers -def buffers_for_shadow_pricing_choice(shadow_pricing_choice_info): +def buffers_for_shadow_pricing_choice(state, shadow_pricing_choice_info): """ Same as above buffers_for_shadow_price function except now we need to store the actual choices for the simulation based shadow pricing method @@ -1026,7 +1027,6 @@ def buffers_for_shadow_pricing_choice(shadow_pricing_choice_info): data_buffers = {} for block_key, block_shape in block_shapes.items(): - # buffer_size must be int, not np.int64 buffer_size = util.iprod(block_shape) @@ -1049,7 +1049,7 @@ def buffers_for_shadow_pricing_choice(shadow_pricing_choice_info): data_buffers[block_key + "_choice"] = shared_data_buffer - persons = read_input_table("persons") + persons = read_input_table(state, "persons") sp_choice_df = persons.reset_index()["person_id"].to_frame() # declare a shared Array with data from sp_choice_df @@ -1162,7 +1162,7 @@ def shadow_price_data_from_buffers(data_buffers, shadow_pricing_info, model_sele return np.frombuffer(data.get_obj(), dtype=dtype).reshape(shape), data.get_lock() -def load_shadow_price_calculator(model_settings): +def load_shadow_price_calculator(state: workflow.State, 
model_settings): """ Initialize ShadowPriceCalculator for model_selector (e.g. school or workplace) @@ -1178,20 +1178,20 @@ def load_shadow_price_calculator(model_settings): spc : ShadowPriceCalculator """ - num_processes = inject.get_injectable("num_processes", 1) + num_processes = state.get_injectable("num_processes", 1) model_selector = model_settings["MODEL_SELECTOR"] # - get shared_data from data_buffers (if multiprocessing) - data_buffers = inject.get_injectable("data_buffers", None) + data_buffers = state.get_injectable("data_buffers", None) if data_buffers is not None: logger.info("Using existing data_buffers for shadow_price") # - shadow_pricing_info - shadow_pricing_info = inject.get_injectable("shadow_pricing_info", None) + shadow_pricing_info = state.get_injectable("shadow_pricing_info", None) assert shadow_pricing_info is not None - shadow_pricing_choice_info = inject.get_injectable( + shadow_pricing_choice_info = state.get_injectable( "shadow_pricing_choice_info", None ) assert shadow_pricing_choice_info is not None @@ -1218,6 +1218,7 @@ def load_shadow_price_calculator(model_settings): # - ShadowPriceCalculator spc = ShadowPriceCalculator( + state, model_settings, num_processes, data, @@ -1230,17 +1231,12 @@ def load_shadow_price_calculator(model_settings): return spc -# first define add_size_tables as an orca step with no scale argument at all. -@inject.step() -def add_size_tables(disaggregate_suffixes): - return _add_size_tables(disaggregate_suffixes) - - -# then define _add_size_tables as a second method which also offers an optional -# default argument to not scale sizes. This is used only in disaggregate -# accessibility (for now) and is not called via orca. We need to do this to -# avoid having to create a new orca variable for the scale argument. -def _add_size_tables(disaggregate_suffixes, scale=True): +@workflow.step +def add_size_tables( + state: workflow.State, + disaggregate_suffixes: dict[str, Any], + scale: bool = True, +) -> None: """ inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace) @@ -1260,9 +1256,9 @@ def _add_size_tables(disaggregate_suffixes, scale=True): (size table) counts. 
""" - use_shadow_pricing = bool(config.setting("use_shadow_pricing")) + use_shadow_pricing = bool(state.settings.use_shadow_pricing) - shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_settings = state.filesystem.read_model_settings("shadow_pricing.yaml") shadow_pricing_models = shadow_settings.get("shadow_pricing_models") if shadow_pricing_models is None: @@ -1295,8 +1291,7 @@ def _add_size_tables(disaggregate_suffixes, scale=True): # since these are scaled to model size, they have to be created while single-process for model_selector, model_name in shadow_pricing_models.items(): - - model_settings = config.read_model_settings(model_name) + model_settings = state.filesystem.read_model_settings(model_name) if suffix is not None and roots: model_settings = util.suffix_tables_in_settings( @@ -1312,24 +1307,23 @@ def _add_size_tables(disaggregate_suffixes, scale=True): chooser_table_name = model_settings["CHOOSER_TABLE_NAME"] chooser_segment_column = model_settings["CHOOSER_SEGMENT_COLUMN_NAME"] - choosers_df = inject.get_table(chooser_table_name).to_frame() + choosers_df = state.get_dataframe(chooser_table_name) if "CHOOSER_FILTER_COLUMN_NAME" in model_settings: choosers_df = choosers_df[ choosers_df[model_settings["CHOOSER_FILTER_COLUMN_NAME"]] != 0 ] # - raw_desired_size - land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") + land_use = state.get_dataframe("land_use") + size_terms = get_size_terms(state) raw_size = tour_destination_size_terms(land_use, size_terms, model_selector) assert set(raw_size.columns) == set(segment_ids.keys()) - full_model_run = config.setting("households_sample_size") == 0 + full_model_run = state.settings.households_sample_size == 0 scale_size_table = scale and scale_size_table if (use_shadow_pricing and full_model_run) and scale_size_table: - # need to scale destination size terms because ctramp and daysim approaches directly # compare modeled size and target size when computing shadow prices # Does not apply to simulation approach which compares proportions. 
@@ -1387,10 +1381,10 @@ def _add_size_tables(disaggregate_suffixes, scale=True): scaled_size.index.is_monotonic_increasing ), f"size table {size_table_name(model_selector)} not is_monotonic_increasing" - inject.add_table(size_table_name(model_selector), scaled_size, replace=True) + state.add_table(size_table_name(model_selector), scaled_size) -def get_shadow_pricing_info(): +def get_shadow_pricing_info(state): """ return dict with info about dtype and shapes of desired and modeled size tables @@ -1404,17 +1398,16 @@ def get_shadow_pricing_info(): block_shapes: dict {: } """ - land_use = inject.get_table("land_use") - size_terms = inject.get_injectable("size_terms") + land_use = state.get_dataframe("land_use") + size_terms = state.get_injectable("size_terms") - shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_settings = state.filesystem.read_model_settings("shadow_pricing.yaml") # shadow_pricing_models is dict of {: } shadow_pricing_models = shadow_settings.get("shadow_pricing_models", {}) blocks = OrderedDict() for model_selector in shadow_pricing_models: - sp_rows = len(land_use) sp_cols = len(size_terms[size_terms.model_selector == model_selector]) @@ -1434,7 +1427,7 @@ def get_shadow_pricing_info(): return shadow_pricing_info -def get_shadow_pricing_choice_info(): +def get_shadow_pricing_choice_info(state): """ return dict with info about dtype and shapes of desired and modeled size tables @@ -1448,16 +1441,15 @@ def get_shadow_pricing_choice_info(): block_shapes: dict {: } """ - persons = read_input_table("persons") + persons = read_input_table(state, "persons") - shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_settings = state.filesystem.read_model_settings("shadow_pricing.yaml") # shadow_pricing_models is dict of {: } shadow_pricing_models = shadow_settings.get("shadow_pricing_models", {}) blocks = OrderedDict() for model_selector in shadow_pricing_models: - # each person will have a work or school location choice sp_rows = len(persons) @@ -1480,21 +1472,19 @@ def get_shadow_pricing_choice_info(): return shadow_pricing_choice_info -@inject.injectable(cache=True) -def shadow_pricing_info(): - +@workflow.cached_object +def shadow_pricing_info(state: workflow.State): # when multiprocessing with shared data mp_tasks has to call network_los methods # get_shadow_pricing_info() and buffers_for_shadow_pricing() logger.debug("loading shadow_pricing_info injectable") - return get_shadow_pricing_info() - + return get_shadow_pricing_info(state) -@inject.injectable(cache=True) -def shadow_pricing_choice_info(): +@workflow.cached_object +def shadow_pricing_choice_info(state: workflow.State): # when multiprocessing with shared data mp_tasks has to call network_los methods # get_shadow_pricing_info() and buffers_for_shadow_pricing() logger.debug("loading shadow_pricing_choice_info injectable") - return get_shadow_pricing_choice_info() + return get_shadow_pricing_choice_info(state) diff --git a/activitysim/abm/tables/size_terms.py b/activitysim/abm/tables/size_terms.py index e31b004e5d..531a0b7d78 100644 --- a/activitysim/abm/tables/size_terms.py +++ b/activitysim/abm/tables/size_terms.py @@ -1,18 +1,19 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging -import numpy as np import pandas as pd -from activitysim.core import config, inject +from activitysim.core import workflow logger = logging.getLogger(__name__) -@inject.injectable(cache=True) -def size_terms(): - f = config.config_file_path("destination_choice_size_terms.csv") +@workflow.cached_object +def size_terms(state: workflow.State): + f = state.filesystem.get_config_file_path("destination_choice_size_terms.csv") return pd.read_csv(f, comment="#", index_col="segment") @@ -57,7 +58,7 @@ def tour_destination_size_terms(land_use, size_terms, model_selector): Parameters ---------- - land_use - pipeline table + land_use - pd.DataFrame size_terms - pipeline table model_selector - str @@ -78,8 +79,6 @@ def tour_destination_size_terms(land_use, size_terms, model_selector): ... """ - land_use = land_use.to_frame() - # don't count on land_use being sorted by index if not land_use.index.is_monotonic_increasing: land_use = land_use.sort_index() diff --git a/activitysim/abm/tables/skims.py b/activitysim/abm/tables/skims.py index 39440b29f2..af310c30b8 100644 --- a/activitysim/abm/tables/skims.py +++ b/activitysim/abm/tables/skims.py @@ -3,8 +3,7 @@ import logging -from activitysim.core import config, inject, los -from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.core import los, workflow logger = logging.getLogger(__name__) @@ -13,33 +12,38 @@ """ -@inject.injectable(cache=True) -def network_los_preload(): - +@workflow.cached_object +def network_los_preload(state: workflow.State) -> los.Network_LOS: # when multiprocessing with shared data mp_tasks has to call network_los methods # allocate_shared_skim_buffers() and load_shared_data() BEFORE network_los.load_data() logger.debug("loading network_los_without_data_loaded injectable") - nw_los = los.Network_LOS() - + nw_los = los.Network_LOS(state) return nw_los -@inject.injectable(cache=True) -def network_los(network_los_preload): - +@workflow.cached_object +def network_los( + state: workflow.State, # noqa: F841 + network_los_preload: los.Network_LOS, +) -> los.Network_LOS: logger.debug("loading network_los injectable") network_los_preload.load_data() return network_los_preload -@inject.injectable(cache=True) -def skim_dict(network_los): - return network_los.get_default_skim_dict() - +@workflow.cached_object +def skim_dict( + state: workflow.State, # noqa: F841 + network_los: los.Network_LOS, +): + result = network_los.get_default_skim_dict() + return result -@inject.injectable() -def log_settings(): +@workflow.cached_object +def log_settings( + state: workflow.State, # noqa: F841 +): # abm settings to log on startup return [ "households_sample_size", diff --git a/activitysim/abm/tables/table_dict.py b/activitysim/abm/tables/table_dict.py index 2b1cb9086c..12894516cb 100644 --- a/activitysim/abm/tables/table_dict.py +++ b/activitysim/abm/tables/table_dict.py @@ -1,10 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging from collections import OrderedDict from activitysim.abm.models.util import canonical_ids as cid -from activitysim.core import inject +from activitysim.core import workflow logger = logging.getLogger(__name__) @@ -14,14 +16,14 @@ """ -@inject.injectable() -def rng_channels(): +@workflow.cached_object +def rng_channels(state: workflow.State): return cid.RANDOM_CHANNELS -@inject.injectable() -def traceable_tables(): +@workflow.cached_object +def traceable_tables(state: workflow.State): # names of all traceable tables ordered by dependency on household_id # e.g. 'persons' has to be registered AFTER 'households' @@ -29,20 +31,14 @@ def traceable_tables(): return cid.TRACEABLE_TABLES -@inject.injectable() -def traceable_table_indexes(): +@workflow.cached_object +def traceable_table_indexes(state: workflow.State): # traceable_table_indexes is OrderedDict {: } # so we can find first registered table to slice by ref_col return OrderedDict() -@inject.injectable() -def traceable_table_ids(): - # traceable_table_ids is dict {: [, ]} - return dict() - - -@inject.injectable() -def canonical_table_index_names(): +@workflow.cached_object +def canonical_table_index_names(state: workflow.State): # traceable_table_ids is dict {: [, ]} return cid.CANONICAL_TABLE_INDEX_NAMES diff --git a/activitysim/abm/tables/time_windows.py b/activitysim/abm/tables/time_windows.py index 1a6279173e..af639e6691 100644 --- a/activitysim/abm/tables/time_windows.py +++ b/activitysim/abm/tables/time_windows.py @@ -1,21 +1,24 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -import os import numpy as np import pandas as pd -from activitysim.core import config, inject from activitysim.core import timetable as tt +from activitysim.core import workflow logger = logging.getLogger(__name__) -@inject.injectable(cache=True) -def tdd_alts(): +@workflow.cached_object +def tdd_alts(state: workflow.State) -> pd.DataFrame: # right now this file just contains the start and end hour - file_path = config.config_file_path("tour_departure_and_duration_alternatives.csv") + file_path = state.filesystem.get_config_file_path( + "tour_departure_and_duration_alternatives.csv" + ) df = pd.read_csv(file_path) df["duration"] = df.end - df.start @@ -26,9 +29,8 @@ def tdd_alts(): return df -@inject.injectable(cache=True) -def tdd_alt_segments(): - +@workflow.cached_object +def tdd_alt_segments(state: workflow.State) -> pd.DataFrame: # tour_purpose,time_period,start,end # work,EA,3,5 # work,AM,6,8 @@ -36,12 +38,11 @@ def tdd_alt_segments(): # school,PM,15,17 # school,EV,18,22 - file_path = config.config_file_path( + file_path = state.filesystem.get_config_file_path( "tour_departure_and_duration_segments.csv", mandatory=False ) if file_path: - df = pd.read_csv(file_path, comment="#") # - NARROW @@ -54,18 +55,20 @@ def tdd_alt_segments(): return df -@inject.table() -def person_windows(persons, tdd_alts): - +@workflow.table +def person_windows( + state: workflow.State, + persons: pd.DataFrame, + tdd_alts: pd.DataFrame, +) -> pd.DataFrame: df = tt.create_timetable_windows(persons, tdd_alts) - inject.add_table("person_windows", df) - return df -@inject.injectable() -def timetable(person_windows, tdd_alts): - - logging.debug("@inject timetable") - return tt.TimeTable(person_windows.to_frame(), tdd_alts, person_windows.name) +@workflow.cached_object +def timetable( + state: workflow.State, person_windows: pd.DataFrame, tdd_alts: pd.DataFrame +) -> 
tt.TimeTable: + logging.debug("@workflow.cached_object timetable") + return tt.TimeTable(person_windows, tdd_alts, "person_windows") diff --git a/activitysim/abm/tables/tours.py b/activitysim/abm/tables/tours.py index a3bd8a8112..b23cfaa28e 100644 --- a/activitysim/abm/tables/tours.py +++ b/activitysim/abm/tables/tours.py @@ -1,15 +1,23 @@ # ActivitySim # See full license in LICENSE.txt. -import logging +from __future__ import annotations -from activitysim.core import inject +import logging -logger = logging.getLogger(__name__) +import pandas as pd +from activitysim.abm.tables.util import simple_table_join +from activitysim.core import workflow -@inject.table() -def tours_merged(tours, persons_merged): - return inject.merge_tables(tours.name, tables=[tours, persons_merged]) +logger = logging.getLogger(__name__) -inject.broadcast("persons_merged", "tours", cast_index=True, onto_on="person_id") +@workflow.temp_table +def tours_merged( + state: workflow.State, tours: pd.DataFrame, persons_merged: pd.DataFrame +): + return simple_table_join( + tours, + persons_merged, + left_on="person_id", + ) diff --git a/activitysim/abm/tables/trips.py b/activitysim/abm/tables/trips.py index 890cbd6f57..ee99635868 100644 --- a/activitysim/abm/tables/trips.py +++ b/activitysim/abm/tables/trips.py @@ -1,15 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -from activitysim.core import inject +from activitysim.abm.tables.util import simple_table_join +from activitysim.core import workflow logger = logging.getLogger(__name__) -@inject.table() -def trips_merged(trips, tours): - return inject.merge_tables(trips.name, tables=[trips, tours]) - - -inject.broadcast("tours", "trips", cast_index=True, onto_on="tour_id") +@workflow.temp_table +def trips_merged(state: workflow.State, trips, tours): + return simple_table_join(trips, tours, "tour_id") diff --git a/activitysim/abm/tables/util.py b/activitysim/abm/tables/util.py new file mode 100644 index 0000000000..0b43126a6b --- /dev/null +++ b/activitysim/abm/tables/util.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import pandas as pd + + +def simple_table_join( + left: pd.DataFrame, right: pd.DataFrame, left_on: str +) -> pd.DataFrame: + """ + A simple table join. + + The left table should usually have a many-to-one (or a one-to-one) + relationship with the right table (so, exactly one row on the right table + matches each row in the left). This is not enforced and the code can + still work with many-to-many join, but ActivitySim by convention includes + only many-to-one joins. + + This function mostly mirrors the usual pandas `join`, except when there + are duplicate column names in the right-side table, in which case those + duplciate columns are silently dropped instead of getting renamed. + + Parameters + ---------- + left, right : DataFrame + left_on : str + The name of the column of the left + + Returns + ------- + DataFrame + """ + # all the column names in both left and right + intersection = set(left.columns).intersection(right.columns) + intersection.discard(left_on) # intersection is ok if it's the join key + + # duplicate column names in the right-side table are ignored. 
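# --- Illustrative sketch, not part of the patch ---------------------------
# A concrete example of the duplicate-column behavior described in the
# docstring above (hypothetical frames): if `left` has columns [k, a, b] and
# `right` has columns [b, c], joining on left_on="k" against right's index
# drops the shared column "b" from `right` before merging, so the result
# keeps left's "b" rather than producing b_x / b_y suffixes:
#
#   left = pd.DataFrame({"k": [1, 1, 2], "a": [10, 11, 12], "b": [0, 0, 0]})
#   right = pd.DataFrame({"b": [9, 9], "c": [5, 6]}, index=[1, 2])
#   simple_table_join(left, right, left_on="k").columns
#   # -> Index(['k', 'a', 'b', 'c'], dtype='object')
# ---------------------------------------------------------------------------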
+ right = right.drop(intersection, axis=1) + + return pd.merge( + left, + right, + left_on=left_on, + right_index=True, + ) diff --git a/activitysim/abm/tables/vehicles.py b/activitysim/abm/tables/vehicles.py index fdc886a25b..c998c9abe3 100644 --- a/activitysim/abm/tables/vehicles.py +++ b/activitysim/abm/tables/vehicles.py @@ -1,14 +1,19 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -from activitysim.core import inject, pipeline, tracing +import pandas as pd + +from activitysim.abm.tables.util import simple_table_join +from activitysim.core import workflow logger = logging.getLogger(__name__) -@inject.table() -def vehicles(households): +@workflow.table +def vehicles(state: workflow.State, households: pd.DataFrame): """Creates the vehicles table and load it as an injectable This method initializes the `vehicles` table, where the number of rows @@ -16,7 +21,7 @@ def vehicles(households): Parameters ---------- - households : orca.DataFrameWrapper + households : DataFrame Returns ------- @@ -24,9 +29,7 @@ def vehicles(households): """ # initialize vehicles table - vehicles = households.to_frame().loc[ - households.index.repeat(households["auto_ownership"]) - ] + vehicles = households.loc[households.index.repeat(households["auto_ownership"])] vehicles = vehicles.reset_index()[["household_id"]] vehicles["vehicle_num"] = vehicles.groupby("household_id").cumcount() + 1 @@ -35,34 +38,27 @@ def vehicles(households): vehicles.set_index("vehicle_id", inplace=True) # replace table function with dataframe - inject.add_table("vehicles", vehicles) + state.add_table("vehicles", vehicles) - pipeline.get_rn_generator().add_channel("vehicles", vehicles) - tracing.register_traceable_table("vehicles", vehicles) + state.get_rn_generator().add_channel("vehicles", vehicles) + state.tracing.register_traceable_table("vehicles", vehicles) return vehicles -@inject.table() -def vehicles_merged(vehicles, households_merged): +@workflow.temp_table +def vehicles_merged( + state: workflow.State, vehicles: pd.DataFrame, households_merged: pd.DataFrame +): """Augments the vehicles table with household attributes Parameters ---------- - vehicles : orca.DataFrameWrapper - households_merged : orca.DataFrameWrapper + vehicles : DataFrame + households_merged : DataFrame Returns ------- vehicles_merged : pandas.DataFrame """ - - vehicles_merged = inject.merge_tables( - vehicles.name, tables=[vehicles, households_merged] - ) - return vehicles_merged - - -inject.broadcast( - "households_merged", "vehicles", cast_index=True, onto_on="household_id" -) + return simple_table_join(vehicles, households_merged, "household_id") diff --git a/activitysim/abm/test/conftest.py b/activitysim/abm/test/conftest.py deleted file mode 100644 index f14a69149d..0000000000 --- a/activitysim/abm/test/conftest.py +++ /dev/null @@ -1,55 +0,0 @@ -import os - -import orca -import pandas as pd -import pytest - -from activitysim.core import pipeline -from activitysim.core.los import Network_LOS as los - - -@pytest.fixture(scope="module") -def initialize_pipeline( - module: str, tables: dict[str, str], initialize_network_los: bool -) -> pipeline.Pipeline: - test_dir = os.path.join("test", module) - configs_dir = os.path.join(test_dir, "configs") - data_dir = os.path.join(test_dir, "data") - output_dir = os.path.join(test_dir, "output") - - if os.path.isdir(configs_dir): - orca.add_injectable("configs_dir", configs_dir) - - if os.path.isdir(data_dir): - orca.add_injectable("data_dir", data_dir) - - if 
os.path.isdir(test_dir): - if not os.path.isdir(output_dir): - os.mkdir(output_dir) - orca.add_injectable("output_dir", output_dir) - - # Read in the input test dataframes - for dataframe_name, idx_name in tables.items(): - df = pd.read_csv( - os.path.join("test", module, "data", f"{dataframe_name}.csv"), - index_col=idx_name, - ) - orca.add_table(dataframe_name, df) - - if initialize_network_los: - net_los = los() - net_los.load_data() - orca.add_injectable("network_los", net_los) - - # Add the dataframes to the pipeline - pipeline.open_pipeline() - pipeline.add_checkpoint(module) - pipeline.close_pipeline() - - # By convention, this method needs to yield something - yield pipeline._PIPELINE - - # pytest teardown code - pipeline.close_pipeline() - pipeline_file_path = os.path.join(output_dir, "pipeline.h5") - os.unlink(pipeline_file_path) diff --git a/activitysim/abm/test/run_multi_zone_mp.py b/activitysim/abm/test/run_multi_zone_mp.py index dacd1b3103..f4751864ba 100644 --- a/activitysim/abm/test/run_multi_zone_mp.py +++ b/activitysim/abm/test/run_multi_zone_mp.py @@ -6,7 +6,7 @@ import pandas.testing as pdt from test_multi_zone import example_path, regress_3_zone, setup_dirs -from activitysim.core import inject, mp_tasks, pipeline +from activitysim.core import mp_tasks # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 100 @@ -17,20 +17,20 @@ def test_mp_run(): configs_dir = [example_path("configs_3_zone"), example_path("configs")] data_dir = example_path("data_3") - setup_dirs(configs_dir, data_dir) - inject.add_injectable("settings_file_name", "settings_mp.yaml") + state = setup_dirs(configs_dir, data_dir) + state.add_injectable("settings_file_name", "settings_mp.yaml") - run_list = mp_tasks.get_run_list() + run_list = mp_tasks.get_run_list(state) mp_tasks.print_run_list(run_list) # do this after config.handle_standard_args, as command line args may override injectables injectables = ["data_dir", "configs_dir", "output_dir", "settings_file_name"] - injectables = {k: inject.get_injectable(k) for k in injectables} + injectables = {k: state.get_injectable(k) for k in injectables} mp_tasks.run_multiprocess(run_list, injectables) - pipeline.open_pipeline("_") + pipeline.checkpoint.restore("_") regress_3_zone() - pipeline.close_pipeline() + pipeline.checkpoint.close_store() if __name__ == "__main__": diff --git a/activitysim/abm/test/test_misc/setup_utils.py b/activitysim/abm/test/test_misc/setup_utils.py index d844dcc5fd..65deed2129 100644 --- a/activitysim/abm/test/test_misc/setup_utils.py +++ b/activitysim/abm/test/test_misc/setup_utils.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import os @@ -12,7 +14,7 @@ import pytest import yaml -from activitysim.core import config, inject, pipeline, random, tracing +from activitysim.core import config, random, tracing, workflow # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 50 @@ -37,36 +39,33 @@ def setup_dirs(ancillary_configs_dir=None, data_dir=None): # ancillary_configs_dir is used by run_mp to test multiprocess - test_pipeline_configs_dir = os.path.join(os.path.dirname(__file__), "configs") + # test_pipeline_configs_dir = os.path.join(os.path.dirname(__file__), "configs") example_configs_dir = example_path("configs") - configs_dir = [test_pipeline_configs_dir, example_configs_dir] + # configs_dir = [test_pipeline_configs_dir, example_configs_dir] + configs_dir = [example_configs_dir] if ancillary_configs_dir is not None: configs_dir = [ancillary_configs_dir] + configs_dir - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) if not data_dir: data_dir = example_path("data") - inject.add_injectable("data_dir", data_dir) - - inject.clear_cache() + state = workflow.State.make_default( + configs_dir=configs_dir, + output_dir=output_dir, + data_dir=data_dir, + ) - tracing.config_logger() + state.logging.config_logger() - tracing.delete_output_files("csv") - tracing.delete_output_files("txt") - tracing.delete_output_files("yaml") - tracing.delete_output_files("omx") + state.tracing.delete_output_files("csv") + state.tracing.delete_output_files("txt") + state.tracing.delete_output_files("yaml") + state.tracing.delete_output_files("omx") - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() + return state def close_handlers(): @@ -77,15 +76,3 @@ def close_handlers(): logger.handlers = [] logger.propagate = True logger.setLevel(logging.NOTSET) - - -def inject_settings(**kwargs): - - settings = config.read_settings_file("settings.yaml", mandatory=True) - - for k in kwargs: - settings[k] = kwargs[k] - - inject.add_injectable("settings", settings) - - return settings diff --git a/activitysim/abm/test/test_misc/test_load_cached_accessibility.py b/activitysim/abm/test/test_misc/test_load_cached_accessibility.py index 721b441d4e..b19d136463 100644 --- a/activitysim/abm/test/test_misc/test_load_cached_accessibility.py +++ b/activitysim/abm/test/test_misc/test_load_cached_accessibility.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging import os @@ -12,9 +14,9 @@ import pytest import yaml -from activitysim.core import config, inject, pipeline, random, tracing +from activitysim.core import config, configuration, random, tracing, workflow -from .setup_utils import inject_settings, setup_dirs +from .setup_utils import setup_dirs # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 50 @@ -35,11 +37,6 @@ def example_path(dirname): return pkg_resources.resource_filename("activitysim", resource) -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() - - def close_handlers(): loggers = logging.Logger.manager.loggerDict for name in loggers: @@ -50,30 +47,27 @@ def close_handlers(): def test_load_cached_accessibility(): - - inject.clear_cache() - inject.reinject_decorated_tables() - data_dir = [os.path.join(os.path.dirname(__file__), "data"), example_path("data")] - setup_dirs(data_dir=data_dir) + state = setup_dirs(data_dir=data_dir) # # add OPTIONAL ceched table accessibility to input_table_list # activitysim.abm.tables.land_use.accessibility() will load this table if listed here # presumably independently calculated outside activitysim or a cached copy created during a previous run # - settings = config.read_settings_file("settings.yaml", mandatory=True) - input_table_list = settings.get("input_table_list") + settings = state.settings + input_table_list = settings.input_table_list input_table_list.append( - { - "tablename": "accessibility", - "filename": "cached_accessibility.csv", - "index_col": "zone_id", - } - ) - inject_settings( - households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, input_table_list=input_table_list + configuration.InputTable.parse_obj( + { + "tablename": "accessibility", + "filename": "cached_accessibility.csv", + "index_col": "zone_id", + } + ) ) + state.settings.households_sample_size = HOUSEHOLDS_SAMPLE_SIZE + state.settings.input_table_list = input_table_list _MODELS = [ "initialize_landuse", @@ -82,15 +76,13 @@ def test_load_cached_accessibility(): ] try: - pipeline.run(models=_MODELS, resume_after=None) + state.run(models=_MODELS, resume_after=None) - accessibility_df = pipeline.get_table("accessibility") + accessibility_df = state.checkpoint.load_dataframe("accessibility") assert "auPkRetail" in accessibility_df finally: - pipeline.close_pipeline() - inject.clear_cache() close_handlers() diff --git a/activitysim/abm/test/test_misc/test_misc.py b/activitysim/abm/test/test_misc/test_misc.py index f4daf7250c..2c47f6e878 100644 --- a/activitysim/abm/test/test_misc/test_misc.py +++ b/activitysim/abm/test/test_misc/test_misc.py @@ -2,38 +2,22 @@ # See full license in LICENSE.txt. import os -import pytest - -from activitysim.core import inject - # The following import statement has the side-effect of registering injectables: -from .. 
import __init__ +import activitysim.abm # noqa: F401 +from activitysim.core import configuration, workflow def test_misc(): - - inject.clear_cache() - - with pytest.raises(RuntimeError) as excinfo: - inject.get_injectable("configs_dir") - assert "directory does not exist" in str(excinfo.value) - - with pytest.raises(RuntimeError) as excinfo: - inject.get_injectable("data_dir") - assert "directory does not exist" in str(excinfo.value) - - with pytest.raises(RuntimeError) as excinfo: - inject.get_injectable("output_dir") - assert "directory does not exist" in str(excinfo.value) - configs_dir = os.path.join(os.path.dirname(__file__), "configs_test_misc") - inject.add_injectable("configs_dir", configs_dir) + data_dir = os.path.join(os.path.dirname(__file__), "data") - settings = inject.get_injectable("settings") - assert isinstance(settings, dict) + state = workflow.State().initialize_filesystem( + configs_dir=configs_dir, + data_dir=data_dir, + ) - data_dir = os.path.join(os.path.dirname(__file__), "data") - inject.add_injectable("data_dir", data_dir) + state.load_settings() + assert isinstance(state.settings, configuration.Settings) # default values if not specified in settings - assert inject.get_injectable("chunk_size") == 0 + assert state.settings.chunk_size == 0 diff --git a/activitysim/abm/test/test_misc/test_summarize.py b/activitysim/abm/test/test_misc/test_summarize.py deleted file mode 100644 index 6f76e1f2c1..0000000000 --- a/activitysim/abm/test/test_misc/test_summarize.py +++ /dev/null @@ -1,79 +0,0 @@ -import logging -import os - -import pandas as pd -import pytest - -# import models is necessary to initalize the model steps with orca -from activitysim.abm import models -from activitysim.core import config, pipeline - - -# Used by conftest.py initialize_pipeline method -@pytest.fixture(scope="module") -def module() -> str: - """ - A pytest fixture that returns the data folder location. - :return: folder location for any necessary data to initialize the tests - """ - return "summarize" - - -# Used by conftest.py initialize_pipeline method -@pytest.fixture(scope="module") -def tables() -> dict[str, str]: - """ - A pytest fixture that returns the "mock" tables to build pipeline dataframes. The - key-value pair is the name of the table and the index column. - :return: dict - """ - return { - "land_use": "zone_id", - "tours": "tour_id", - "trips": "trip_id", - "persons": "person_id", - "households": "household_id", - } - - -# Used by conftest.py initialize_pipeline method -# Set to true if you need to read skims into the pipeline -@pytest.fixture(scope="module") -def initialize_network_los() -> bool: - """ - A pytest boolean fixture indicating whether network skims should be read from the - fixtures test data folder. 
- :return: bool - """ - return True - - -def test_summarize(initialize_pipeline: pipeline.Pipeline, caplog): - # Run summarize model - caplog.set_level(logging.DEBUG) - pipeline.run(models=["summarize"]) - - # Retrieve output tables to check contents - model_settings = config.read_model_settings("summarize.yaml") - output_location = ( - model_settings["OUTPUT"] if "OUTPUT" in model_settings else "summaries" - ) - output_dir = config.output_file_path(output_location) - - # Check that households are counted correctly - households_count = pd.read_csv( - config.output_file_path(os.path.join(output_location, f"households_count.csv")) - ) - households = pd.read_csv(config.data_file_path("households.csv")) - assert int(households_count.iloc[0]) == len(households) - - # Check that bike trips are counted correctly - trips_by_mode_count = pd.read_csv( - config.output_file_path( - os.path.join(output_location, f"trips_by_mode_count.csv") - ) - ) - trips = pd.read_csv(config.data_file_path("trips.csv")) - assert int(trips_by_mode_count.BIKE.iloc[0]) == len( - trips[trips.trip_mode == "BIKE"] - ) diff --git a/activitysim/abm/test/test_misc/test_trip_departure_choice.py b/activitysim/abm/test/test_misc/test_trip_departure_choice.py index f36cf6df20..6d462c0bd0 100644 --- a/activitysim/abm/test/test_misc/test_trip_departure_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_departure_choice.py @@ -1,150 +1,152 @@ -import numpy as np -import pandas as pd -import pytest - -import activitysim.abm.models.trip_departure_choice as tdc -from activitysim.abm.models.util.trip import get_time_windows -from activitysim.core import los - -from .setup_utils import setup_dirs - - -@pytest.fixture(scope="module") -def trips(): - outbound_array = [True, True, False, False, False, True, True, False, False, True] - - trips = pd.DataFrame( - data={ - "tour_id": [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], - "trip_duration": [2, 2, 7, 7, 7, 12, 12, 4, 4, 5], - "inbound_duration": [0, 0, 7, 7, 7, 0, 0, 4, 4, 5], - "main_leg_duration": [4, 4, 2, 2, 2, 2, 2, 1, 1, 2], - "outbound_duration": [2, 2, 0, 0, 0, 12, 12, 0, 0, 5], - "trip_count": [2, 2, 3, 3, 3, 2, 2, 2, 2, 1], - "trip_num": [1, 2, 1, 2, 3, 1, 2, 1, 2, 1], - "outbound": outbound_array, - "chunk_id": [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], - "is_work": [ - True, - True, - False, - False, - False, - False, - False, - False, - False, - True, - ], - "is_school": [ - False, - False, - False, - False, - False, - False, - False, - True, - True, - False, - ], - "is_eatout": [ - False, - False, - True, - True, - True, - True, - True, - False, - False, - False, - ], - "start": [8, 8, 18, 18, 18, 18, 18, 24, 24, 19], - "end": [14, 14, 39, 39, 39, 39, 39, 29, 29, 26], - "origin": [3, 5, 15, 12, 24, 8, 17, 8, 9, 6], - "destination": [5, 9, 12, 24, 20, 17, 18, 9, 11, 14], - }, - index=range(10), - ) - - trips.index.name = "trip_id" - return trips - - -@pytest.fixture(scope="module") -def settings(): - return { - "skims_file": "skims.omx", - "skim_time_periods": {"labels": ["EA", "AM", "MD", "PM", "NT"]}, - } - - -@pytest.fixture(scope="module") -def model_spec(): - index = [ - "@(df['stop_time_duration'] * df['is_work'].astype(int)).astype(int)", - "@(df['stop_time_duration'] * df['is_school'].astype(int)).astype(int)", - "@(df['stop_time_duration'] * df['is_eatout'].astype(int)).astype(int)", - ] - - values = { - "inbound": [0.933020, 0.370260, 0.994840], - "outbound": [0.933020, 0.370260, 0.994840], - } - - return pd.DataFrame(index=index, data=values) - - -def test_build_patterns(trips): - 
time_windows = get_time_windows(48, 3) - patterns = tdc.build_patterns(trips, time_windows) - patterns = patterns.sort_values(["tour_id", "outbound", "trip_num"]) - - assert patterns.shape[0] == 34 - assert patterns.shape[1] == 6 - assert patterns.index.name == tdc.TOUR_LEG_ID - - output_columns = [ - tdc.TOUR_ID, - tdc.PATTERN_ID, - tdc.TRIP_NUM, - tdc.STOP_TIME_DURATION, - tdc.TOUR_ID, - tdc.OUTBOUND, - ] - - assert set(output_columns).issubset(patterns.columns) - - -def test_get_tour_legs(trips): - tour_legs = tdc.get_tour_legs(trips) - assert tour_legs.index.name == tdc.TOUR_LEG_ID - assert ( - np.unique(tour_legs[tdc.TOUR_ID].values).shape[0] - == np.unique(trips[tdc.TOUR_ID].values).shape[0] - ) - - -def test_generate_alternative(trips): - alts = tdc.generate_alternatives(trips, tdc.STOP_TIME_DURATION) - assert alts.shape[0] == 67 - assert alts.shape[1] == 1 - - assert alts.index.name == tdc.TRIP_ID - assert alts.columns[0] == tdc.STOP_TIME_DURATION - - pd.testing.assert_series_equal( - trips.groupby(trips.index)["trip_duration"].max(), - alts.groupby(alts.index)[tdc.STOP_TIME_DURATION].max(), - check_names=False, - ) - - -def test_apply_stage_two_model(model_spec, trips): - setup_dirs() - departures = tdc.apply_stage_two_model(model_spec, trips, 0, "TEST Trip Departure") - assert len(departures) == len(trips) - pd.testing.assert_index_equal(departures.index, trips.index) - - departures = pd.concat([trips, departures], axis=1) +# import numpy as np +# import pandas as pd +# import pytest +# +# import activitysim.abm.models.trip_departure_choice as tdc +# from activitysim.abm.models.util.trip import get_time_windows +# from activitysim.core import los +# +# from .setup_utils import setup_dirs +# +# +# @pytest.fixture(scope="module") +# def trips(): +# outbound_array = [True, True, False, False, False, True, True, False, False, True] +# +# trips = pd.DataFrame( +# data={ +# "tour_id": [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], +# "trip_duration": [2, 2, 7, 7, 7, 12, 12, 4, 4, 5], +# "inbound_duration": [0, 0, 7, 7, 7, 0, 0, 4, 4, 5], +# "main_leg_duration": [4, 4, 2, 2, 2, 2, 2, 1, 1, 2], +# "outbound_duration": [2, 2, 0, 0, 0, 12, 12, 0, 0, 5], +# "trip_count": [2, 2, 3, 3, 3, 2, 2, 2, 2, 1], +# "trip_num": [1, 2, 1, 2, 3, 1, 2, 1, 2, 1], +# "outbound": outbound_array, +# "chunk_id": [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], +# "is_work": [ +# True, +# True, +# False, +# False, +# False, +# False, +# False, +# False, +# False, +# True, +# ], +# "is_school": [ +# False, +# False, +# False, +# False, +# False, +# False, +# False, +# True, +# True, +# False, +# ], +# "is_eatout": [ +# False, +# False, +# True, +# True, +# True, +# True, +# True, +# False, +# False, +# False, +# ], +# "start": [8, 8, 18, 18, 18, 18, 18, 24, 24, 19], +# "end": [14, 14, 39, 39, 39, 39, 39, 29, 29, 26], +# "origin": [3, 5, 15, 12, 24, 8, 17, 8, 9, 6], +# "destination": [5, 9, 12, 24, 20, 17, 18, 9, 11, 14], +# }, +# index=range(10), +# ) +# +# trips.index.name = "trip_id" +# return trips +# +# +# @pytest.fixture(scope="module") +# def settings(): +# return { +# "skims_file": "skims.omx", +# "skim_time_periods": {"labels": ["EA", "AM", "MD", "PM", "NT"]}, +# } +# +# +# @pytest.fixture(scope="module") +# def model_spec(): +# index = [ +# "@(df['stop_time_duration'] * df['is_work'].astype(int)).astype(int)", +# "@(df['stop_time_duration'] * df['is_school'].astype(int)).astype(int)", +# "@(df['stop_time_duration'] * df['is_eatout'].astype(int)).astype(int)", +# ] +# +# values = { +# "inbound": [0.933020, 0.370260, 0.994840], +# 
"outbound": [0.933020, 0.370260, 0.994840], +# } +# +# return pd.DataFrame(index=index, data=values) +# +# +# def test_build_patterns(trips): +# time_windows = get_time_windows(48, 3) +# patterns = tdc.build_patterns(trips, time_windows) +# patterns = patterns.sort_values(["tour_id", "outbound", "trip_num"]) +# +# assert patterns.shape[0] == 34 +# assert patterns.shape[1] == 6 +# assert patterns.index.name == tdc.TOUR_LEG_ID +# +# output_columns = [ +# tdc.TOUR_ID, +# tdc.PATTERN_ID, +# tdc.TRIP_NUM, +# tdc.STOP_TIME_DURATION, +# tdc.TOUR_ID, +# tdc.OUTBOUND, +# ] +# +# assert set(output_columns).issubset(patterns.columns) +# +# +# def test_get_tour_legs(trips): +# tour_legs = tdc.get_tour_legs(trips) +# assert tour_legs.index.name == tdc.TOUR_LEG_ID +# assert ( +# np.unique(tour_legs[tdc.TOUR_ID].values).shape[0] +# == np.unique(trips[tdc.TOUR_ID].values).shape[0] +# ) +# +# +# def test_generate_alternative(trips): +# alts = tdc.generate_alternatives(trips, tdc.STOP_TIME_DURATION) +# assert alts.shape[0] == 67 +# assert alts.shape[1] == 1 +# +# assert alts.index.name == tdc.TRIP_ID +# assert alts.columns[0] == tdc.STOP_TIME_DURATION +# +# pd.testing.assert_series_equal( +# trips.groupby(trips.index)["trip_duration"].max(), +# alts.groupby(alts.index)[tdc.STOP_TIME_DURATION].max(), +# check_names=False, +# ) +# +# +# def test_apply_stage_two_model(state, model_spec, trips): +# setup_dirs() +# departures = tdc.apply_stage_two_model( +# state, model_spec, trips, 0, "TEST Trip Departure" +# ) +# assert len(departures) == len(trips) +# pd.testing.assert_index_equal(departures.index, trips.index) +# +# departures = pd.concat([trips, departures], axis=1) diff --git a/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py b/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py index a3f6ffdd95..24fdebde3e 100644 --- a/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py @@ -1,192 +1,199 @@ -import numpy as np -import pandas as pd -import pytest - -from activitysim.abm.models import trip_scheduling_choice as tsc -from activitysim.abm.tables.skims import skim_dict -from activitysim.core import los - -from .setup_utils import setup_dirs - - -@pytest.fixture(scope="module") -def tours(): - tours = pd.DataFrame( - data={ - "duration": [2, 44, 32, 12, 11, 16], - "num_outbound_stops": [2, 4, 0, 0, 1, 3], - "num_inbound_stops": [1, 0, 0, 2, 1, 2], - "tour_type": ["othdisc"] * 2 + ["eatout"] * 4, - "origin": [3, 10, 15, 23, 5, 8], - "destination": [5, 9, 12, 24, 20, 17], - tsc.LAST_OB_STOP: [1, 3, 0, 0, 12, 14], - tsc.FIRST_IB_STOP: [2, 0, 0, 4, 6, 20], - }, - index=range(6), - ) - - tours.index.name = "tour_id" - - tours[tsc.HAS_OB_STOPS] = tours[tsc.NUM_OB_STOPS] >= 1 - tours[tsc.HAS_IB_STOPS] = tours[tsc.NUM_IB_STOPS] >= 1 - - return tours - - -@pytest.fixture(scope="module") -def settings(): - return {"skims_file": "skims.omx", "skim_time_periods": {"labels": ["MD"]}} - - -@pytest.fixture(scope="module") -def model_spec(): - index = [ - "@(df['main_leg_duration']>df['duration']).astype(int)", - "@(df['main_leg_duration'] == 0)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 1)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 2)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 3)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 4)&(df['tour_type']=='othdiscr')", - "@df['tour_type']=='othdiscr'", - "@df['tour_type']=='eatout'", - 
"@df['tour_type']=='eatout'", - ] - - values = [ - -999, - -6.5884, - -5.0326, - -2.0526, - -1.0313, - -0.46489, - 0.060382, - -0.7508, - 0.53247, - ] - - return pd.DataFrame(index=index, data=values, columns=["stage_one"]) - - -@pytest.fixture(scope="module") -def skims(settings): - setup_dirs() - nw_los = los.Network_LOS() - nw_los.load_data() - skim_d = skim_dict(nw_los) - - od_skim_stack_wrapper = skim_d.wrap("origin", "destination") - do_skim_stack_wrapper = skim_d.wrap("destination", "origin") - obib_skim_stack_wrapper = skim_d.wrap(tsc.LAST_OB_STOP, tsc.FIRST_IB_STOP) - - skims = [od_skim_stack_wrapper, do_skim_stack_wrapper, obib_skim_stack_wrapper] - - return skims - - -@pytest.fixture(scope="module") -def locals_dict(skims): - return {"od_skims": skims[0], "do_skims": skims[1], "obib_skims": skims[2]} - - -def test_generate_schedule_alternatives(tours): - windows = tsc.generate_schedule_alternatives(tours) - assert windows.shape[0] == 296 - assert windows.shape[1] == 4 - - output_columns = [ - tsc.SCHEDULE_ID, - tsc.MAIN_LEG_DURATION, - tsc.OB_DURATION, - tsc.IB_DURATION, - ] - - assert set(output_columns).issubset(windows.columns) - - -def test_no_stops_patterns(tours): - no_stops = tours[ - (tours["num_outbound_stops"] == 0) & (tours["num_inbound_stops"] == 0) - ].copy() - windows = tsc.no_stops_patterns(no_stops) - - assert windows.shape[0] == 1 - assert windows.shape[1] == 3 - - output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] - - assert set(output_columns).issubset(windows.columns) - - pd.testing.assert_series_equal( - windows[tsc.MAIN_LEG_DURATION], - no_stops["duration"], - check_names=False, - check_dtype=False, - ) - assert windows[windows[tsc.IB_DURATION] > 0].empty - assert windows[windows[tsc.OB_DURATION] > 0].empty - - -def test_one_way_stop_patterns(tours): - one_way_stops = tours[ - ( - (tours["num_outbound_stops"] > 0).astype(int) - + (tours["num_inbound_stops"] > 0).astype(int) - ) - == 1 - ].copy() - windows = tsc.stop_one_way_only_patterns(one_way_stops) - - assert windows.shape[0] == 58 - assert windows.shape[1] == 3 - - output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] - - assert set(output_columns).issubset(windows.columns) - - inbound_options = windows[(windows[tsc.IB_DURATION] > 0)] - outbound_options = windows[windows[tsc.OB_DURATION] > 0] - assert np.unique(inbound_options.index).shape[0] == 1 - assert np.unique(outbound_options.index).shape[0] == 1 - - -def test_two_way_stop_patterns(tours): - two_way_stops = tours[ - ( - (tours["num_outbound_stops"] > 0).astype(int) - + (tours["num_inbound_stops"] > 0).astype(int) - ) - == 2 - ].copy() - windows = tsc.stop_two_way_only_patterns(two_way_stops) - - assert windows.shape[0] == 237 - assert windows.shape[1] == 3 - - output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] - - assert set(output_columns).issubset(windows.columns) - - -def test_run_trip_scheduling_choice(model_spec, tours, skims, locals_dict): - """ - Test run the model. 
- """ - - out_tours = tsc.run_trip_scheduling_choice( - model_spec, tours, skims, locals_dict, 2, None, "PyTest Trip Scheduling" - ) - - assert len(tours) == len(out_tours) - pd.testing.assert_index_equal( - tours.sort_index().index, out_tours.sort_index().index - ) - - output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] - - assert set(output_columns).issubset(out_tours.columns) - - assert len( - out_tours[ - out_tours[output_columns].sum(axis=1) == out_tours[tsc.TOUR_DURATION_COLUMN] - ] - ) == len(tours) +# import numpy as np +# import pandas as pd +# import pytest +# +# from activitysim.abm.models import trip_scheduling_choice as tsc +# from activitysim.abm.tables.skims import skim_dict +# from activitysim.core import los, workflow +# +# from .setup_utils import setup_dirs +# +# +# @pytest.fixture(scope="module") +# def tours(): +# tours = pd.DataFrame( +# data={ +# "duration": [2, 44, 32, 12, 11, 16], +# "num_outbound_stops": [2, 4, 0, 0, 1, 3], +# "num_inbound_stops": [1, 0, 0, 2, 1, 2], +# "tour_type": ["othdisc"] * 2 + ["eatout"] * 4, +# "origin": [3, 10, 15, 23, 5, 8], +# "destination": [5, 9, 12, 24, 20, 17], +# tsc.LAST_OB_STOP: [1, 3, 0, 0, 12, 14], +# tsc.FIRST_IB_STOP: [2, 0, 0, 4, 6, 20], +# }, +# index=range(6), +# ) +# +# tours.index.name = "tour_id" +# +# tours[tsc.HAS_OB_STOPS] = tours[tsc.NUM_OB_STOPS] >= 1 +# tours[tsc.HAS_IB_STOPS] = tours[tsc.NUM_IB_STOPS] >= 1 +# +# return tours +# +# +# @pytest.fixture(scope="module") +# def settings(): +# return {"skims_file": "skims.omx", "skim_time_periods": {"labels": ["MD"]}} +# +# +# @pytest.fixture(scope="module") +# def model_spec(): +# index = [ +# "@(df['main_leg_duration']>df['duration']).astype(int)", +# "@(df['main_leg_duration'] == 0)&(df['tour_type']=='othdiscr')", +# "@(df['main_leg_duration'] == 1)&(df['tour_type']=='othdiscr')", +# "@(df['main_leg_duration'] == 2)&(df['tour_type']=='othdiscr')", +# "@(df['main_leg_duration'] == 3)&(df['tour_type']=='othdiscr')", +# "@(df['main_leg_duration'] == 4)&(df['tour_type']=='othdiscr')", +# "@df['tour_type']=='othdiscr'", +# "@df['tour_type']=='eatout'", +# "@df['tour_type']=='eatout'", +# ] +# +# values = [ +# -999, +# -6.5884, +# -5.0326, +# -2.0526, +# -1.0313, +# -0.46489, +# 0.060382, +# -0.7508, +# 0.53247, +# ] +# +# return pd.DataFrame(index=index, data=values, columns=["stage_one"]) +# +# +# @pytest.fixture(scope="module") +# def skims(settings): +# setup_dirs() +# nw_los = los.Network_LOS() +# nw_los.load_data() +# skim_d = skim_dict(nw_los) +# +# od_skim_stack_wrapper = skim_d.wrap("origin", "destination") +# do_skim_stack_wrapper = skim_d.wrap("destination", "origin") +# obib_skim_stack_wrapper = skim_d.wrap(tsc.LAST_OB_STOP, tsc.FIRST_IB_STOP) +# +# skims = [od_skim_stack_wrapper, do_skim_stack_wrapper, obib_skim_stack_wrapper] +# +# return skims +# +# +# @pytest.fixture(scope="module") +# def locals_dict(skims): +# return {"od_skims": skims[0], "do_skims": skims[1], "obib_skims": skims[2]} +# +# +# def test_generate_schedule_alternatives(tours): +# windows = tsc.generate_schedule_alternatives(tours) +# assert windows.shape[0] == 296 +# assert windows.shape[1] == 4 +# +# output_columns = [ +# tsc.SCHEDULE_ID, +# tsc.MAIN_LEG_DURATION, +# tsc.OB_DURATION, +# tsc.IB_DURATION, +# ] +# +# assert set(output_columns).issubset(windows.columns) +# +# +# def test_no_stops_patterns(tours): +# no_stops = tours[ +# (tours["num_outbound_stops"] == 0) & (tours["num_inbound_stops"] == 0) +# ].copy() +# windows = tsc.no_stops_patterns(no_stops) +# +# 
assert windows.shape[0] == 1 +# assert windows.shape[1] == 3 +# +# output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] +# +# assert set(output_columns).issubset(windows.columns) +# +# pd.testing.assert_series_equal( +# windows[tsc.MAIN_LEG_DURATION], +# no_stops["duration"], +# check_names=False, +# check_dtype=False, +# ) +# assert windows[windows[tsc.IB_DURATION] > 0].empty +# assert windows[windows[tsc.OB_DURATION] > 0].empty +# +# +# def test_one_way_stop_patterns(tours): +# one_way_stops = tours[ +# ( +# (tours["num_outbound_stops"] > 0).astype(int) +# + (tours["num_inbound_stops"] > 0).astype(int) +# ) +# == 1 +# ].copy() +# windows = tsc.stop_one_way_only_patterns(one_way_stops) +# +# assert windows.shape[0] == 58 +# assert windows.shape[1] == 3 +# +# output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] +# +# assert set(output_columns).issubset(windows.columns) +# +# inbound_options = windows[(windows[tsc.IB_DURATION] > 0)] +# outbound_options = windows[windows[tsc.OB_DURATION] > 0] +# assert np.unique(inbound_options.index).shape[0] == 1 +# assert np.unique(outbound_options.index).shape[0] == 1 +# +# +# def test_two_way_stop_patterns(tours): +# two_way_stops = tours[ +# ( +# (tours["num_outbound_stops"] > 0).astype(int) +# + (tours["num_inbound_stops"] > 0).astype(int) +# ) +# == 2 +# ].copy() +# windows = tsc.stop_two_way_only_patterns(two_way_stops) +# +# assert windows.shape[0] == 237 +# assert windows.shape[1] == 3 +# +# output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] +# +# assert set(output_columns).issubset(windows.columns) +# +# +# def test_run_trip_scheduling_choice( +# state: workflow.State, model_spec, tours, skims, locals_dict +# ): +# """ +# Test run the model. +# """ +# +# out_tours = tsc.run_trip_scheduling_choice( +# state, +# model_spec, +# tours, +# skims, +# locals_dict, +# trace_label="PyTest Trip Scheduling", +# ) +# +# assert len(tours) == len(out_tours) +# pd.testing.assert_index_equal( +# tours.sort_index().index, out_tours.sort_index().index +# ) +# +# output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] +# +# assert set(output_columns).issubset(out_tours.columns) +# +# assert len( +# out_tours[ +# out_tours[output_columns].sum(axis=1) == out_tours[tsc.TOUR_DURATION_COLUMN] +# ] +# ) == len(tours) diff --git a/activitysim/abm/test/test_pipeline/test_pipeline.py b/activitysim/abm/test/test_pipeline/test_pipeline.py index 0bb09a4cc1..6afb7cb8f5 100644 --- a/activitysim/abm/test/test_pipeline/test_pipeline.py +++ b/activitysim/abm/test/test_pipeline/test_pipeline.py @@ -1,18 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. 
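For readers skimming the hunk above, the commented-out test preserves the new calling convention for tsc.run_trip_scheduling_choice: the workflow.State is now the first positional argument and the trace label becomes a keyword, while the old positional chunk-size and chunk-tag arguments are dropped. A minimal sketch of the two calls, taken from the removed and the commented-out versions of the test above (model_spec, tours, skims, locals_dict, and state are the fixtures defined in that file):

    # old call, as removed above
    out_tours = tsc.run_trip_scheduling_choice(
        model_spec, tours, skims, locals_dict, 2, None, "PyTest Trip Scheduling"
    )

    # new call, as recorded in the commented-out test
    out_tours = tsc.run_trip_scheduling_choice(
        state,
        model_spec,
        tours,
        skims,
        locals_dict,
        trace_label="PyTest Trip Scheduling",
    )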
+from __future__ import annotations + import logging import os import numpy as np -import numpy.testing as npt import openmatrix as omx import pandas as pd import pandas.testing as pdt import pkg_resources import pytest -import yaml -from activitysim.core import config, inject, pipeline, random, tracing +from activitysim.core import random, tracing, workflow # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 50 @@ -44,29 +44,25 @@ def setup_dirs(ancillary_configs_dir=None, data_dir=None): if ancillary_configs_dir is not None: configs_dir = [ancillary_configs_dir] + configs_dir - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) if not data_dir: data_dir = example_path("data") - inject.add_injectable("data_dir", data_dir) - - inject.clear_cache() - - tracing.config_logger() + state = workflow.State.make_default( + configs_dir=configs_dir, + output_dir=output_dir, + data_dir=data_dir, + ) - tracing.delete_output_files("csv") - tracing.delete_output_files("txt") - tracing.delete_output_files("yaml") - tracing.delete_output_files("omx") + state.logging.config_logger() + state.tracing.delete_output_files("csv") + state.tracing.delete_output_files("txt") + state.tracing.delete_output_files("yaml") + state.tracing.delete_output_files("omx") -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() + return state def close_handlers(): @@ -79,35 +75,21 @@ def close_handlers(): logger.setLevel(logging.NOTSET) -def inject_settings(**kwargs): - - settings = config.read_settings_file("settings.yaml", mandatory=True) - - for k in kwargs: - settings[k] = kwargs[k] - - inject.add_injectable("settings", settings) - - return settings - - def test_rng_access(): - setup_dirs() + state = setup_dirs() + state.settings.rng_base_seed = 0 - inject.add_injectable("rng_base_seed", 0) + state.checkpoint.restore() - pipeline.open_pipeline() - - rng = pipeline.get_rn_generator() + rng = state.get_rn_generator() assert isinstance(rng, random.Random) - pipeline.close_pipeline() - inject.clear_cache() + state.checkpoint.close_store() -def regress_mini_auto(): +def regress_mini_auto(state: workflow.State): # regression test: these are among the middle households in households table # should be the same results as in run_mp (multiprocessing) test case @@ -117,7 +99,9 @@ def regress_mini_auto(): choices, index=pd.Index(hh_ids, name="household_id"), name="auto_ownership" ) - auto_choice = pipeline.get_table("households").sort_index().auto_ownership + auto_choice = ( + state.checkpoint.load_dataframe("households").sort_index().auto_ownership + ) offset = ( HOUSEHOLDS_SAMPLE_SIZE // 2 @@ -138,9 +122,11 @@ def regress_mini_auto(): pdt.assert_series_equal(auto_choice, expected_choice, check_dtype=False) -def regress_mini_mtf(): +def regress_mini_mtf(state: workflow.State): - mtf_choice = pipeline.get_table("persons").sort_index().mandatory_tour_frequency + mtf_choice = ( + state.checkpoint.load_dataframe("persons").sort_index().mandatory_tour_frequency + ) # these choices are for pure regression - their appropriateness has not been checked per_ids = [2566701, 2566702, 3061895] @@ -169,9 +155,9 @@ def regress_mini_mtf(): ) -def regress_mini_location_choice_logsums(): +def regress_mini_location_choice_logsums(state: workflow.State): - persons = pipeline.get_table("persons") + persons = state.checkpoint.load_dataframe("persons") # 
DEST_CHOICE_LOGSUM_COLUMN_NAME is specified in school_location.yaml and should be assigned assert "school_location_logsum" in persons @@ -183,11 +169,13 @@ def regress_mini_location_choice_logsums(): def test_mini_pipeline_run(): - setup_dirs() + from activitysim.abm.tables.skims import network_los_preload - inject_settings( - households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, write_skim_cache=True - ) + state = setup_dirs() + state.get(network_los_preload) + + state.settings.households_sample_size = HOUSEHOLDS_SAMPLE_SIZE + state.network_settings.write_skim_cache = True _MODELS = [ "initialize_landuse", @@ -198,32 +186,33 @@ def test_mini_pipeline_run(): "auto_ownership_simulate", ] - pipeline.run(models=_MODELS, resume_after=None) + state.run(models=_MODELS, resume_after=None) - regress_mini_auto() + regress_mini_auto(state) - pipeline.run_model("cdap_simulate") - pipeline.run_model("mandatory_tour_frequency") + state.run.by_name("cdap_simulate") + state.run.by_name("mandatory_tour_frequency") - regress_mini_mtf() - regress_mini_location_choice_logsums() + regress_mini_mtf(state) + regress_mini_location_choice_logsums(state) # try to get a non-existant table with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("bogus") + state.checkpoint.load_dataframe("bogus") assert "never checkpointed" in str(excinfo.value) # try to get an existing table from a non-existant checkpoint with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("households", checkpoint_name="bogus") + state.checkpoint.load_dataframe("households", checkpoint_name="bogus") assert "not in checkpoints" in str(excinfo.value) # should create optional workplace_location_sample table - workplace_location_sample_df = pipeline.get_table("workplace_location_sample") + workplace_location_sample_df = state.checkpoint.load_dataframe( + "workplace_location_sample" + ) assert "mode_choice_logsum" in workplace_location_sample_df - pipeline.close_pipeline() - inject.clear_cache() + state.checkpoint.close_store() close_handlers() @@ -233,46 +222,50 @@ def test_mini_pipeline_run2(): # exactly the same results as for test_mini_pipeline_run # when we restart pipeline - setup_dirs() + state = setup_dirs() + from activitysim.abm.tables.skims import network_los_preload + + state.get(network_los_preload) - inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, read_skim_cache=True) + state.settings.households_sample_size = HOUSEHOLDS_SAMPLE_SIZE + state.network_settings.read_skim_cache = True # should be able to get this BEFORE pipeline is opened - checkpoints_df = pipeline.get_checkpoints() + checkpoints_df = state.checkpoint.get_inventory() prev_checkpoint_count = len(checkpoints_df.index) - # print "checkpoints_df\n%s" % checkpoints_df[['checkpoint_name']] - assert prev_checkpoint_count == 9 + assert "auto_ownership_simulate" in checkpoints_df.checkpoint_name.values + assert "cdap_simulate" in checkpoints_df.checkpoint_name.values + assert "mandatory_tour_frequency" in checkpoints_df.checkpoint_name.values - pipeline.open_pipeline("auto_ownership_simulate") + state.checkpoint.restore("auto_ownership_simulate") - regress_mini_auto() + regress_mini_auto(state) # try to run a model already in pipeline with pytest.raises(RuntimeError) as excinfo: - pipeline.run_model("auto_ownership_simulate") + state.run.by_name("auto_ownership_simulate") assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value) # and these new ones - pipeline.run_model("cdap_simulate") - 
pipeline.run_model("mandatory_tour_frequency") + state.run.by_name("cdap_simulate") + state.run.by_name("mandatory_tour_frequency") - regress_mini_mtf() + regress_mini_mtf(state) # should be able to get this before pipeline is closed (from existing open store) - checkpoints_df = pipeline.get_checkpoints() + checkpoints_df = state.checkpoint.get_inventory() assert len(checkpoints_df.index) == prev_checkpoint_count # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test num_hh_ids = 10 - hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values + hh_ids = state.checkpoint.load_dataframe("households").head(num_hh_ids).index.values hh_ids = pd.DataFrame({"household_id": hh_ids}) - hh_ids_path = config.data_file_path("override_hh_ids.csv") + hh_ids_path = state.filesystem.get_data_file_path("override_hh_ids.csv") hh_ids.to_csv(hh_ids_path, index=False, header=True) - pipeline.close_pipeline() - inject.clear_cache() + state.checkpoint.close_store() close_handlers() @@ -280,12 +273,14 @@ def test_mini_pipeline_run3(): # test that hh_ids setting overrides household sampling - setup_dirs() - inject_settings(hh_ids="override_hh_ids.csv") + state = setup_dirs() + state.settings.hh_ids = "override_hh_ids.csv" - households = inject.get_table("households").to_frame() + households = state.get_dataframe("households") - override_hh_ids = pd.read_csv(config.data_file_path("override_hh_ids.csv")) + override_hh_ids = pd.read_csv( + state.filesystem.get_data_file_path("override_hh_ids.csv") + ) print("\noverride_hh_ids\n%s" % override_hh_ids) @@ -294,7 +289,6 @@ def test_mini_pipeline_run3(): assert households.shape[0] == override_hh_ids.shape[0] assert households.index.isin(override_hh_ids.household_id).all() - inject.clear_cache() close_handlers() @@ -304,52 +298,30 @@ def full_run( households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, trace_hh_id=None, trace_od=None, - check_for_variability=None, + check_for_variability=False, ): - setup_dirs() + state = setup_dirs() - settings = inject_settings( - households_sample_size=households_sample_size, - chunk_size=chunk_size, - trace_hh_id=trace_hh_id, - trace_od=trace_od, - testing_fail_trip_destination=False, - check_for_variability=check_for_variability, - want_dest_choice_sample_tables=False, - use_shadow_pricing=False, - ) # shadow pricing breaks replicability when sample_size varies + state.settings.households_sample_size = households_sample_size + state.settings.chunk_size = chunk_size + state.settings.trace_hh_id = trace_hh_id + state.settings.trace_od = trace_od + state.settings.testing_fail_trip_destination = False + state.settings.check_for_variability = check_for_variability + state.settings.want_dest_choice_sample_tables = False + state.settings.use_shadow_pricing = False # FIXME should enable testing_fail_trip_destination? 
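The full_run changes above illustrate the settings-override pattern that recurs throughout this test module: the removed inject_settings() helper, which patched a settings dict injectable, is replaced by assigning typed attributes directly on state.settings. A condensed sketch of the before and after, using the first few settings from this hunk (setup_dirs is the helper defined earlier in the file, which now returns the workflow.State):

    # removed pattern: patch the "settings" injectable with a dict of overrides
    settings = inject_settings(
        households_sample_size=households_sample_size,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
    )

    # added pattern: set typed attributes on the State's settings object
    state = setup_dirs()
    state.settings.households_sample_size = households_sample_size
    state.settings.chunk_size = chunk_size
    state.settings.trace_hh_id = trace_hh_id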
- MODELS = settings["models"] + MODELS = state.settings.models - pipeline.run(models=MODELS, resume_after=resume_after) + state.run(models=MODELS, resume_after=resume_after) - tours = pipeline.get_table("tours") + tours = state.checkpoint.load_dataframe("tours") tour_count = len(tours.index) - return tour_count - - -def get_trace_csv(file_name): - - file_name = config.output_file_path(file_name) - df = pd.read_csv(file_name) - - # label value_1 value_2 value_3 value_4 - # 0 tour_id 38 201 39 40 - # 1 mode DRIVE_LOC DRIVE_COM DRIVE_LOC DRIVE_LOC - # 2 person_id 1888694 1888695 1888695 1888696 - # 3 tour_type work othmaint work school - # 4 tour_num 1 1 1 1 - - # transpose df and rename columns - labels = df.label.values - df = df.transpose()[1:] - df.columns = labels - - return df + return state, tour_count EXPECT_TOUR_COUNT = 121 @@ -401,9 +373,9 @@ def regress_tour_modes(tours_df): assert (tours_df.tour_mode.values == EXPECT_MODES).all() -def regress(): +def regress(state: workflow.State): - persons_df = pipeline.get_table("persons") + persons_df = state.checkpoint.load_dataframe("persons") persons_df = persons_df[persons_df.household_id == HH_ID] print("persons_df\n%s" % persons_df[["value_of_time", "distance_to_work"]]) @@ -415,7 +387,7 @@ def regress(): 3249923 23.349532 0.62 """ - tours_df = pipeline.get_table("tours") + tours_df = state.checkpoint.load_dataframe("tours") regress_tour_modes(tours_df) @@ -443,7 +415,7 @@ def regress(): assert "mode_choice_logsum" in tours_df assert not tours_df.mode_choice_logsum.isnull().any() - trips_df = pipeline.get_table("trips") + trips_df = state.checkpoint.load_dataframe("trips") assert trips_df.shape[0] > 0 assert not trips_df.purpose.isnull().any() assert not trips_df.depart.isnull().any() @@ -456,7 +428,7 @@ def regress(): assert trips_df.shape[0] >= 2 * tours_df.shape[0] # write_trip_matrices - trip_matrices_file = config.output_file_path("trips_md.omx") + trip_matrices_file = state.get_output_file_path("trips_md.omx") assert os.path.exists(trip_matrices_file) trip_matrices = omx.open_file(trip_matrices_file) assert trip_matrices.shape() == (25, 25) @@ -473,7 +445,7 @@ def test_full_run1(): if SKIP_FULL_RUN: return - tour_count = full_run( + state, tour_count = full_run( trace_hh_id=HH_ID, check_for_variability=True, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, @@ -485,9 +457,9 @@ def test_full_run1(): tour_count == EXPECT_TOUR_COUNT ), "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) - regress() + regress(state) - pipeline.close_pipeline() + state.checkpoint.close_store() def test_full_run2(): @@ -497,7 +469,7 @@ def test_full_run2(): if SKIP_FULL_RUN: return - tour_count = full_run( + state, tour_count = full_run( resume_after="non_mandatory_tour_scheduling", trace_hh_id=HH_ID ) @@ -505,9 +477,9 @@ def test_full_run2(): tour_count == EXPECT_TOUR_COUNT ), "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) - regress() + regress(state) - pipeline.close_pipeline() + state.checkpoint.close_store() def test_full_run3_with_chunks(): @@ -517,7 +489,7 @@ def test_full_run3_with_chunks(): if SKIP_FULL_RUN: return - tour_count = full_run( + state, tour_count = full_run( trace_hh_id=HH_ID, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, chunk_size=500000, @@ -527,9 +499,9 @@ def test_full_run3_with_chunks(): tour_count == EXPECT_TOUR_COUNT ), "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) - regress() + regress(state) - pipeline.close_pipeline() + 
state.checkpoint.close_store() def test_full_run4_stability(): @@ -539,13 +511,13 @@ def test_full_run4_stability(): if SKIP_FULL_RUN: return - tour_count = full_run( + state, tour_count = full_run( trace_hh_id=HH_ID, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE - 10 ) - regress() + regress(state) - pipeline.close_pipeline() + state.checkpoint.close_store() def test_full_run5_singleton(): @@ -557,17 +529,17 @@ def test_full_run5_singleton(): if SKIP_FULL_RUN: return - tour_count = full_run(trace_hh_id=HH_ID, households_sample_size=1, chunk_size=1) + state, tour_count = full_run( + trace_hh_id=HH_ID, households_sample_size=1, chunk_size=1 + ) - regress() + regress(state) - pipeline.close_pipeline() + state.checkpoint.close_store() if __name__ == "__main__": - from activitysim import abm # register injectables - print("running test_full_run1") test_full_run1() # teardown_function(None) diff --git a/activitysim/benchmarking/benchmarks/mtc1mp4.py b/activitysim/benchmarking/benchmarks/mtc1mp4.py index 8fe8dce187..9e9573d044 100644 --- a/activitysim/benchmarking/benchmarks/mtc1mp4.py +++ b/activitysim/benchmarking/benchmarks/mtc1mp4.py @@ -73,6 +73,7 @@ def setup_cache(): template_component_timings_mp( + state, globals(), COMPONENT_NAMES, EXAMPLE_NAME, diff --git a/activitysim/benchmarking/componentwise.py b/activitysim/benchmarking/componentwise.py index bb3c909b08..ea34d8153c 100644 --- a/activitysim/benchmarking/componentwise.py +++ b/activitysim/benchmarking/componentwise.py @@ -1,36 +1,38 @@ +from __future__ import annotations + import glob import logging import logging.handlers import os import traceback +from pathlib import Path import numpy as np import pandas as pd import yaml -from ..cli.create import get_example -from ..cli.run import INJECTABLES, config, pipeline -from ..core import inject, tracing -from ..core.pipeline import open_pipeline, run_model -from . 
import workspace +from activitysim.benchmarking import workspace +from activitysim.cli.create import get_example +from activitysim.cli.run import INJECTABLES, config +from activitysim.core import tracing, workflow logger = logging.getLogger(__name__) def reload_settings(settings_filename, **kwargs): - settings = config.read_settings_file(settings_filename, mandatory=True) + settings = state.filesystem.read_settings_file(settings_filename, mandatory=True) for k in kwargs: settings[k] = kwargs[k] - inject.add_injectable("settings", settings) + state.add_injectable("settings", settings) return settings -def component_logging(component_name): +def component_logging(state: workflow.State, component_name): root_logger = logging.getLogger() CLOG_FMT = "%(asctime)s %(levelname)7s - %(name)s: %(message)s" - logfilename = config.log_file_path(f"asv-{component_name}.log") + logfilename = state.get_log_file_path(f"asv-{component_name}.log") # avoid creation of multiple file handlers for logging components # as we will re-enter this function for every component run @@ -40,7 +42,7 @@ def component_logging(component_name): ): return - tracing.config_logger(basic=True) + state.logging.config_logger(basic=True) file_handler = logging.handlers.RotatingFileHandler( filename=logfilename, mode="a", @@ -56,6 +58,7 @@ def component_logging(component_name): def setup_component( + state, component_name, working_dir=".", preload_injectables=(), @@ -75,11 +78,11 @@ def setup_component( """ if isinstance(configs_dirs, str): configs_dirs = [configs_dirs] - inject.add_injectable( + state.add_injectable( "configs_dir", [os.path.join(working_dir, i) for i in configs_dirs] ) - inject.add_injectable("data_dir", os.path.join(working_dir, data_dir)) - inject.add_injectable("output_dir", os.path.join(working_dir, output_dir)) + state.add_injectable("data_dir", os.path.join(working_dir, data_dir)) + state.add_injectable("output_dir", os.path.join(working_dir, output_dir)) reload_settings( settings_filename, @@ -88,22 +91,22 @@ def setup_component( **other_settings, ) - component_logging(component_name) + component_logging(state, component_name) logger.info("connected to component logger") config.filter_warnings() logging.captureWarnings(capture=True) # register abm steps and other abm-specific injectables outside of # benchmark timing loop - if not inject.is_injectable("preload_injectables"): + if "preload_injectables" not in state.context: logger.info("preload_injectables yes import") - from activitysim import abm + from activitysim import abm # noqa: F401 else: logger.info("preload_injectables no import") # Extract the resume_after argument based on the model immediately # prior to the component being benchmarked. - models = config.setting("models") + models = state.settings.models try: component_index = models.index(component_name) except ValueError: @@ -115,7 +118,7 @@ def setup_component( else: resume_after = None - if config.setting("multiprocess", False): + if state.settings.multiprocess: raise NotImplementedError( "multiprocess component benchmarking is not yet implemented" ) @@ -124,15 +127,15 @@ def setup_component( # components. Instead, those benchmarks are generated in # aggregate during setup and then extracted from logs later. 
else: - open_pipeline(resume_after, mode="r") + state.checkpoint.restore(resume_after, mode="r") for k in preload_injectables: - if inject.get_injectable(k, None) is not None: + if state.get_injectable(k, None) is not None: logger.info("pre-loaded %s", k) # Directories Logging for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: - logger.info(f"DIRECTORY {k}: {inject.get_injectable(k, None)}") + logger.info(f"DIRECTORY {k}: {state.get_injectable(k, None)}") # Settings Logging log_settings = [ @@ -154,10 +157,10 @@ def setup_component( logger.info("setup_component completed: %s", component_name) -def run_component(component_name): +def run_component(state, component_name): logger.info("run_component: %s", component_name) try: - if config.setting("multiprocess", False): + if state.settings.multiprocess: raise NotImplementedError( "multiprocess component benchmarking is not yet implemented" ) @@ -166,7 +169,7 @@ def run_component(component_name): # components. Instead, those benchmarks are generated in # aggregate during setup and then extracted from logs later. else: - run_model(component_name) + state.run.by_name(component_name) except Exception as err: logger.exception("run_component exception: %s", component_name) raise @@ -175,21 +178,21 @@ def run_component(component_name): return 0 -def teardown_component(component_name): +def teardown_component(state, component_name): logger.info("teardown_component: %s", component_name) # use the pipeline module to clear out all the orca tables, so # the next benchmark run has a clean slate. # anything needed should be reloaded from the pipeline checkpoint file - pipeline_tables = pipeline.registered_tables() + pipeline_tables = state.registered_tables() for table_name in pipeline_tables: logger.info("dropping table %s", table_name) - pipeline.drop_table(table_name) + state.drop_table(table_name) - if config.setting("multiprocess", False): + if state.settings.multiprocess: raise NotImplementedError("multiprocess benchmarking is not yet implemented") else: - pipeline.close_pipeline() + state.checkpoint.close_store() logger.critical( "teardown_component completed: %s\n\n%s\n\n", component_name, "~" * 88 ) @@ -197,6 +200,7 @@ def teardown_component(component_name): def pre_run( + state, model_working_dir, configs_dirs=None, data_dir="data", @@ -229,40 +233,40 @@ def pre_run( for a model run. 
""" if configs_dirs is None: - inject.add_injectable("configs_dir", os.path.join(model_working_dir, "configs")) + state.add_injectable("configs_dir", os.path.join(model_working_dir, "configs")) else: configs_dirs_ = [os.path.join(model_working_dir, i) for i in configs_dirs] - inject.add_injectable("configs_dir", configs_dirs_) - inject.add_injectable("data_dir", os.path.join(model_working_dir, data_dir)) - inject.add_injectable("output_dir", os.path.join(model_working_dir, output_dir)) + state.add_injectable("configs_dir", configs_dirs_) + state.add_injectable("data_dir", os.path.join(model_working_dir, data_dir)) + state.add_injectable("output_dir", os.path.join(model_working_dir, output_dir)) if settings_file_name is not None: - inject.add_injectable("settings_file_name", settings_file_name) + state.add_injectable("settings_file_name", settings_file_name) # Always pre_run from the beginning config.override_setting("resume_after", None) # register abm steps and other abm-specific injectables - if not inject.is_injectable("preload_injectables"): - from activitysim import ( # register abm steps and other abm-specific injectables - abm, - ) + if "preload_injectables" not in state.context: + from activitysim import abm # noqa: F401 + + # register abm steps and other abm-specific injectables if settings_file_name is not None: - inject.add_injectable("settings_file_name", settings_file_name) + state.add_injectable("settings_file_name", settings_file_name) # cleanup # cleanup_output_files() - tracing.config_logger(basic=False) + state.logging.config_logger(basic=False) config.filter_warnings() logging.captureWarnings(capture=True) # directories for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: - logger.info("SETTING %s: %s" % (k, inject.get_injectable(k, None))) + logger.info("SETTING %s: %s" % (k, state.get_injectable(k, None))) - log_settings = inject.get_injectable("log_settings", {}) + log_settings = state.get_injectable("log_settings", {}) for k in log_settings: logger.info("SETTING %s: %s" % (k, config.setting(k))) @@ -297,37 +301,37 @@ def pre_run( logger.info(f"MODELS: {config.setting('models')}") - if config.setting("multiprocess", False): + if state.settings.multiprocess: logger.info("run multi-process complete simulation") else: logger.info("run single process simulation") - pipeline.run(models=config.setting("models")) - pipeline.close_pipeline() + state.run(models=state.settings.models) + state.checkpoint.close_store() tracing.print_elapsed_time("prerun required models for checkpointing", t0) return 0 -def run_multiprocess(): +def run_multiprocess(state: workflow.State): logger.info("run multiprocess simulation") - tracing.delete_trace_files() - tracing.delete_output_files("h5") - tracing.delete_output_files("csv") - tracing.delete_output_files("txt") - tracing.delete_output_files("yaml") - tracing.delete_output_files("prof") - tracing.delete_output_files("omx") + state.tracing.delete_trace_files() + state.tracing.delete_output_files("h5") + state.tracing.delete_output_files("csv") + state.tracing.delete_output_files("txt") + state.tracing.delete_output_files("yaml") + state.tracing.delete_output_files("prof") + state.tracing.delete_output_files("omx") from activitysim.core import mp_tasks - injectables = {k: inject.get_injectable(k) for k in INJECTABLES} - mp_tasks.run_multiprocess(injectables) + injectables = {k: state.get_injectable(k) for k in INJECTABLES} + mp_tasks.run_multiprocess(state, injectables) - assert not pipeline.is_open() - - if 
config.setting("cleanup_pipeline_after_run", False): - pipeline.cleanup_pipeline() + # assert not pipeline.is_open() + # + # if state.settings.cleanup_pipeline_after_run: + # pipeline.cleanup_pipeline() ######## @@ -408,10 +412,8 @@ def template_setup_cache( os.makedirs(model_dir(example_name, config_overload_dir), exist_ok=True) # Find the settings file and extract the complete set of models included - from ..core.config import read_settings_file - try: - existing_settings, settings_filenames = read_settings_file( + existing_settings, settings_filenames = state.filesystem.read_settings_file( settings_filename, mandatory=True, include_stack=True, @@ -490,6 +492,8 @@ def template_setup_cache( os.makedirs(model_dir(example_name, output_dir), exist_ok=True) + state = workflow.State.make_default(Path(model_dir(example_name))) + # Running the model through all the steps and checkpointing everywhere is # expensive and only needs to be run once. Once it is done we will write # out a completion token file to indicate to future benchmark attempts @@ -502,6 +506,7 @@ def template_setup_cache( if not os.path.exists(token_file) and not use_multiprocess: try: pre_run( + state, model_dir(example_name), use_config_dirs, data_dir, @@ -530,13 +535,14 @@ def template_setup_cache( asv_commit = os.environ.get("ASV_COMMIT", "ASV_COMMIT_UNKNOWN") try: pre_run( + state, model_dir(example_name), use_config_dirs, data_dir, output_dir, settings_filename, ) - run_multiprocess() + run_multiprocess(state) except Exception as err: with open( model_dir( @@ -644,6 +650,7 @@ def time_component(self): def template_component_timings_mp( + state: workflow.State, module_globals, component_names, example_name, @@ -685,8 +692,8 @@ class ComponentTiming: def track_component(self): durations = [] - inject.add_injectable("output_dir", model_dir(example_name, output_dir)) - logfiler = config.log_file_path(f"timing_log.mp_households_*.csv") + state.add_injectable("output_dir", model_dir(example_name, output_dir)) + logfiler = state.get_log_file_path(f"timing_log.mp_households_*.csv") for logfile in glob.glob(logfiler): df = pd.read_csv(logfile) dfq = df.query(f"component_name=='{self.component_name}'") diff --git a/activitysim/cli/cli.py b/activitysim/cli/cli.py index de95e78ffa..5a9f5e6ec6 100644 --- a/activitysim/cli/cli.py +++ b/activitysim/cli/cli.py @@ -1,5 +1,15 @@ import argparse +from activitysim import __version__ + +text_art = """ + ___ __ _ _ __ _____ + / _ |____/ /_(_) __(_) /___ __/ __(_)_ _ + / __ / __/ __/ / |/ / / __/ // /\ \/ / ' \ {} +/_/ |_\__/\__/_/|___/_/\__/\_, /___/_/_/_/_/ + /___/ +""" + class CLI: def __init__(self, version, description): @@ -24,5 +34,6 @@ def add_subcommand(self, name, args_func, exec_func, description): subparser.set_defaults(afunc=exec_func) def execute(self): + print(text_art.format(__version__)) args = self.parser.parse_args() return args.afunc(args) diff --git a/activitysim/cli/create.py b/activitysim/cli/create.py index 7cd8d218c5..7211471d77 100644 --- a/activitysim/cli/create.py +++ b/activitysim/cli/create.py @@ -1,9 +1,13 @@ +from __future__ import annotations + import glob import hashlib import logging import os import shutil import sys +import tarfile +import zipfile from pathlib import Path import pkg_resources @@ -72,7 +76,6 @@ def create(args): """ if args.list: - list_examples() return 0 @@ -98,7 +101,12 @@ def list_examples(): def get_example( - example_name, destination, benchmarking=False, optimize=True, link=True + example_name, + destination, + benchmarking=False, + 
optimize=True, + link=True, + with_subdirs=False, ): """ Copy project data to user-specified directory. @@ -110,13 +118,12 @@ def get_example( Parameters ---------- - example_name: str, name of the example to copy. Options can be found via list_examples() destination: name of target directory to copy files to. - If the target directory already exists, project files - will be copied into a subdirectory with the same name - as the example + If the target directory does not exist, it is created. + Project files will then be copied into a subdirectory + with the same name as the example benchmarking: bool optimize: bool link: bool or path-like @@ -125,14 +132,25 @@ def get_example( value, then a cache directory is created using in a location selected by the platformdirs library (or, if not installed, linking is skipped.) + with_subdirs: bool, default False + Also return any instructions about sub-directories. + + Returns + ------- + Path or (Path, dict) + The path to the location where the example was installed, and + optionally also a mapping of example subdirectory locations. """ if example_name not in EXAMPLES: sys.exit(f"error: could not find example '{example_name}'") if os.path.isdir(destination): dest_path = os.path.join(destination, example_name) + elif os.path.isfile(destination): + raise FileExistsError(destination) else: - dest_path = destination + os.makedirs(destination) + dest_path = os.path.join(destination, example_name) example = EXAMPLES[example_name] itemlist = example.get("include", []) @@ -140,7 +158,6 @@ def get_example( itemlist.extend(example.get("benchmarking", [])) for item in itemlist: - # split include string into source/destination paths items = item.split() assets = items[0] @@ -155,7 +172,9 @@ def get_example( sha256 = None if assets.startswith("http"): - download_asset(assets, target_path, sha256, link=link) + download_asset( + assets, target_path, sha256, link=link, base_path=destination + ) else: for asset_path in glob.glob(_example_path(assets)): @@ -180,9 +199,18 @@ def get_example( if instructions: print(instructions) + if with_subdirs: + subdirs = example.get("subdirs", {}) + subdirs.setdefault("configs_dir", ("configs",)) + subdirs.setdefault("data_dir", ("data",)) + subdirs.setdefault("output_dir", "output") -def copy_asset(asset_path, target_path, dirs_exist_ok=False): + return Path(dest_path), subdirs + else: + return Path(dest_path) + +def copy_asset(asset_path, target_path, dirs_exist_ok=False): print(f"copying {os.path.basename(asset_path)} ...") sys.stdout.flush() if os.path.isdir(asset_path): @@ -196,16 +224,98 @@ def copy_asset(asset_path, target_path, dirs_exist_ok=False): shutil.copy(asset_path, target_path) -def download_asset(url, target_path, sha256=None, link=True): - if link: - if not isinstance(link, (str, Path)): +def _decompress_archive(archive_path: Path, target_location: Path): + # decompress archive file into working directory + if archive_path.suffixes[-2:] == [".tar", ".gz"]: + with tarfile.open(archive_path) as tfile: + common_prefix = os.path.commonprefix(tfile.getnames()) + if common_prefix in {"", ".", "./", None}: + working_dir = target_location + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + else: + working_subdir = target_location.joinpath(common_prefix) + tfile.extractall(working_dir) + elif archive_path.suffixes[-2:] == [".tar", ".zst"]: + working_dir = target_location + try: + working_dir.mkdir(parents=True, exist_ok=True) + except FileExistsError: + pass + working_subdir = working_dir + 
from sharrow.utils.tar_zst import extract_zst + + extract_zst(archive_path, working_dir) + elif archive_path.suffix == ".zip": + with zipfile.ZipFile(archive_path, "r") as zf: + common_prefix = os.path.commonprefix(zf.namelist()) + if common_prefix in {"", ".", "./", None}: + working_dir = target_location + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + else: + working_subdir = target_location.joinpath(common_prefix) + zf.extractall(working_dir) + else: + raise ValueError(f"unknown archive file type {''.join(archive_path.suffixes)}") + return working_subdir + + +def download_asset( + url: str, + target_path: str, + sha256: str = None, + link: bool = True, + base_path: str | None = None, + unpack: str | None = None, +): + """ + Download assets (extra files) associated with examples. + + Parameters + ---------- + url : str + The URL to download. + target_path : str + The location where the asset should be made available. The raw asset + file is not necessarily stored here, as it may be stored in a cache + directory and symlinked here instead (see `link`). + sha256 : str, optional + Checksum for the file. If there is already a cached file and the + checksum matches, it is not re-downloaded and the cached version is + used. Otherwise, the file is downloaded, and if the downloaded file's + checksum does not match, an error is raised. + link : bool, default True + Download the raw asset to a cache location, and then symlink to the + desired `target_path` location. Note symlinks may not work on Windows + so the file will still be stored in the cache but it will be *copied* + instead of linked. + base_path : str, optional + Give the base directory for the example. + unpack : str, optional + If the asset is an archive file (.zip, .tar.gz, or .tar.zst), it + will be decompressed into this location. 
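+
+    Examples
+    --------
+    Illustrative sketch only; the URL and paths are placeholders::
+
+        download_asset(
+            "https://example.com/assets/data.tar.zst",
+            target_path="data/data.tar.zst",
+            base_path="my_example",
+            unpack="data",
+        )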
+ """ + if isinstance(target_path, Path): + target_path = str(target_path) + original_target_path = target_path + if link or unpack: + original_target_path = target_path + if base_path is not None and os.path.isabs(target_path): + target_path = os.path.relpath(target_path, base_path) + if base_path is not None: + if unpack: + if os.path.isabs(unpack): + unpack = os.path.relpath(unpack, base_path) + else: + unpack = os.path.join(base_path, unpack) + if not isinstance(link, str | Path): try: import platformdirs except ImportError: link = False else: link = platformdirs.user_data_dir("ActivitySim") - original_target_path = target_path target_path = os.path.join(link, target_path) os.makedirs(os.path.dirname(target_path), exist_ok=True) if url.endswith(".gz") and not target_path.endswith(".gz"): @@ -249,11 +359,10 @@ def download_asset(url, target_path, sha256=None, link=True): ) elif not sha256: print(f"| computed checksum {computed_sha256}") - if link: - os.makedirs( - os.path.dirname(os.path.normpath(original_target_path)), - exist_ok=True, - ) + if link or unpack: + target_dir = os.path.dirname(os.path.normpath(original_target_path)) + if target_dir: + os.makedirs(target_dir, exist_ok=True) # check if the original_target_path exists and if so check if it is the correct file if os.path.isfile(os.path.normpath(original_target_path)): @@ -263,22 +372,29 @@ def download_asset(url, target_path, sha256=None, link=True): if existing_sha256 != sha256: os.unlink(os.path.normpath(original_target_path)) - # if the original_target_path exists now it is the correct file, keep it - if not os.path.isfile(os.path.normpath(original_target_path)): - try: - os.symlink( - os.path.normpath(target_path), - os.path.normpath(original_target_path), - ) - except OSError: - # permission errors likely foil symlinking on windows - shutil.copy( - os.path.normpath(target_path), - os.path.normpath(original_target_path), - ) - print(f"| copied to {os.path.normpath(original_target_path)}") - else: - print(f"| symlinked to {os.path.normpath(original_target_path)}") + if unpack: + _decompress_archive( + Path(os.path.normpath(target_path)), + Path(os.path.normpath(unpack)), + ) + print(f"| unpacked to {os.path.normpath(unpack)}") + elif link: + # if the original_target_path exists now it is the correct file, keep it + if not os.path.isfile(os.path.normpath(original_target_path)): + try: + os.symlink( + os.path.normpath(target_path), + os.path.normpath(original_target_path), + ) + except OSError: + # permission errors likely foil symlinking on windows + shutil.copy( + os.path.normpath(target_path), + os.path.normpath(original_target_path), + ) + print(f"| copied to {os.path.normpath(original_target_path)}") + else: + print(f"| symlinked to {os.path.normpath(original_target_path)}") def sha256_checksum(filename, block_size=65536): diff --git a/activitysim/cli/exercise.py b/activitysim/cli/exercise.py new file mode 100644 index 0000000000..72bb1cf65b --- /dev/null +++ b/activitysim/cli/exercise.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import os +import traceback + + +def add_exercise_args(parser): + """Create command args""" + parser.add_argument( + "example_name", + type=str, + metavar="EXAMPLE_NAME", + help="name of registered external example to test", + ) + + +def main(args): + """ + Run tests on a registered external example. 
+ """ + example_name = args.example_name + try: + resultcode = _main(example_name) + except Exception: + traceback.print_exc() + return 99 + return resultcode + + +def _main(example_name: str): + if not example_name: + print("no example_name given") + return 101 + + import tempfile + + from activitysim.examples.external import exercise_external_example + + tempdir = tempfile.TemporaryDirectory() + os.chdir(tempdir.name) + try: + resultcode = exercise_external_example(example_name, tempdir.name) + except Exception: + print(f"error in testing {example_name} in {tempdir.name}") + traceback.print_exc() + return 98 + return resultcode diff --git a/activitysim/cli/main.py b/activitysim/cli/main.py index e4b37ba120..423dd6a3ca 100644 --- a/activitysim/cli/main.py +++ b/activitysim/cli/main.py @@ -5,7 +5,7 @@ def prog(): from activitysim import __doc__, __version__, workflows - from activitysim.cli import CLI, benchmark, create, run + from activitysim.cli import CLI, benchmark, create, exercise, run asim = CLI(version=__version__, description=__doc__) asim.add_subcommand( @@ -32,6 +32,12 @@ def prog(): exec_func=workflows.main, description=workflows.main.__doc__, ) + asim.add_subcommand( + name="test", + args_func=exercise.add_exercise_args, + exec_func=exercise.main, + description=exercise.main.__doc__, + ) return asim diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index 71677e8ad8..49467bb8e7 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -1,3 +1,5 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. import argparse @@ -9,7 +11,8 @@ import numpy as np -from activitysim.core import chunk, config, inject, mem, pipeline, tracing +from activitysim.core import chunk, config, mem, tracing, workflow +from activitysim.core.configuration import FileSystem, Settings logger = logging.getLogger(__name__) @@ -78,6 +81,12 @@ def add_run_args(parser, multiprocess=True): "Can make single process runs faster, " "but will cause thrashing on MP runs.", ) + parser.add_argument( + "--persist-sharrow-cache", + action="store_true", + help="Store the sharrow cache in a persistent user cache directory.", + ) + parser.add_argument( "-e", "--ext", @@ -97,15 +106,17 @@ def add_run_args(parser, multiprocess=True): metavar="(N)", nargs="?", type=int, - help="run multiprocess. Adds configs_mp settings" - " by default. Optionally give a number of processes," - " which will override the settings file.", + help="run multiprocess. Adds configs_mp settings " + "by default as the first config directory, but only if it is found" + "and is not already explicitly included elsewhere in the list of " + "configs. Optionally give a number of processes greater than 1, " + "which will override the number of processes written in settings file.", ) -def validate_injectable(name): +def validate_injectable(state: workflow.State, name, make_if_missing=False): try: - dir_paths = inject.get_injectable(name) + dir_paths = state.get(name) except RuntimeError: # injectable is missing, meaning is hasn't been explicitly set # and defaults cannot be found. 
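The new activitysim/cli/exercise.py shown above backs the "test" subcommand registered in cli/main.py: it switches into a temporary directory and exercises the named external example there. A minimal sketch of that flow, mirroring _main() above; the example name is a placeholder, since this change does not name a specific external example:

    import os
    import tempfile

    from activitysim.examples.external import exercise_external_example

    # roughly what "activitysim test SOME_EXAMPLE" does, per _main() above;
    # "SOME_EXAMPLE" stands in for a registered external example name
    tempdir = tempfile.TemporaryDirectory()
    os.chdir(tempdir.name)
    resultcode = exercise_external_example("SOME_EXAMPLE", tempdir.name)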
@@ -119,15 +130,18 @@ def validate_injectable(name): for dir_path in dir_paths: if not os.path.exists(dir_path): - sys.exit("Could not find %s '%s'" % (name, os.path.abspath(dir_path))) + if make_if_missing: + os.makedirs(dir_path) + else: + sys.exit("Could not find %s '%s'" % (name, os.path.abspath(dir_path))) return dir_paths -def handle_standard_args(args, multiprocess=True): - def inject_arg(name, value, cache=False): +def handle_standard_args(state: workflow.State, args, multiprocess=True): + def inject_arg(name, value): assert name in INJECTABLES - inject.add_injectable(name, value, cache=cache) + state.set(name, value) if args.working_dir: # activitysim will look in the current working directory for @@ -154,68 +168,79 @@ def inject_arg(name, value, cache=False): else: inject_arg("imported_extensions", ()) - # settings_file_name should be cached or else it gets squashed by config.py - if args.settings_file: - inject_arg("settings_file_name", args.settings_file, cache=True) + state.filesystem = FileSystem.parse_args(args) + for config_dir in state.filesystem.get_configs_dir(): + if not config_dir.is_dir(): + print(f"missing config directory: {config_dir}", file=sys.stderr) + raise NotADirectoryError(f"missing config directory: {config_dir}") + for data_dir in state.filesystem.get_data_dir(): + if not data_dir.is_dir(): + print(f"missing data directory: {data_dir}", file=sys.stderr) + raise NotADirectoryError(f"missing data directory: {data_dir}") - if args.config: - inject_arg("configs_dir", args.config) - - if args.data: - inject_arg("data_dir", args.data) - - if args.output: - inject_arg("output_dir", args.output) - - if multiprocess and args.multiprocess: - config_paths = validate_injectable("configs_dir") - - if not os.path.exists("configs_mp"): - logger.warning("could not find 'configs_mp'. skipping...") - else: - logger.info("adding 'configs_mp' to config_dir list...") - config_paths.insert(0, "configs_mp") - inject_arg("configs_dir", config_paths) + try: + state.load_settings() + except Exception as err: + logger.exception(f"Error {err} in loading settings") + raise - config.override_setting("multiprocess", True) - if args.multiprocess > 0: - config.override_setting("num_processes", args.multiprocess) + if args.multiprocess: + if "configs_mp" not in state.filesystem.configs_dir: + # when triggering multiprocessing from command arguments, + # add 'configs_mp' as the first config directory, but only + # if it exists, and it is not already explicitly included + # in the set of config directories. + if not state.filesystem.get_working_subdir("configs_mp").exists(): + logger.warning("could not find 'configs_mp'. 
skipping...") + else: + logger.info("adding 'configs_mp' to config_dir list...") + state.filesystem.configs_dir = ( + "configs_mp", + ) + state.filesystem.configs_dir + + state.settings.multiprocess = True + if args.multiprocess > 1: + # setting --multiprocess to just 1 implies using the number of + # processes discovered in the configs file, while setting to more + # than 1 explicitly overrides that setting + state.settings.num_processes = args.multiprocess if args.chunk_size: - config.override_setting("chunk_size", int(args.chunk_size)) + state.settings.chunk_size = int(args.chunk_size) if args.chunk_training_mode is not None: - config.override_setting("chunk_training_mode", args.chunk_training_mode) + state.settings.chunk_training_mode = args.chunk_training_mode if args.households_sample_size is not None: - config.override_setting("households_sample_size", args.households_sample_size) - - for injectable in ["configs_dir", "data_dir", "output_dir"]: - validate_injectable(injectable) + state.settings.households_sample_size = args.households_sample_size if args.pipeline: - inject.add_injectable("pipeline_file_name", args.pipeline) + state.filesystem.pipeline_file_name = args.pipeline if args.resume: - config.override_setting("resume_after", args.resume) + state.settings.resume_after = args.resume + + if args.persist_sharrow_cache: + state.filesystem.persist_sharrow_cache() + return state -def cleanup_output_files(): - tracing.delete_trace_files() +def cleanup_output_files(state: workflow.State): + tracing.delete_trace_files(state) csv_ignore = [] - if config.setting("memory_profile", False): + if state.settings.memory_profile: # memory profiling is opened potentially before `cleanup_output_files` # is called, but we want to leave any (newly created) memory profiling # log files that may have just been created. - mem_prof_log = config.log_file_path("memory_profile.csv") + mem_prof_log = state.get_log_file_path("memory_profile.csv") csv_ignore.append(mem_prof_log) - tracing.delete_output_files("h5") - tracing.delete_output_files("csv", ignore=csv_ignore) - tracing.delete_output_files("txt") - tracing.delete_output_files("yaml") - tracing.delete_output_files("prof") - tracing.delete_output_files("omx") + state.tracing.delete_output_files("h5") + state.tracing.delete_output_files("csv", ignore=csv_ignore) + state.tracing.delete_output_files("txt") + state.tracing.delete_output_files("yaml") + state.tracing.delete_output_files("prof") + state.tracing.delete_output_files("omx") def run(args): @@ -229,26 +254,26 @@ def run(args): int: sys.exit exit code """ + state = workflow.State() + # register abm steps and other abm-specific injectables # by default, assume we are running activitysim.abm # other callers (e.g. 
populationsim) will have to arrange to register their own steps and injectables # (presumably) in a custom run_simulation.py instead of using the 'activitysim run' command - if not inject.is_injectable("preload_injectables"): + if not "preload_injectables" in state: # register abm steps and other abm-specific injectables from activitysim import abm # noqa: F401 - tracing.config_logger(basic=True) - handle_standard_args(args) # possibly update injectables + state.logging.config_logger(basic=True) + state = handle_standard_args(state, args) # possibly update injectables - if config.setting("rotate_logs", False): - config.rotate_log_directory() + if state.settings.rotate_logs: + state.logging.rotate_log_directory() - if config.setting("memory_profile", False) and not config.setting( - "multiprocess", False - ): + if state.settings.memory_profile and not state.settings.multiprocess: # Memory sidecar is only useful for single process runs # multiprocess runs log memory usage without blocking in the controlling process. - mem_prof_log = config.log_file_path("memory_profile.csv") + mem_prof_log = state.get_log_file_path("memory_profile.csv") from ..core.memory_sidecar import MemorySidecar memory_sidecar_process = MemorySidecar(mem_prof_log) @@ -256,49 +281,51 @@ def run(args): memory_sidecar_process = None # legacy support for run_list setting nested 'models' and 'resume_after' settings - if config.setting("run_list"): - warnings.warn( - "Support for 'run_list' settings group will be removed.\n" - "The run_list.steps setting is renamed 'models'.\n" - "The run_list.resume_after setting is renamed 'resume_after'.\n" - "Specify both 'models' and 'resume_after' directly in settings config file.", - FutureWarning, - ) - run_list = config.setting("run_list") - if "steps" in run_list: - assert not config.setting( - "models" - ), f"Don't expect 'steps' in run_list and 'models' as stand-alone setting!" - config.override_setting("models", run_list["steps"]) - - if "resume_after" in run_list: - assert not config.setting( - "resume_after" - ), f"Don't expect 'resume_after' both in run_list and as stand-alone setting!" - config.override_setting("resume_after", run_list["resume_after"]) + # if state.settings.run_list: + # warnings.warn( + # "Support for 'run_list' settings group will be removed.\n" + # "The run_list.steps setting is renamed 'models'.\n" + # "The run_list.resume_after setting is renamed 'resume_after'.\n" + # "Specify both 'models' and 'resume_after' directly in settings config file.", + # FutureWarning, + # ) + # run_list = state.settings.run_list + # if "steps" in run_list: + # assert not config.setting( + # "models" + # ), f"Don't expect 'steps' in run_list and 'models' as stand-alone setting!" + # config.override_setting("models", run_list["steps"]) + # + # if "resume_after" in run_list: + # assert not config.setting( + # "resume_after" + # ), f"Don't expect 'resume_after' both in run_list and as stand-alone setting!" 
+ # config.override_setting("resume_after", run_list["resume_after"]) # If you provide a resume_after argument to pipeline.run # the pipeline manager will attempt to load checkpointed tables from the checkpoint store # and resume pipeline processing on the next submodel step after the specified checkpoint - resume_after = config.setting("resume_after", None) + resume_after = state.settings.resume_after # cleanup if not resuming if not resume_after: - cleanup_output_files() - elif config.setting("cleanup_trace_files_on_resume", False): - tracing.delete_trace_files() - - tracing.config_logger(basic=False) # update using possibly new logging configs - config.filter_warnings() + cleanup_output_files(state) + elif state.settings.cleanup_trace_files_on_resume: + tracing.delete_trace_files(state) + + state.logging.config_logger( + basic=False + ) # update using possibly new logging configs + config.filter_warnings(state) logging.captureWarnings(capture=True) # directories for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: - logger.info("SETTING %s: %s" % (k, inject.get_injectable(k, None))) + logger.info("SETTING %s: %s" % (k, getattr(state.filesystem, k, None))) - log_settings = inject.get_injectable("log_settings", {}) + log_settings = state.settings.log_settings for k in log_settings: - logger.info("SETTING %s: %s" % (k, config.setting(k))) + logger.info("SETTING %s: %s" % (k, getattr(state.settings, k, None))) # OMP_NUM_THREADS: openmp # OPENBLAS_NUM_THREADS: openblas @@ -335,32 +362,32 @@ def run(args): t0 = tracing.print_elapsed_time() try: - if config.setting("multiprocess", False): + if state.settings.multiprocess: logger.info("run multiprocess simulation") from activitysim.core import mp_tasks - injectables = {k: inject.get_injectable(k) for k in INJECTABLES} - mp_tasks.run_multiprocess(injectables) + injectables = {k: state.get_injectable(k) for k in INJECTABLES} + injectables["settings"] = state.settings + # injectables["settings_package"] = state.settings.dict() + mp_tasks.run_multiprocess(state, injectables) - assert not pipeline.is_open() - - if config.setting("cleanup_pipeline_after_run", False): - pipeline.cleanup_pipeline() + if state.settings.cleanup_pipeline_after_run: + state.checkpoint.cleanup() else: logger.info("run single process simulation") - pipeline.run( - models=config.setting("models"), + state.run( + models=state.settings.models, resume_after=resume_after, memory_sidecar_process=memory_sidecar_process, ) - if config.setting("cleanup_pipeline_after_run", False): - pipeline.cleanup_pipeline() # has side effect of closing open pipeline + if state.settings.cleanup_pipeline_after_run: + state.checkpoint.cleanup() # has side effect of closing open pipeline else: - pipeline.close_pipeline() + state.checkpoint.close_store() mem.log_global_hwm() # main process except Exception: @@ -369,10 +396,10 @@ def run(args): logger.exception("activitysim run encountered an unrecoverable error") raise - chunk.consolidate_logs() - mem.consolidate_logs() + chunk.consolidate_logs(state) + mem.consolidate_logs(state) - from ..core.flow import TimeLogger + from activitysim.core.flow import TimeLogger TimeLogger.aggregate_summary(logger) @@ -385,7 +412,6 @@ def run(args): if __name__ == "__main__": - from activitysim import abm # register injectables # noqa: F401 parser = argparse.ArgumentParser() diff --git a/activitysim/cli/test/test_cli.py b/activitysim/cli/test/test_cli.py index b8f0fc3842..9358fb7405 100644 --- a/activitysim/cli/test/test_cli.py +++ 
b/activitysim/cli/test/test_cli.py @@ -59,7 +59,7 @@ def test_create_copy(): assert os.path.exists(target) for folder in ["configs", "configs_mp", "data", "output"]: - assert os.path.isdir(os.path.join(target, folder)) + assert os.path.isdir(os.path.join(target, "prototype_mtc", folder)) # clean up shutil.rmtree(target) @@ -70,14 +70,8 @@ def test_run(): cp = subprocess.run(["activitysim", "run"], capture_output=True) - msg = ( - "please specify either a --working_dir " - "containing 'configs', 'data', and 'output' " - "folders or all three of --config, --data, and --output" - ) - # expect error - assert msg in str(cp.stderr) + assert "missing" in str(cp.stderr) if __name__ == "__main__": diff --git a/activitysim/core/assign.py b/activitysim/core/assign.py index f8e73adc6b..d504d63780 100644 --- a/activitysim/core/assign.py +++ b/activitysim/core/assign.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging from builtins import object, zip from collections import OrderedDict @@ -7,7 +9,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, config, pipeline, util +from activitysim.core import chunk, util, workflow logger = logging.getLogger(__name__) @@ -135,7 +137,7 @@ def write(self, msg): ) -def local_utilities(): +def local_utilities(state): """ Dict of useful modules and functions to provides as locals for use in eval of expressions @@ -150,12 +152,12 @@ def local_utilities(): "np": np, "reindex": util.reindex, "reindex_i": util.reindex_i, - "setting": config.setting, + "setting": lambda *arg: state.settings._get_attr(*arg), "other_than": util.other_than, - "rng": pipeline.get_rn_generator(), + "rng": state.get_rn_generator(), } - utility_dict.update(config.get_global_constants()) + utility_dict.update(state.get_global_constants()) return utility_dict @@ -173,6 +175,7 @@ def is_temp(target): def assign_variables( + state, assignment_expressions, df, locals_dict, @@ -218,8 +221,9 @@ def assign_variables( variables : pandas.DataFrame Will have the index of `df` and columns named by target and containing the result of evaluating expression - trace_df : pandas.DataFrame or None + trace_results : pandas.DataFrame or None a dataframe containing the eval result values for each assignment expression + trace_assigned_locals : dict or None """ np_logger = NumpyLogger(logger) @@ -234,7 +238,9 @@ def is_temp_series_val(target): return target.startswith("_") def to_series(x): - if x is None or np.isscalar(x): + if np.isscalar(x): + return pd.Series(x, index=df.index) + if x is None: return pd.Series([x] * len(df.index), index=df.index) return x @@ -250,7 +256,7 @@ def to_series(x): trace_assigned_locals = OrderedDict() # avoid touching caller's passed-in locals_d parameter (they may be looping) - _locals_dict = local_utilities() + _locals_dict = local_utilities(state) if locals_dict is not None: _locals_dict.update(locals_dict) if df_alias: @@ -276,10 +282,9 @@ def to_series(x): n_randoms += 1 assignment_expressions.loc[expression_idx, "expression"] = expression if n_randoms: - from activitysim.core import pipeline try: - random_draws = pipeline.get_rn_generator().normal_for_df( + random_draws = state.get_rn_generator().normal_for_df( df, broadcast=True, size=n_randoms ) except RuntimeError: @@ -297,7 +302,7 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): _locals_dict["rng_lognormal"] = rng_lognormal - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = 
state.settings.sharrow # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously @@ -343,7 +348,6 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): continue try: - # FIXME - log any numpy warnings/errors but don't raise np_logger.target = str(target) np_logger.expression = str(expression) @@ -396,7 +400,6 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): _locals_dict[target] = expr_values if trace_results is not None: - trace_results = pd.DataFrame.from_dict(trace_results) trace_results.index = df[trace_rows].index @@ -407,11 +410,11 @@ def rng_lognormal(random_draws, mu, sigma, broadcast=True, scale=False): assert variables, "No non-temp variables were assigned." if chunk_log: - chunk.log_df(trace_label, "temps", temps) - chunk.log_df(trace_label, "variables", variables) + chunk_log.log_df(trace_label, "temps", temps) + chunk_log.log_df(trace_label, "variables", variables) # these are going away - let caller log result df - chunk.log_df(trace_label, "temps", None) - chunk.log_df(trace_label, "variables", None) + chunk_log.log_df(trace_label, "temps", None) + chunk_log.log_df(trace_label, "variables", None) # we stored result in dict - convert to df variables = util.df_from_dict(variables, index=df.index) diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 6634b479d8..c0e65028dd 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -1,6 +1,8 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import datetime import glob import logging @@ -15,8 +17,8 @@ import pandas as pd import xarray as xr -from . import config, mem, tracing, util -from .util import GB +from activitysim.core import configuration, mem, tracing, util, workflow +from activitysim.core.util import GB logger = logging.getLogger(__name__) @@ -52,7 +54,7 @@ MODE_RETRAIN rebuild chunk_cache table and save/replace in output/cache/chunk_cache.csv preforms a complete rebuild of chunk_cache table by doing adaptive chunking starting with based on default initial - settings (DEFAULT_INITIAL_ROWS_PER_CHUNK) and observing rss, uss, and allocated bytes to compute rows_size. + settings (see configuration.settings) and observing rss, uss, and allocated bytes to compute rows_size. This will run somewhat slower than the other modes because of overhead of small first chunk, and possible instability in the second chunk due to inaccuracies caused by small initial chunk_size sample @@ -89,10 +91,6 @@ LOG_SUBCHUNK_HISTORY = False # only useful for debugging WRITE_SUBCHUNK_HISTORY = False # only useful for debugging -DEFAULT_INITIAL_ROWS_PER_CHUNK = ( - 100 # fallback for default_initial_rows_per_chunk setting -) - # # cache and history files @@ -121,35 +119,26 @@ # globals # -SETTINGS = {} -CHUNK_LEDGERS = [] -CHUNK_SIZERS = [] ledger_lock = threading.Lock() -def chunk_method(): - method = SETTINGS.get("chunk_method") - if method is None: - method = SETTINGS.setdefault( - "chunk_method", config.setting("chunk_method", DEFAULT_CHUNK_METHOD) - ) - assert ( - method in CHUNK_METHODS - ), f"chunk_method setting '{method}' not recognized. Should be one of: {CHUNK_METHODS}" +def chunk_method(state: workflow.State): + method = state.settings.chunk_method + # if method is None: + # method = SETTINGS.setdefault("chunk_method", state.settings.chunk_method) + # assert ( + # method in CHUNK_METHODS + # ), f"chunk_method setting '{method}' not recognized. 
Should be one of: {CHUNK_METHODS}" return method -def chunk_metric(): - return SETTINGS.setdefault( - "chunk_metric", USS if chunk_method() in USS_CHUNK_METHODS else "rss" - ) +def chunk_metric(state: workflow.State): + return USS if chunk_method(state) in USS_CHUNK_METHODS else "rss" -def chunk_training_mode(): - training_mode = SETTINGS.setdefault( - "chunk_training_mode", config.setting("chunk_training_mode", MODE_ADAPTIVE) - ) +def chunk_training_mode(state: workflow.State): + training_mode = state.settings.chunk_training_mode if not training_mode: training_mode = MODE_CHUNKLESS assert ( @@ -158,47 +147,31 @@ def chunk_training_mode(): return training_mode -def chunk_logging(): - return len(CHUNK_LEDGERS) > 0 - - -def default_initial_rows_per_chunk(): - return SETTINGS.setdefault( - "default_initial_rows_per_chunk", - config.setting( - "default_initial_rows_per_chunk", DEFAULT_INITIAL_ROWS_PER_CHUNK - ), - ) +def chunk_logging(state: workflow.State): + return len(state.chunk.CHUNK_LEDGERS) > 0 -def min_available_chunk_ratio(): - return SETTINGS.setdefault( - "min_available_chunk_ratio", config.setting("min_available_chunk_ratio", 0) - ) +def min_available_chunk_ratio(state: workflow.State): + return state.settings.min_available_chunk_ratio -def keep_chunk_logs(): - # if we are overwriting MEM_LOG_FILE then presumably we want to delete any subprocess files - default = LOG_FILE_NAME == OMNIBUS_LOG_FILE_NAME +def keep_chunk_logs(state: workflow.State): + return state.settings.keep_chunk_logs - return SETTINGS.setdefault( - "keep_chunk_logs", config.setting("keep_chunk_logs", default) - ) - -def trace_label_for_chunk(trace_label, chunk_size, i): +def trace_label_for_chunk(state: workflow.State, trace_label: str, chunk_size, i): # add chunk_num to trace_label # if chunk_size > 0: # trace_label = tracing.extend_trace_label(trace_label, f'chunk_{i}') return trace_label -def get_base_chunk_size(): - assert len(CHUNK_SIZERS) > 0 - return CHUNK_SIZERS[0].chunk_size +def get_base_chunk_size(state: workflow.State): + assert len(state.chunk.CHUNK_SIZERS) > 0 + return state.chunk.CHUNK_SIZERS[0].chunk_size -def overhead_for_chunk_method(overhead, method=None): +def overhead_for_chunk_method(state: workflow.State, overhead, method=None): """ return appropriate overhead for row_size calculation based on current chunk_method @@ -217,7 +190,6 @@ def overhead_for_chunk_method(overhead, method=None): """ def hybrid(xss, bytes): - # this avoids pessimistic underchunking on second chunk without pre-existing cache # but it tends to overshoot on a trained runs # hybrid_overhead = np.maximum(bytes, (xss + bytes) / 2) @@ -228,7 +200,7 @@ def hybrid(xss, bytes): return hybrid_overhead - method = method or chunk_method() + method = method or chunk_method(state) if method == HYBRID_RSS: oh = hybrid(overhead[RSS], overhead[BYTES]) @@ -241,15 +213,14 @@ def hybrid(xss, bytes): return oh -def consolidate_logs(): - - glob_file_name = config.log_file_path(f"*{LOG_FILE_NAME}", prefix=False) - glob_files = glob.glob(glob_file_name) +def consolidate_logs(state: workflow.State): + glob_file_name = state.get_log_file_path(f"*{LOG_FILE_NAME}", prefix=False) + glob_files = glob.glob(str(glob_file_name)) if not glob_files: return - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS), ( + assert chunk_training_mode(state) not in (MODE_PRODUCTION, MODE_CHUNKLESS), ( f"shouldn't be any chunk log files when chunk_training_mode" f" is {MODE_PRODUCTION} or {MODE_CHUNKLESS}" ) @@ -275,10 +246,10 @@ def 
consolidate_logs(): len(multi_depth_chunk_tag) == 0 ), f"consolidate_logs multi_depth_chunk_tags \n{multi_depth_chunk_tag.values}" - if not keep_chunk_logs(): + if not keep_chunk_logs(state): util.delete_files(glob_files, "chunk.consolidate_logs") - log_output_path = config.log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) + log_output_path = state.get_log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) logger.debug(f"chunk.consolidate_logs writing omnibus log to {log_output_path}") omnibus_df.to_csv(log_output_path, mode="w", index=False) @@ -311,27 +282,31 @@ def consolidate_logs(): num_rows = omnibus_df[C_NUM_ROWS] for m in USS_CHUNK_METHODS: omnibus_df[f"{m}_row_size"] = np.ceil( - overhead_for_chunk_method(omnibus_df, m) / num_rows + overhead_for_chunk_method(state, omnibus_df, m) / num_rows ).astype(int) omnibus_df = omnibus_df.sort_values(by=C_CHUNK_TAG) - log_dir_output_path = config.log_file_path(CACHE_FILE_NAME, prefix=False) + log_dir_output_path = state.get_log_file_path(CACHE_FILE_NAME, prefix=False) logger.debug( f"chunk.consolidate_logs writing omnibus chunk cache to {log_dir_output_path}" ) omnibus_df.to_csv(log_dir_output_path, mode="w", index=False) - if (chunk_training_mode() == MODE_RETRAIN) or not _HISTORIAN.have_cached_history: - - if config.setting("resume_after"): + if ( + chunk_training_mode( + state, + ) + == MODE_RETRAIN + ) or not state.chunk.HISTORIAN.have_cached_history: + if state.settings.resume_after: # FIXME logger.warning( f"Not updating chunk_log cache directory because resume_after" ) else: cache_dir_output_path = os.path.join( - config.get_cache_dir(), CACHE_FILE_NAME + state.filesystem.get_cache_dir(), CACHE_FILE_NAME ) logger.debug( f"chunk.consolidate_logs writing chunk cache to {cache_dir_output_path}" @@ -339,20 +314,23 @@ def consolidate_logs(): omnibus_df.to_csv(cache_dir_output_path, mode="w", index=False) -class ChunkHistorian(object): +class ChunkHistorian: """ Utility for estimating row_size """ def __init__(self): - self.chunk_log_path = None self.have_cached_history = None self.cached_history_df = None - def load_cached_history(self): - - if chunk_training_mode() == MODE_RETRAIN: + def load_cached_history(self, state: workflow.State): + if ( + chunk_training_mode( + state, + ) + == MODE_RETRAIN + ): # don't need cached history if retraining return @@ -360,7 +338,9 @@ def load_cached_history(self): # already loaded, nothing to do return - chunk_cache_path = os.path.join(config.get_cache_dir(), CACHE_FILE_NAME) + chunk_cache_path = os.path.join( + state.filesystem.get_cache_dir(), CACHE_FILE_NAME + ) logger.debug( f"ChunkHistorian load_cached_history chunk_cache_path {chunk_cache_path}" @@ -383,33 +363,40 @@ def load_cached_history(self): else: self.have_cached_history = False - if chunk_training_mode() == MODE_CHUNKLESS: + if ( + chunk_training_mode( + state, + ) + == MODE_CHUNKLESS + ): return - if chunk_training_mode() == MODE_PRODUCTION: + if ( + chunk_training_mode( + state, + ) + == MODE_PRODUCTION + ): # raise RuntimeError(f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}") - SETTINGS["chunk_training_mode"] = MODE_RETRAIN + state.settings.chunk_training_mode = MODE_RETRAIN logger.warning( f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}" ) logger.warning( - f"chunk_training_mode falling back to {chunk_training_mode()}" + f"chunk_training_mode falling back to {chunk_training_mode(state,)}" ) - def cached_history_for_chunk_tag(self, chunk_tag): - + def 
cached_history_for_chunk_tag(self, state: workflow.State, chunk_tag): history = {} - self.load_cached_history() + self.load_cached_history(state) if self.have_cached_history: - try: df = self.cached_history_df[ self.cached_history_df[C_CHUNK_TAG] == chunk_tag ] if len(df) > 0: - if len(df) > 1: # don't expect this, but not fatal logger.warning( @@ -428,23 +415,25 @@ def cached_history_for_chunk_tag(self, chunk_tag): return history - def cached_row_size(self, chunk_tag): - + def cached_row_size(self, state: workflow.State, chunk_tag): row_size = 0 - cached_history = self.cached_history_for_chunk_tag(chunk_tag) + cached_history = self.cached_history_for_chunk_tag(state, chunk_tag) if cached_history: cum_overhead = {m: cached_history[m] for m in METRICS} num_rows = cached_history[C_NUM_ROWS] # initial_row_size based on cum_overhead and rows_processed from chunk_cache - row_size = math.ceil(overhead_for_chunk_method(cum_overhead) / num_rows) + row_size = math.ceil( + overhead_for_chunk_method(state, cum_overhead) / num_rows + ) return row_size - def write_history(self, history, chunk_tag): - - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + def write_history(self, state: workflow.State, history, chunk_tag): + assert chunk_training_mode( + state, + ) not in (MODE_PRODUCTION, MODE_CHUNKLESS) history_df = pd.DataFrame.from_dict(history) @@ -457,7 +446,7 @@ def write_history(self, history, chunk_tag): history_df = history_df[CHUNK_HISTORY_COLUMNS] if self.chunk_log_path is None: - self.chunk_log_path = config.log_file_path(LOG_FILE_NAME) + self.chunk_log_path = state.get_log_file_path(LOG_FILE_NAME) tracing.write_df_csv( history_df, @@ -469,17 +458,23 @@ def write_history(self, history, chunk_tag): ) -_HISTORIAN = ChunkHistorian() - - -class ChunkLedger(object): +class ChunkLedger: """ """ - def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom): + def __init__( + self, + state: workflow.State, + trace_label, + chunk_size, + baseline_rss, + baseline_uss, + headroom, + ): + self.state = state self.trace_label = trace_label self.chunk_size = chunk_size self.headroom = headroom - self.base_chunk_size = get_base_chunk_size() + self.base_chunk_size = get_base_chunk_size(state) self.tables = {} self.hwm_bytes = {"value": 0, "info": f"{trace_label}.init"} @@ -487,9 +482,12 @@ def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom self.hwm_uss = {"value": baseline_uss, "info": f"{trace_label}.init"} self.total_bytes = 0 - def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): - - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + def audit( + self, state: workflow.State, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False + ): + assert chunk_training_mode( + state, + ) not in (MODE_PRODUCTION, MODE_CHUNKLESS) MAX_OVERDRAFT = 0.2 @@ -505,7 +503,7 @@ def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): f"bytes: {bytes} headroom: {self.headroom} chunk_size: {self.base_chunk_size} {msg}" ) - if chunk_metric() == RSS and rss > mem_panic_threshold: + if chunk_metric(state) == RSS and rss > mem_panic_threshold: rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) if rss > mem_panic_threshold: logger.warning( @@ -513,7 +511,7 @@ def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): f"rss: {rss} chunk_size: {self.base_chunk_size} {msg}" ) - if chunk_metric() == USS and uss > mem_panic_threshold: + if chunk_metric(state) == USS and uss > mem_panic_threshold: _, uss = 
mem.get_rss(force_garbage_collect=True, uss=True) if uss > mem_panic_threshold: logger.warning( @@ -533,7 +531,7 @@ def close(self): f"ChunkLedger.close hwm_uss {self.hwm_uss['value']} {self.hwm_uss['info']}" ) - def log_df(self, table_name, df): + def log_df(self, state: workflow.State, table_name: str, df): def size_it(df): if isinstance(df, pd.Series): elements = util.iprod(df.shape) @@ -565,7 +563,7 @@ def size_it(df): assert False return elements, bytes - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + assert chunk_training_mode(state) not in (MODE_PRODUCTION, MODE_CHUNKLESS) if df is None: elements, bytes = (0, 0) @@ -595,9 +593,15 @@ def size_it(df): # update current total_bytes count self.total_bytes = sum(self.tables.values()) - def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): - - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + def check_local_hwm( + self, + state: workflow.State, + hwm_trace_label: str, + rss: int, + uss: int, + total_bytes: int, + ): + assert chunk_training_mode(state) not in (MODE_PRODUCTION, MODE_CHUNKLESS) from_rss_monitor = total_bytes is None @@ -615,19 +619,23 @@ def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): # total_bytes high water mark self.hwm_bytes["value"] = total_bytes self.hwm_bytes["info"] = info - self.audit(hwm_trace_label, bytes=total_bytes) + self.audit(state, hwm_trace_label, bytes=total_bytes) if rss > self.hwm_rss["value"]: # rss high water mark self.hwm_rss["value"] = rss self.hwm_rss["info"] = info - self.audit(hwm_trace_label, rss=rss, from_rss_monitor=from_rss_monitor) + self.audit( + state, hwm_trace_label, rss=rss, from_rss_monitor=from_rss_monitor + ) if uss > self.hwm_uss["value"]: # uss high water mark self.hwm_uss["value"] = uss self.hwm_uss["info"] = info - self.audit(hwm_trace_label, uss=uss, from_rss_monitor=from_rss_monitor) + self.audit( + state, hwm_trace_label, uss=uss, from_rss_monitor=from_rss_monitor + ) # silently registers global high water mark mem.check_global_hwm(RSS, rss, hwm_trace_label) @@ -636,12 +644,12 @@ def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): mem.check_global_hwm(BYTES, total_bytes, hwm_trace_label) def get_hwm_rss(self): - with ledger_lock: + with self.state.chunk.ledger_lock: net_rss = self.hwm_rss["value"] return net_rss def get_hwm_uss(self): - with ledger_lock: + with self.state.chunk.ledger_lock: net_uss = self.hwm_uss["value"] return net_uss @@ -649,76 +657,77 @@ def get_hwm_bytes(self): return self.hwm_bytes["value"] -def log_rss(trace_label, force=False): - - if chunk_training_mode() == MODE_CHUNKLESS: +def log_rss(state: workflow.State, trace_label: str, force=False): + if chunk_training_mode(state) == MODE_CHUNKLESS: # no memory tracing at all in chunkless mode return - assert len(CHUNK_LEDGERS) > 0, f"log_rss called without current chunker." + assert ( + len(state.chunk.CHUNK_LEDGERS) > 0 + ), f"log_rss called without current chunker." 
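# --- Illustrative aside (not part of the patch) -----------------------------
# The module-level SETTINGS dict is gone: the accessor helpers above now read
# the typed settings attached to a workflow.State that callers pass in
# explicitly. A minimal sketch of the new calling pattern, assuming a State
# has already been constructed elsewhere (its construction is not shown in
# this diff):

from activitysim.core import chunk, workflow


def describe_chunking(state: workflow.State) -> str:
    """Summarize the active chunking configuration (illustration only)."""
    method = chunk.chunk_method(state)  # reads state.settings.chunk_method
    metric = chunk.chunk_metric(state)  # USS for uss-based methods, else "rss"
    mode = chunk.chunk_training_mode(state)  # falls back to chunkless when unset
    return f"chunk_method={method} chunk_metric={metric} training_mode={mode}"
# ----------------------------------------------------------------------------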
hwm_trace_label = f"{trace_label}.log_rss" - if chunk_training_mode() == MODE_PRODUCTION: + if chunk_training_mode(state) == MODE_PRODUCTION: # FIXME - this trace_memory_info call slows things down a lot so it is turned off for now # trace_ticks = 0 if force else mem.MEM_TRACE_TICK_LEN # mem.trace_memory_info(hwm_trace_label, trace_ticks=trace_ticks) return - rss, uss = mem.trace_memory_info(hwm_trace_label) + rss, uss = mem.trace_memory_info(hwm_trace_label, state=state) # check local hwm for all ledgers - with ledger_lock: - for c in CHUNK_LEDGERS: - c.check_local_hwm(hwm_trace_label, rss, uss, total_bytes=None) - - -def log_df(trace_label, table_name, df): - - if chunk_training_mode() in (MODE_PRODUCTION, MODE_CHUNKLESS): - return - - assert len(CHUNK_LEDGERS) > 0, f"log_df called without current chunker." - - op = "del" if df is None else "add" - hwm_trace_label = f"{trace_label}.{op}.{table_name}" - - rss, uss = mem.trace_memory_info(hwm_trace_label) - - cur_chunker = CHUNK_LEDGERS[-1] - - # registers this df and recalc total_bytes - cur_chunker.log_df(table_name, df) - - total_bytes = sum([c.total_bytes for c in CHUNK_LEDGERS]) - - # check local hwm for all ledgers - with ledger_lock: - for c in CHUNK_LEDGERS: - c.check_local_hwm(hwm_trace_label, rss, uss, total_bytes) + with state.chunk.ledger_lock: + for c in state.chunk.CHUNK_LEDGERS: + c.check_local_hwm(state, hwm_trace_label, rss, uss, total_bytes=None) class MemMonitor(threading.Thread): - def __init__(self, trace_label, stop_snooping): + def __init__( + self, state: workflow.State, trace_label: str, stop_snooping: threading.Event + ): self.trace_label = trace_label self.stop_snooping = stop_snooping + self.state = state threading.Thread.__init__(self) def run(self): - log_rss(self.trace_label) + log_rss(self.state, self.trace_label) while not self.stop_snooping.wait(timeout=mem.MEM_SNOOP_TICK_LEN): - log_rss(self.trace_label) + log_rss(self.state, self.trace_label) -class ChunkSizer(object): +class ChunkSizer: """ """ - def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): - - self.depth = len(CHUNK_SIZERS) + 1 + def __init__( + self, + state: workflow.State, + chunk_tag, + trace_label, + num_choosers=0, + chunk_size=0, + chunk_training_mode="disabled", + ): + self.state = state + if state is not None: + self.depth = len(state.chunk.CHUNK_SIZERS) + 1 + self.chunk_training_mode = chunk_training_mode + self.chunk_tag = chunk_tag + self.trace_label = trace_label + self.chunk_size = chunk_size + self.num_choosers = num_choosers + self.rows_processed = 0 + self.initial_row_size = 0 + self.rows_per_chunk = 0 + self.chunk_ledger = None + self.history = {} + self.cum_rows = 0 + self.cum_overhead = {m: 0 for m in METRICS} + self.headroom = None - if chunk_training_mode() != MODE_CHUNKLESS: - if chunk_metric() == USS: + if self.chunk_training_mode != MODE_CHUNKLESS: + if chunk_metric(self.state) == USS: self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True) else: self.rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) @@ -730,42 +739,28 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): # if we are in a nested call, then we must be in the scope of active Ledger # so any rss accumulated so far should be attributed to the parent active ledger - assert len(CHUNK_SIZERS) == len(CHUNK_LEDGERS) - parent = CHUNK_SIZERS[-1] + assert len(state.chunk.CHUNK_SIZERS) == len(state.chunk.CHUNK_LEDGERS) + parent = state.chunk.CHUNK_SIZERS[-1] assert parent.chunk_ledger is not None - 
log_rss( - trace_label - ) # give parent a complementary log_rss reading entering sub context + log_rss(self.state, trace_label) + # give parent a complementary log_rss reading entering sub context else: self.rss, self.uss = 0, 0 - chunk_size = 0 - config.override_setting("chunk_size", 0) - - self.chunk_tag = chunk_tag - self.trace_label = trace_label - self.chunk_size = chunk_size - - self.num_choosers = num_choosers - self.rows_processed = 0 + # config.override_setting("chunk_size", 0) + return - min_chunk_ratio = min_available_chunk_ratio() + min_chunk_ratio = min_available_chunk_ratio(self.state) assert ( 0 <= min_chunk_ratio <= 1 ), f"min_chunk_ratio setting {min_chunk_ratio} is not in range [0..1]" self.min_chunk_size = chunk_size * min_chunk_ratio - self.initial_row_size = 0 - self.rows_per_chunk = 0 - self.chunk_ledger = None - self.history = {} - - self.cum_rows = 0 - self.cum_overhead = {m: 0 for m in METRICS} - # if production mode, to reduce volatility, initialize cum_overhead and cum_rows from cache - if chunk_training_mode() in [MODE_ADAPTIVE, MODE_PRODUCTION]: - cached_history = _HISTORIAN.cached_history_for_chunk_tag(self.chunk_tag) + if self.chunk_training_mode in [MODE_ADAPTIVE, MODE_PRODUCTION]: + cached_history = self.state.chunk.HISTORIAN.cached_history_for_chunk_tag( + self.state, self.chunk_tag + ) if cached_history: self.cum_overhead = {m: cached_history[m] for m in METRICS} self.cum_rows = cached_history[C_NUM_ROWS] @@ -776,33 +771,35 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): f"cum_overhead: {self.cum_overhead} " ) - # add self to CHUNK_SIZERS list before setting base_chunk_size (since we might be base chunker) - CHUNK_SIZERS.append(self) + # add self to state.chunk.CHUNK_SIZERS list before setting base_chunk_size (since we might be base chunker) + state.chunk.CHUNK_SIZERS.append(self) - self.base_chunk_size = CHUNK_SIZERS[0].chunk_size + self.base_chunk_size = state.chunk.CHUNK_SIZERS[0].chunk_size # need base_chunk_size to calc headroom self.headroom = self.available_headroom( - self.uss if chunk_metric() == USS else self.rss + self.uss if chunk_metric(self.state) == USS else self.rss ) def close(self): + if self.chunk_training_mode == MODE_CHUNKLESS: + return if ((self.depth == 1) or WRITE_SUBCHUNK_HISTORY) and ( - chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + self.chunk_training_mode not in (MODE_PRODUCTION, MODE_CHUNKLESS) ): - _HISTORIAN.write_history(self.history, self.chunk_tag) + self.state.chunk.HISTORIAN.write_history( + self.state, self.history, self.chunk_tag + ) - _chunk_sizer = CHUNK_SIZERS.pop() + _chunk_sizer = self.state.chunk.CHUNK_SIZERS.pop() assert _chunk_sizer == self def available_headroom(self, xss): - headroom = self.base_chunk_size - xss # adjust deficient headroom to min_chunk_size if headroom < self.min_chunk_size: - if self.base_chunk_size > 0: logger.warning( f"Not enough memory for minimum chunk_size without exceeding specified chunk_size. 
" @@ -816,19 +813,21 @@ def available_headroom(self, xss): return headroom def initial_rows_per_chunk(self): - # whatever the TRAINING_MODE, use cache to determine initial_row_size # (presumably preferable to default_initial_rows_per_chunk) - self.initial_row_size = _HISTORIAN.cached_row_size(self.chunk_tag) + self.initial_row_size = self.state.chunk.HISTORIAN.cached_row_size( + self.state, self.chunk_tag + ) if self.chunk_size == 0: rows_per_chunk = self.num_choosers estimated_number_of_chunks = 1 self.initial_row_size = 0 else: - # we should be a base chunker - assert len(CHUNK_LEDGERS) == 0, f"len(CHUNK_LEDGERS): {len(CHUNK_LEDGERS)}" + assert ( + len(self.state.chunk.CHUNK_LEDGERS) == 0 + ), f"len(state.chunk.CHUNK_LEDGERS): {len(self.state.chunk.CHUNK_LEDGERS)}" if self.initial_row_size > 0: max_rows_per_chunk = np.maximum( @@ -846,11 +845,12 @@ def initial_rows_per_chunk(self): # if no initial_row_size from cache, fall back to default_initial_rows_per_chunk self.initial_row_size = 0 rows_per_chunk = min( - self.num_choosers, default_initial_rows_per_chunk() + self.num_choosers, + self.state.settings.default_initial_rows_per_chunk, ) estimated_number_of_chunks = None - if chunk_training_mode() == MODE_PRODUCTION: + if self.chunk_training_mode == MODE_PRODUCTION: warnings.warn( "ActivitySim is running with a chunk_training_mode of " f"'production' but initial_row_size is zero in {self.trace_label}" @@ -883,28 +883,26 @@ def adaptive_rows_per_chunk(self, i): prev_rss = self.rss prev_uss = self.uss - if chunk_training_mode() != MODE_PRODUCTION: - - if chunk_metric() == USS: + if self.chunk_training_mode != MODE_PRODUCTION: + if chunk_metric(self.state) == USS: self.rss, self.uss = mem.get_rss(force_garbage_collect=True, uss=True) else: self.rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) self.uss = 0 self.headroom = self.available_headroom( - self.uss if chunk_metric() == USS else self.rss + self.uss if chunk_metric(self.state) == USS else self.rss ) rows_remaining = self.num_choosers - prev_rows_processed - if chunk_training_mode() == MODE_PRODUCTION: + if self.chunk_training_mode == MODE_PRODUCTION: # since overhead changes we don't necessarily want the same number of rows per chunk every time # but we do use the row_size from cache which we trust is stable # which is stored in self.initial_row_size because initial_rows_per_chunk used it for the first chunk observed_row_size = self.initial_row_size overhead = self.cum_overhead.copy() else: - # calculate overhead for this chunk iteration overhead = {} overhead[BYTES] = self.chunk_ledger.get_hwm_bytes() @@ -915,7 +913,7 @@ def adaptive_rows_per_chunk(self, i): self.cum_overhead[m] += overhead[m] observed_row_size = prev_cum_rows and math.ceil( - overhead_for_chunk_method(self.cum_overhead) / prev_cum_rows + overhead_for_chunk_method(self.state, self.cum_overhead) / prev_cum_rows ) # rows_per_chunk is closest number of chooser rows to achieve chunk_size without exceeding it @@ -946,7 +944,7 @@ def adaptive_rows_per_chunk(self, i): # diagnostics not reported by ChunkHistorian - if chunk_metric() == USS: + if chunk_metric(self.state) == USS: self.history.setdefault("prev_uss", []).append(prev_uss) self.history.setdefault("cur_uss", []).append(self.uss) else: @@ -973,85 +971,161 @@ def adaptive_rows_per_chunk(self, i): # input() - if chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS): + if self.chunk_training_mode not in (MODE_PRODUCTION, MODE_CHUNKLESS): self.cum_rows += self.rows_per_chunk return 
self.rows_per_chunk, estimated_number_of_chunks @contextmanager def ledger(self): - # don't do anything in chunkless mode - if chunk_training_mode() == MODE_CHUNKLESS: + if self.chunk_training_mode == MODE_CHUNKLESS: yield return mem_monitor = None # nested chunkers should be unchunked - if len(CHUNK_LEDGERS) > 0: + if len(self.state.chunk.CHUNK_LEDGERS) > 0: assert self.chunk_size == 0 - with ledger_lock: + with self.state.chunk.ledger_lock: self.chunk_ledger = ChunkLedger( - self.trace_label, self.chunk_size, self.rss, self.uss, self.headroom + self.state, + self.trace_label, + self.chunk_size, + self.rss, + self.uss, + self.headroom, ) - CHUNK_LEDGERS.append(self.chunk_ledger) + self.state.chunk.CHUNK_LEDGERS.append(self.chunk_ledger) # reality check - there should be one ledger per sizer - assert len(CHUNK_LEDGERS) == len(CHUNK_SIZERS) + assert len(self.state.chunk.CHUNK_LEDGERS) == len(self.state.chunk.CHUNK_SIZERS) + + stop_snooping = None try: # all calls to log_df within this block will be directed to top level chunk_ledger # and passed on down the stack to the base to support hwm tallies # if this is a base chunk_sizer (and ledger) then start a thread to monitor rss usage - if (len(CHUNK_LEDGERS) == 1) and ENABLE_MEMORY_MONITOR: + if (len(self.state.chunk.CHUNK_LEDGERS) == 1) and ENABLE_MEMORY_MONITOR: stop_snooping = threading.Event() - mem_monitor = MemMonitor(self.trace_label, stop_snooping) + mem_monitor = MemMonitor(self.state, self.trace_label, stop_snooping) mem_monitor.start() log_rss( - self.trace_label, force=True + self.state, self.trace_label, force=True ) # make sure we get at least one reading yield log_rss( - self.trace_label, force=True + self.state, self.trace_label, force=True ) # make sure we get at least one reading finally: - if mem_monitor is not None: - if not mem_monitor.is_alive(): logger.error(f"mem_monitor for {self.trace_label} died!") - bug # bug + raise RuntimeError("bug") - stop_snooping.set() + if stop_snooping is not None: + stop_snooping.set() while mem_monitor.is_alive(): logger.debug( f"{self.trace_label} waiting for mem_monitor thread to terminate" ) mem_monitor.join(timeout=MEM_MONITOR_TICK) - with ledger_lock: + with self.state.chunk.ledger_lock: self.chunk_ledger.close() - CHUNK_LEDGERS.pop() + self.state.chunk.CHUNK_LEDGERS.pop() self.chunk_ledger = None + def log_rss(self, trace_label: str, force: bool = False): + if self.chunk_training_mode == MODE_CHUNKLESS: + # no memory tracing at all in chunkless mode + return + + assert ( + len(self.state.chunk.CHUNK_LEDGERS) > 0 + ), f"log_rss called without current chunker." + + hwm_trace_label = f"{trace_label}.log_rss" + + if self.chunk_training_mode == MODE_PRODUCTION: + # FIXME - this trace_memory_info call slows things down a lot so it is turned off for now + # trace_ticks = 0 if force else mem.MEM_TRACE_TICK_LEN + # mem.trace_memory_info(hwm_trace_label, trace_ticks=trace_ticks) + return + + rss, uss = mem.trace_memory_info(hwm_trace_label, state=self.state) + + # check local hwm for all ledgers + with self.state.chunk.ledger_lock: + for c in self.state.chunk.CHUNK_LEDGERS: + c.check_local_hwm( + self.state, hwm_trace_label, rss, uss, total_bytes=None + ) + + def log_df(self, trace_label: str, table_name: str, df: pd.DataFrame): + if self.chunk_training_mode in (MODE_PRODUCTION, MODE_CHUNKLESS): + return + + assert ( + len(self.state.chunk.CHUNK_LEDGERS) > 0 + ), f"log_df called without current chunker." 
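# --- Illustrative aside (not part of the patch) -----------------------------
# log_rss() and log_df() become methods of the ChunkSizer yielded by the
# chunk_log() context manager (defined just below), replacing the module-level
# functions that consulted global CHUNK_LEDGERS; the assign.py hunk earlier in
# this patch already calls chunk_log.log_df(...) on such an object. A hedged
# sketch of a component registering a working table; "state", "df", and the
# trace label are placeholders:

import pandas as pd

from activitysim.core import chunk, workflow


def tally_intermediate_table(state: workflow.State, df: pd.DataFrame) -> None:
    trace_label = "example_component"  # placeholder label
    # base must be True only for the outermost chunker on this state
    is_base = len(state.chunk.CHUNK_SIZERS) == 0
    with chunk.chunk_log(state, trace_label, base=is_base) as chunk_sizer:
        chunk_sizer.log_df(trace_label, "example_df", df)  # register the table
        chunk_sizer.log_df(trace_label, "example_df", None)  # ...then release it
# ----------------------------------------------------------------------------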
+ + op = "del" if df is None else "add" + hwm_trace_label = f"{trace_label}.{op}.{table_name}" + + rss, uss = mem.trace_memory_info(hwm_trace_label, state=self.state) + + cur_chunker = self.state.chunk.CHUNK_LEDGERS[-1] + + # registers this df and recalc total_bytes + cur_chunker.log_df(self.state, table_name, df) + + total_bytes = sum([c.total_bytes for c in self.state.chunk.CHUNK_LEDGERS]) + + # check local hwm for all ledgers + with self.state.chunk.ledger_lock: + for c in self.state.chunk.CHUNK_LEDGERS: + c.check_local_hwm(self.state, hwm_trace_label, rss, uss, total_bytes) + @contextmanager -def chunk_log(trace_label, chunk_tag=None, base=False): +def chunk_log(state: workflow.State, trace_label, chunk_tag=None, base=False): + """ + Chunk management. + + Parameters + ---------- + trace_label : str + chunk_tag : str, optional + base + Yields + ------ + ChunkSizer + """ # With `base=True` this method can be used to instantiate # a ChunkSizer class object without actually chunking. This # avoids breaking the assertion below. - if chunk_training_mode() == MODE_CHUNKLESS: - yield + if state is None: + # use default chunk_training_mode if settings is not given + _chunk_training_mode = configuration.Settings().chunk_training_mode + else: + _chunk_training_mode = state.settings.chunk_training_mode + + if _chunk_training_mode == MODE_CHUNKLESS: + yield ChunkSizer(state, "chunkless", trace_label, 0, 0, _chunk_training_mode) return - assert base == (len(CHUNK_SIZERS) == 0) + if base != (len(state.chunk.CHUNK_SIZERS) == 0): + raise AssertionError + assert base == (len(state.chunk.CHUNK_SIZERS) == 0) trace_label = f"{trace_label}.chunk_log" @@ -1059,15 +1133,16 @@ def chunk_log(trace_label, chunk_tag=None, base=False): num_choosers = 0 chunk_size = 0 - chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) + chunk_sizer = ChunkSizer( + state, chunk_tag, trace_label, num_choosers, chunk_size, _chunk_training_mode + ) chunk_sizer.initial_rows_per_chunk() with chunk_sizer.ledger(): + yield chunk_sizer - yield - - if chunk_training_mode() != MODE_CHUNKLESS: + if _chunk_training_mode != MODE_CHUNKLESS: chunk_sizer.adaptive_rows_per_chunk(1) chunk_sizer.close() @@ -1075,25 +1150,34 @@ def chunk_log(trace_label, chunk_tag=None, base=False): @contextmanager def chunk_log_skip(): - yield None -def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None): - +def adaptive_chunked_choosers( + state: workflow.State, + choosers: pd.DataFrame, + trace_label: str, + chunk_tag: str = None, + *, + chunk_size: int | None = None, +): # generator to iterate over choosers - if chunk_training_mode() == MODE_CHUNKLESS: + if state.settings.chunk_training_mode == MODE_CHUNKLESS: # The adaptive chunking logic is expensive and sometimes results # in needless data copying. So we short circuit it entirely # when chunking is disabled. 
logger.info(f"Running chunkless with {len(choosers)} choosers") - yield 0, choosers, trace_label + yield 0, choosers, trace_label, ChunkSizer( + state, "chunkless", trace_label, 0, 0, state.settings.chunk_training_mode + ) return chunk_tag = chunk_tag or trace_label + if chunk_size is None: + chunk_size = state.settings.chunk_size num_choosers = len(choosers.index) assert num_choosers > 0 @@ -1103,20 +1187,25 @@ def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None) f"{trace_label} Running adaptive_chunked_choosers with {num_choosers} choosers" ) - chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) + chunk_sizer = ChunkSizer( + state, + chunk_tag, + trace_label, + num_choosers, + chunk_size, + chunk_training_mode=state.settings.chunk_training_mode, + ) rows_per_chunk, estimated_number_of_chunks = chunk_sizer.initial_rows_per_chunk() i = offset = 0 while offset < num_choosers: - i += 1 assert offset + rows_per_chunk <= num_choosers - chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) + chunk_trace_label = trace_label_for_chunk(state, trace_label, chunk_size, i) with chunk_sizer.ledger(): - # grab the next chunk based on current rows_per_chunk chooser_chunk = choosers[offset : offset + rows_per_chunk] @@ -1125,11 +1214,11 @@ def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None) f"with {len(chooser_chunk)} of {num_choosers} choosers" ) - yield i, chooser_chunk, chunk_trace_label + yield i, chooser_chunk, chunk_trace_label, chunk_sizer offset += rows_per_chunk - if chunk_training_mode() != MODE_CHUNKLESS: + if chunk_training_mode(state) != MODE_CHUNKLESS: ( rows_per_chunk, estimated_number_of_chunks, @@ -1139,7 +1228,12 @@ def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None) def adaptive_chunked_choosers_and_alts( - choosers, alternatives, chunk_size, trace_label, chunk_tag=None + state: workflow.State, + choosers: pd.DataFrame, + alternatives: pd.DataFrame, + trace_label: str, + chunk_tag: str = None, + chunk_size: int | None = None, ): """ generator to iterate over choosers and alternatives in chunk_size chunks @@ -1160,7 +1254,6 @@ def adaptive_chunked_choosers_and_alts( choosers alternatives : pandas DataFrame sample alternatives including pick_count column in same order as choosers - rows_per_chunk : int Yields ------ @@ -1174,12 +1267,15 @@ def adaptive_chunked_choosers_and_alts( chunk of alternatives for chooser chunk """ - if chunk_training_mode() == MODE_CHUNKLESS: + if state.settings.chunk_training_mode == MODE_CHUNKLESS: # The adaptive chunking logic is expensive and sometimes results # in needless data copying. So we short circuit it entirely # when chunking is disabled. 
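# --- Illustrative aside (not part of the patch) -----------------------------
# Callers of adaptive_chunked_choosers() (above) now pass the workflow State
# first, may let chunk_size default to state.settings.chunk_size, and unpack a
# fourth item: the ChunkSizer that owns the ledger for the loop. A hedged
# sketch of the updated calling loop; "state", "choosers", and the trace label
# are placeholders, and choosers must be non-empty:

import pandas as pd

from activitysim.core import chunk, workflow


def iterate_choosers(state: workflow.State, choosers: pd.DataFrame) -> None:
    trace_label = "example_component"  # placeholder label
    for i, chooser_chunk, chunk_trace_label, chunk_sizer in chunk.adaptive_chunked_choosers(
        state, choosers, trace_label
    ):
        # ... evaluate this chunk of choosers, then log it against the ledger ...
        chunk_sizer.log_df(chunk_trace_label, "chooser_chunk", chooser_chunk)
# ----------------------------------------------------------------------------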
logger.info(f"Running chunkless with {len(choosers)} choosers") - yield 0, choosers, alternatives, trace_label + chunk_sizer = ChunkSizer( + state, "chunkless", trace_label, 0, 0, state.settings.chunk_training_mode + ) + yield 0, choosers, alternatives, trace_label, chunk_sizer return check_assertions = False @@ -1210,7 +1306,16 @@ def adaptive_chunked_choosers_and_alts( f"with {num_choosers} choosers and {num_alternatives} alternatives" ) - chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) + if chunk_size is None: + chunk_size = state.settings.chunk_size + chunk_sizer = ChunkSizer( + state, + chunk_tag, + trace_label, + num_choosers, + chunk_size, + chunk_training_mode=state.settings.chunk_training_mode, + ) rows_per_chunk, estimated_number_of_chunks = chunk_sizer.initial_rows_per_chunk() assert (rows_per_chunk > 0) and (rows_per_chunk <= num_choosers) @@ -1232,10 +1337,9 @@ def adaptive_chunked_choosers_and_alts( offset + rows_per_chunk <= num_choosers ), f"i {i} offset {offset} rows_per_chunk {rows_per_chunk} num_choosers {num_choosers}" - chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) + chunk_trace_label = trace_label_for_chunk(state, trace_label, chunk_size, i) with chunk_sizer.ledger(): - chooser_chunk = choosers[offset : offset + rows_per_chunk] alt_end = alt_chunk_ends[offset + rows_per_chunk] @@ -1254,12 +1358,12 @@ def adaptive_chunked_choosers_and_alts( f"with {len(chooser_chunk)} of {num_choosers} choosers" ) - yield i, chooser_chunk, alternative_chunk, chunk_trace_label + yield i, chooser_chunk, alternative_chunk, chunk_trace_label, chunk_sizer offset += rows_per_chunk alt_offset = alt_end - if chunk_training_mode() != MODE_CHUNKLESS: + if chunk_training_mode(state) != MODE_CHUNKLESS: ( rows_per_chunk, estimated_number_of_chunks, @@ -1269,7 +1373,7 @@ def adaptive_chunked_choosers_and_alts( def adaptive_chunked_choosers_by_chunk_id( - choosers, chunk_size, trace_label, chunk_tag=None + state: workflow.State, choosers: pd.DataFrame, trace_label: str, chunk_tag=None ): # generator to iterate over choosers in chunk_size chunks # like chunked_choosers but based on chunk_id field rather than dataframe length @@ -1277,12 +1381,15 @@ def adaptive_chunked_choosers_by_chunk_id( # all have to be included in the same chunk) # FIXME - we pathologically know name of chunk_id col in households table - if chunk_training_mode() == MODE_CHUNKLESS: + if state.settings.chunk_training_mode == MODE_CHUNKLESS: # The adaptive chunking logic is expensive and sometimes results # in needless data copying. So we short circuit it entirely # when chunking is disabled. 
logger.info(f"Running chunkless with {len(choosers)} choosers") - yield 0, choosers, trace_label + chunk_sizer = ChunkSizer( + state, "chunkless", trace_label, 0, 0, state.settings.chunk_training_mode + ) + yield 0, choosers, trace_label, chunk_sizer return chunk_tag = chunk_tag or trace_label @@ -1290,20 +1397,26 @@ def adaptive_chunked_choosers_by_chunk_id( num_choosers = choosers["chunk_id"].max() + 1 assert num_choosers > 0 - chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) + chunk_size = state.settings.chunk_size + chunk_sizer = ChunkSizer( + state, + chunk_tag, + trace_label, + num_choosers, + chunk_size, + chunk_training_mode=state.settings.chunk_training_mode, + ) rows_per_chunk, estimated_number_of_chunks = chunk_sizer.initial_rows_per_chunk() i = offset = 0 while offset < num_choosers: - i += 1 assert offset + rows_per_chunk <= num_choosers - chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) + chunk_trace_label = trace_label_for_chunk(state, trace_label, chunk_size, i) with chunk_sizer.ledger(): - chooser_chunk = choosers[ choosers["chunk_id"].between(offset, offset + rows_per_chunk - 1) ] @@ -1313,11 +1426,11 @@ def adaptive_chunked_choosers_by_chunk_id( f"with {rows_per_chunk} of {num_choosers} choosers" ) - yield i, chooser_chunk, chunk_trace_label + yield i, chooser_chunk, chunk_trace_label, chunk_sizer offset += rows_per_chunk - if chunk_training_mode() != MODE_CHUNKLESS: + if chunk_training_mode(state) != MODE_CHUNKLESS: ( rows_per_chunk, estimated_number_of_chunks, diff --git a/activitysim/core/cleaning.py b/activitysim/core/cleaning.py index 65b64db157..77874bd160 100644 --- a/activitysim/core/cleaning.py +++ b/activitysim/core/cleaning.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import logging import numpy as np import pandas as pd -from . import inject +from activitysim.core import workflow logger = logging.getLogger(__name__) @@ -32,24 +34,35 @@ def recode_to_zero_based(values, mapping): return result -def should_recode_based_on_table(tablename): +def should_recode_based_on_table(state: workflow.State, tablename): try: - base_df = inject.get_table(tablename).to_frame() + base_df = state.get_dataframe(tablename) except (KeyError, RuntimeError): # the basis table is missing, do not return False + except AssertionError: + if state.settings.input_table_list is None: + # some tests don't include table definitions. + return False + raise if base_df.index.name and f"_original_{base_df.index.name}" in base_df: return True return False -def recode_based_on_table(values, tablename): +def recode_based_on_table(state: workflow.State, values, tablename): try: - base_df = inject.get_table(tablename).to_frame() + base_df = state.get_dataframe(tablename) except (KeyError, RuntimeError): # the basis table is missing, do nothing logger.warning(f"unable to recode based on missing {tablename} table") return values + except AssertionError: + if state.settings.input_table_list is None: + # some tests don't include table definitions. + logger.warning(f"unable to recode based on missing {tablename} table") + return values + raise if base_df.index.name and f"_original_{base_df.index.name}" in base_df: source_ids = base_df[f"_original_{base_df.index.name}"] if ( diff --git a/activitysim/core/config.py b/activitysim/core/config.py index aaa7ed2fb5..5cd08e5a07 100644 --- a/activitysim/core/config.py +++ b/activitysim/core/config.py @@ -1,172 +1,23 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
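# --- Illustrative aside (not part of the patch) -----------------------------
# The recoding helpers in cleaning.py (above) now look their basis table up on
# the State rather than the global inject registry, and tolerate test setups
# that define no input_table_list. A hedged sketch of guarding a recode; the
# "land_use" table name and "zone_ids" values are placeholders:

from activitysim.core import workflow
from activitysim.core.cleaning import recode_based_on_table, should_recode_based_on_table


def maybe_recode_zones(state: workflow.State, zone_ids):
    if should_recode_based_on_table(state, "land_use"):
        # the basis table carries an _original_<index name> column, so recode
        return recode_based_on_table(state, zone_ids, "land_use")
    return zone_ids
# ----------------------------------------------------------------------------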
import argparse -import glob import logging -import os -import struct -import time import warnings -import yaml - -from activitysim.core import inject, util +from activitysim.core import workflow logger = logging.getLogger(__name__) -""" - default injectables -""" - -@inject.injectable(cache=True) -def locutor(): +@workflow.cached_object +def locutor(state: workflow.State) -> bool: # when multiprocessing, sometimes you only want one process to write trace files # mp_tasks overrides this definition to designate a single sub-process as locutor return True -@inject.injectable(cache=True) -def configs_dir(): - if not os.path.exists("configs"): - raise RuntimeError("'configs' directory does not exist") - return "configs" - - -@inject.injectable(cache=True) -def data_dir(): - if not os.path.exists("data"): - raise RuntimeError("'data' directory does not exist") - return "data" - - -@inject.injectable(cache=True) -def output_dir(): - if not os.path.exists("output"): - print( - f"'output' directory does not exist - current working directory: {os.getcwd()}" - ) - raise RuntimeError("'output' directory does not exist") - return "output" - - -@inject.injectable() -def output_file_prefix(): - return "" - - -@inject.injectable(cache=True) -def pipeline_file_name(settings): - - pipeline_file_name = settings.get("pipeline_file_name", "pipeline.h5") - - return pipeline_file_name - - -@inject.injectable() -def rng_base_seed(): - return setting("rng_base_seed", 0) - - -@inject.injectable(cache=True) -def settings_file_name(): - return "settings.yaml" - - -@inject.injectable(cache=True) -def settings(settings_file_name): - settings_dict = read_settings_file(settings_file_name, mandatory=True) - - # basic settings validation for sharrow - sharrow_enabled = settings_dict.get("sharrow", False) - recode_pipeline_columns = settings_dict.get("recode_pipeline_columns", True) - if sharrow_enabled and not recode_pipeline_columns: - warnings.warn( - "use of `sharrow` setting generally requires `recode_pipeline_columns`" - ) - - return settings_dict - - -# def testing(): -# -# assert ("pytest" in sys.modules) == ("PYTEST_CURRENT_TEST" in os.environ) -# return "PYTEST_CURRENT_TEST" in os.environ - - -def get_cache_dir(): - """ - return path of cache directory in output_dir (creating it, if need be) - - cache directory is used to store - skim memmaps created by skim+dict_factories - tvpb tap_tap table cache - - Returns - ------- - str path - """ - cache_dir = setting("cache_dir", default=None) - if cache_dir is None: - cache_dir = setting( - "cache_dir", os.path.join(inject.get_injectable("output_dir"), "cache") - ) - - if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) - assert os.path.isdir(cache_dir) - - # create a git-ignore in the cache dir if it does not exist. 
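# --- Illustrative aside (not part of the patch) -----------------------------
# The injectable-based accessors being deleted here (setting(), output_dir,
# get_cache_dir(), ...) are replaced by attributes of the workflow State, as
# the other hunks in this patch show: typed values on state.settings and path
# helpers on state.filesystem or the State itself. A hedged before/after
# sketch using only calls that appear elsewhere in this diff; the log file
# name is a placeholder:

from activitysim.core import workflow


def settings_and_paths(state: workflow.State):
    chunk_size = state.settings.chunk_size        # was read via config.setting(...)
    resume_after = state.settings.resume_after    # was config.setting("resume_after", None)
    cache_dir = state.filesystem.get_cache_dir()  # was config.get_cache_dir()
    log_path = state.get_log_file_path("example.log", prefix=False)  # was config.log_file_path(...)
    return chunk_size, resume_after, cache_dir, log_path
# ----------------------------------------------------------------------------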
- # this helps prevent accidentally committing cache contents to git - gitignore = os.path.join(cache_dir, ".gitignore") - if not os.path.exists(gitignore): - with open(gitignore, "wt") as f: - f.write("/*") - - return cache_dir - - -def setting(key, default=None): - return inject.get_injectable("settings").get(key, default) - - -def override_setting(key, value): - new_settings = inject.get_injectable("settings") - new_settings[key] = value - inject.add_injectable("settings", new_settings) - - -def get_global_constants(): - """ - Read global constants from settings file - - Returns - ------- - constants : dict - dictionary of constants to add to locals for use by expressions in model spec - """ - return read_settings_file("constants.yaml", mandatory=False) - - -def read_model_settings(file_name, mandatory=False): - """ - - Parameters - ---------- - file_name : str - yaml file name - mandatory : bool - throw error if file empty or not found - Returns - ------- - - """ - - model_settings = read_settings_file(file_name, mandatory=mandatory) - - return model_settings - - def future_model_settings(model_name, model_settings, future_settings): """ Warn users of new required model settings, and substitute default values @@ -227,7 +78,6 @@ def get_logit_model_settings(model_settings): nests = None if model_settings is not None: - # default to MNL logit_type = model_settings.get("LOGIT_TYPE", "MNL") @@ -244,434 +94,17 @@ def get_logit_model_settings(model_settings): return nests -def build_output_file_path(file_name, use_prefix=None): - output_dir = inject.get_injectable("output_dir") - - if use_prefix: - file_name = "%s-%s" % (use_prefix, file_name) - - file_path = os.path.join(output_dir, file_name) - - return file_path - - -def cascading_input_file_path( - file_name, dir_list_injectable_name, mandatory=True, allow_glob=False -): - - dir_paths = inject.get_injectable(dir_list_injectable_name) - dir_paths = [dir_paths] if isinstance(dir_paths, str) else dir_paths - - file_path = None - if file_name is not None: - for dir in dir_paths: - p = os.path.join(dir, file_name) - if os.path.isfile(p): - file_path = p - break - - if allow_glob and len(glob.glob(p)) > 0: - file_path = p - break - - if mandatory and not file_path: - raise FileNotFoundError( - "file_path %s: file '%s' not in %s" - % (dir_list_injectable_name, file_name, dir_paths) - ) - - return file_path - - -def data_file_path(file_name, mandatory=True, allow_glob=False): - - return cascading_input_file_path( - file_name, "data_dir", mandatory=mandatory, allow_glob=allow_glob - ) - - -def expand_input_file_list(input_files): - """ - expand list by unglobbing globs globs - """ - - # be nice and accept a string as well as a list of strings - if isinstance(input_files, str): - input_files = [input_files] - - expanded_files = [] - ungroked_files = 0 - - for file_name in input_files: - - file_name = data_file_path(file_name, allow_glob=True) - - if os.path.isfile(file_name): - expanded_files.append(file_name) - continue - - if os.path.isdir(file_name): - logger.warning( - "WARNING: expand_input_file_list skipping directory: " - "(use glob instead): %s", - file_name, - ) - ungroked_files += 1 - continue - - # - glob - logger.debug(f"expand_input_file_list trying {file_name} as glob") - globbed_files = glob.glob(file_name) - for globbed_file in globbed_files: - if os.path.isfile(globbed_file): - expanded_files.append(globbed_file) - else: - logger.warning( - "WARNING: expand_input_file_list skipping: " "(does not grok) %s", - file_name, - ) - 
ungroked_files += 1 - - if len(globbed_files) == 0: - logger.warning( - "WARNING: expand_input_file_list file/glob not found: %s", file_name - ) - - assert ungroked_files == 0, f"{ungroked_files} ungroked file names" - - return sorted(expanded_files) - - -def config_file_path(file_name, mandatory=True): - - return cascading_input_file_path(file_name, "configs_dir", mandatory) - - -def output_file_path(file_name): - - prefix = inject.get_injectable("output_file_prefix", None) - return build_output_file_path(file_name, use_prefix=prefix) - - -def profiling_file_path(file_name): - - profile_dir = inject.get_injectable("profile_dir", None) - if profile_dir is None: - output_dir = inject.get_injectable("output_dir") - profile_dir = os.path.join( - output_dir, time.strftime("profiling--%Y-%m-%d--%H-%M-%S") - ) - os.makedirs(profile_dir, exist_ok=True) - inject.add_injectable("profile_dir", profile_dir) - - return os.path.join(profile_dir, file_name) - - -def trace_file_path(file_name): - - output_dir = inject.get_injectable("output_dir") - - # - check for trace subfolder, create it if missing - trace_dir = os.path.join(output_dir, "trace") - if not os.path.exists(trace_dir): - os.makedirs(trace_dir) - - # construct a unique tail string from the time - # this is a convenience for opening multiple similarly named trace files - tail = hex(struct.unpack(" in directories in configs_dir list, - read settings from yaml file and return as dict. - - Settings file may contain directives that affect which file settings are returned: - - inherit_settings: boolean - backfill settings in the current file with values from the next settings file in configs_dir list - include_settings: string - read settings from specified include_file in place of the current file settings - (to avoid confusion, this directive must appea ALONE in fiel, without any additional settings or directives.) 
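# --- Illustrative aside (not part of the patch) -----------------------------
# The inheritance behavior described in this (removed) docstring merges
# settings files with the nearest file winning: values already read take
# precedence and an inherited file only backfills what is missing, mirroring
# the removed backfill_settings() helper. A minimal sketch of that precedence
# rule, with made-up keys for illustration:

def backfill(settings: dict, inherited: dict) -> dict:
    merged = inherited.copy()
    merged.update(settings)  # the current file's values override inherited ones
    return merged


# backfill({"chunk_size": 0}, {"chunk_size": 100_000, "sharrow": True})
# returns {"chunk_size": 0, "sharrow": True}
# ----------------------------------------------------------------------------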
- - Parameters - ---------- - file_name - mandatory: booelan - if true, raise SettingsFileNotFound exception if no settings file, otherwise return empty dict - include_stack: boolean or list - only used for recursive calls to provide list of files included so far to detect cycles - - Returns: dict - settings from speciified settings file/s - ------- - + set warning filter to 'strict' if specified in settings """ - def backfill_settings(settings, backfill): - new_settings = backfill.copy() - new_settings.update(settings) - return new_settings - - if configs_dir_list is None: - configs_dir_list = inject.get_injectable("configs_dir") - configs_dir_list = ( - [configs_dir_list] - if isinstance(configs_dir_list, str) - else configs_dir_list - ) - assert isinstance(configs_dir_list, list) - assert len(configs_dir_list) == len( - set(configs_dir_list) - ), f"repeating file names not allowed in config_dir list: {configs_dir_list}" - - args = util.parse_suffix_args(file_name) - file_name = args.filename - - assert isinstance(args.ROOTS, list) - assert (args.SUFFIX is not None and args.ROOTS) or ( - args.SUFFIX is None and not args.ROOTS - ), ("Expected to find both 'ROOTS' and 'SUFFIX' in %s, missing one" % args.filename) - - if not file_name.lower().endswith(".yaml"): - file_name = "%s.yaml" % (file_name,) - - inheriting = False - settings = {} - if isinstance(include_stack, list): - source_file_paths = include_stack.copy() + if state is None: + strict = False else: - source_file_paths = [] - for dir in configs_dir_list: - file_path = os.path.join(dir, file_name) - if os.path.exists(file_path): - if inheriting: - # we must be inheriting - logger.debug( - "inheriting additional settings for %s from %s" - % (file_name, file_path) - ) - inheriting = True - - assert ( - file_path not in source_file_paths - ), f"read_settings_file - recursion in reading 'file_path' after loading: {source_file_paths}" - - with open(file_path) as f: - - s = yaml.load(f, Loader=yaml.SafeLoader) - if s is None: - s = {} - - settings = backfill_settings(settings, s) - - # maintain a list of files we read from to improve error message when an expected setting is not found - source_file_paths += [file_path] - - include_file_name = s.get("include_settings", False) - if include_file_name: - # FIXME - prevent users from creating borgesian garden of branching paths? - # There is a lot of opportunity for confusion if this feature were over-used - # Maybe we insist that a file with an include directive is the 'end of the road' - # essentially the current settings firle is an alias for the included file - if len(s) > 1: - logger.error( - "'include_settings' must appear alone in settings file." - ) - additional_settings = list( - set(s.keys()).difference({"include_settings"}) - ) - logger.error( - f"Unexpected additional settings: {additional_settings}" - ) - raise RuntimeError( - "'include_settings' must appear alone in settings file." 
- ) - - logger.debug( - "including settings for %s from %s" % (file_name, include_file_name) - ) - - # recursive call to read included file INSTEAD of the file with include_settings sepcified - s, source_file_paths = read_settings_file( - include_file_name, mandatory=True, include_stack=source_file_paths - ) - - # FIXME backfill with the included file - settings = backfill_settings(settings, s) - - # we are done as soon as we read one file successfully - # unless if inherit_settings is set to true in this file - - if not s.get("inherit_settings", False): - break - - # if inheriting, continue and backfill settings from the next existing settings file configs_dir_list - - inherit_settings = s.get("inherit_settings") - if isinstance(inherit_settings, str): - inherit_file_name = inherit_settings - assert ( - os.path.join(dir, inherit_file_name) not in source_file_paths - ), f"circular inheritance of {inherit_file_name}: {source_file_paths}: " - # make a recursive call to switch inheritance chain to specified file - - logger.debug( - "inheriting additional settings for %s from %s" - % (file_name, inherit_file_name) - ) - s, source_file_paths = read_settings_file( - inherit_file_name, - mandatory=True, - include_stack=source_file_paths, - configs_dir_list=configs_dir_list, - ) - - # backfill with the inherited file - settings = backfill_settings(settings, s) - break # break the current inheritance chain (not as bad luck as breaking a chain-letter chain?...) - - if len(source_file_paths) > 0: - settings["source_file_paths"] = source_file_paths - - if mandatory and not settings: - raise SettingsFileNotFound(file_name, configs_dir_list) - - # Adds proto_ suffix for disaggregate accessibilities - if args.SUFFIX is not None and args.ROOTS: - settings = util.suffix_tables_in_settings(settings, args.SUFFIX, args.ROOTS) - - if include_stack: - # if we were called recursively, return an updated list of source_file_paths - return settings, source_file_paths + strict = state.settings.treat_warnings_as_errors - else: - return settings - - -def base_settings_file_path(file_name): - """ - - Parameters - ---------- - file_name - - Returns - ------- - path to base settings file or None if not found - """ - - if not file_name.lower().endswith(".yaml"): - file_name = "%s.yaml" % (file_name,) - - configs_dir = inject.get_injectable("configs_dir") - configs_dir = [configs_dir] if isinstance(configs_dir, str) else configs_dir - - for dir in configs_dir: - file_path = os.path.join(dir, file_name) - if os.path.exists(file_path): - return file_path - - raise RuntimeError("base_settings_file %s not found" % file_name) - - -def filter_warnings(): - """ - set warning filter to 'strict' if specified in settings - """ - - if setting("strict", False): # noqa: E402 + if strict: # noqa: E402 warnings.filterwarnings("error", category=Warning) warnings.filterwarnings( "default", category=PendingDeprecationWarning, module="future" @@ -705,10 +138,11 @@ def filter_warnings(): # beginning pandas version 1.3, various places emit a PerformanceWarning that is # caught in the "strict" filter above, but which are currently unavoidable for complex models. - # These warning are left as warnings as an invitation for future enhancement. + # Turning this filter back to "default" could be a good helper for finding places to + # look for future performance enhancements. 
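# --- Illustrative aside (not part of the patch) -----------------------------
# filter_warnings() now takes the State and reads its strict switch from the
# typed settings (treat_warnings_as_errors) instead of setting("strict", False).
# A hedged sketch of the call site, matching the run.py hunk earlier in this
# patch; "state" is a placeholder:

import logging

from activitysim.core import config, workflow


def configure_warning_handling(state: workflow.State) -> None:
    # escalates warnings to errors when state.settings.treat_warnings_as_errors is set
    config.filter_warnings(state)
    # route captured warnings through the logging system, as run.py does
    logging.captureWarnings(capture=True)
# ----------------------------------------------------------------------------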
from pandas.errors import PerformanceWarning - warnings.filterwarnings("default", category=PerformanceWarning) + warnings.filterwarnings("ignore", category=PerformanceWarning) # pandas 1.5 # beginning in pandas version 1.5, a new warning is emitted when a column is set via iloc @@ -757,21 +191,3 @@ def filter_warnings(): category=FutureWarning, message="The trip_scheduling component now has a logic_version setting.*", ) - - -def handle_standard_args(parser=None): - - from activitysim.cli import run - - warnings.warn( - "config.handle_standard_args() has been moved to the command line " - "module and will be removed in future versions.", - FutureWarning, - ) - - if parser is None: - parser = argparse.ArgumentParser() - - run.add_run_args(parser) - args = parser.parse_args() - run.handle_standard_args(args) diff --git a/activitysim/core/configuration.py b/activitysim/core/configuration.py deleted file mode 100644 index 3174653b25..0000000000 --- a/activitysim/core/configuration.py +++ /dev/null @@ -1,291 +0,0 @@ -from typing import Union - -try: - from pydantic import BaseModel as PydanticBase -except ModuleNotFoundError: - - class PydanticBase: - pass - - -class InputTable(PydanticBase): - """ - The features that define an input table to be read by ActivitySim. - """ - - tablename: str - """Name of the injected table""" - - filename: str = None - """ - Name of the CSV or HDF5 file to read. - - If not provided, defaults to `input_store` - """ - - index_col: str = None - """table column to use for the index""" - - rename_columns: dict[str, str] = None - """dictionary of column name mappings""" - - keep_columns: list[str] = None - """ - columns to keep once read in to memory. - - Save only the columns needed for modeling or analysis to save on memory - and file I/O - """ - - h5_tablename: str = None - """table name if reading from HDF5 and different from `tablename`""" - - -class Settings(PydanticBase): - """ - The overall settings for the ActivitySim model system. - - The input for these settings is typically stored in one main YAML file, - usually called ``settings.yaml``. - - Note that this implementation is presently used only for generating - documentation, but future work may migrate the settings implementation to - actually use this pydantic code to validate the settings before running - the model. - """ - - models: list[str] - """ - list of model steps to run - auto ownership, tour frequency, etc. - - See :ref:`model_steps` for more details about each step. - """ - - resume_after: str = None - """to resume running the data pipeline after the last successful checkpoint""" - - input_table_list: list[InputTable] - """list of table names, indices, and column re-maps for each table in `input_store`""" - - input_store: str = None - """HDF5 inputs file""" - - create_input_store: bool = False - """ - Write the inputs as read in back to an HDF5 store. - - If enabled, this writes the store to the outputs folder to use for subsequent - model runs, as reading HDF5 can be faster than reading CSV files.""" - - households_sample_size: int = None - """ - Number of households to sample and simulate - - If omitted or set to 0, ActivitySim will simulate all households. - """ - trace_hh_id: Union[int, list] = None - """ - Trace household id(s) - - If omitted, no tracing is written out - """ - - trace_od: list[int] = None - """ - Trace origin, destination pair in accessibility calculation - - If omitted, no tracing is written out. 
- """ - - chunk_training_mode: str = None - """ - The method to use for chunk training. - - Valid values include {disabled, training, production, adaptive}. - See :ref:`chunk_size` for more details. - """ - - chunk_size: int = None - """ - Approximate amount of RAM to allocate to ActivitySim for batch processing. - - See :ref:`chunk_size` for more details. - """ - - chunk_method: str = None - """ - Memory use measure to use for chunking. - - See :ref:`chunk_size`. - """ - - checkpoints: Union[bool, list] = True - """ - When to write checkpoint (intermediate table states) to disk. - - If True, checkpoints are written at each step. If False, no intermediate - checkpoints will be written before the end of run. Or, provide an explicit - list of models to checkpoint. - """ - - check_for_variability: bool = False - """ - Debugging feature to find broken model specifications. - - Enabling this check does not alter valid results but slows down model runs. - """ - - log_alt_losers: bool = False - """ - Write out expressions when all alternatives are unavailable. - - This can be useful for model development to catch errors in specifications. - Enabling this check does not alter valid results but slows down model runs. - """ - - use_shadow_pricing: bool = False - """turn shadow_pricing on and off for work and school location""" - - output_tables: list[str] = None - """list of output tables to write to CSV or HDF5""" - - want_dest_choice_sample_tables: bool = False - """turn writing of sample_tables on and off for all models""" - - cleanup_pipeline_after_run: bool = False - """ - Cleans up pipeline after successful run. - - This will clean up pipeline only after successful runs, by creating a - single-checkpoint pipeline file, and deleting any subprocess pipelines. - """ - - sharrow: Union[bool, str] = False - """ - Set the sharrow operating mode. - - .. versionadded:: 1.2 - - * `false` - Do not use sharrow. This is the default if no value is given. - * `true` - Use sharrow optimizations when possible, but fall back to - legacy `pandas.eval` systems when any error is encountered. This is the - preferred mode for running with sharrow if reliability is more important - than performance. - * `require` - Use sharrow optimizations, and raise an error if they fail - unexpectedly. This is the preferred mode for running with sharrow - if performance is a concern. - * `test` - Run every relevant calculation using both sharrow and legacy - systems, and compare them to ensure the results match. This is the slowest - mode of operation, but useful for development and debugging. - """ - - -class ZarrDigitalEncoding(PydanticBase): - """Digital encoding instructions for skim tables. - - .. versionadded:: 1.2 - """ - - regex: str - """A regular expression for matching skim matrix names. - - All skims with names that match under typical regular expression rules - for Python will be processed together. - """ - - joint_dict: str - """The name of the joint dictionary for this group. - - This must be a unique name for this set of skims, and a new array - will be added to the Dataset with this name. It will be an integer- - type array indicating the position of each element in the jointly - encoded dictionary.""" - - -class TAZ_Settings(PydanticBase): - """ - Complex settings for TAZ skims that are not just OMX file(s). - - .. versionadded:: 1.2 - """ - - omx: str = None - """The filename of the data stored in OMX format. - - This is treated as a fallback for the raw input data, if ZARR format data - is not available. 
- """ - - zarr: str = None - """The filename of the data stored in ZARR format. - - Reading ZARR data can be much faster than reading OMX format data, so if - this filename is given, the ZARR file format is preferred if it exists. If - it does not exist, then OMX data is read in and then ZARR data is written - out for future usage. - - .. versionadded:: 1.2 - """ - - zarr_digital_encoding: list[ZarrDigitalEncoding] = None - """ - A list of encodings to apply before saving skims in ZARR format. - - .. versionadded:: 1.2 - """ - - -class NetworkSettings(PydanticBase): - """ - Network level of service and skims settings - - The input for these settings is typically stored in one YAML file, - usually called ``network_los.yaml``. - """ - - zone_system: int - """Which zone system type is used. - - * 1 - TAZ only. - * 2 - MAZ and TAZ. - * 3 - MAZ, TAZ, and TAP - """ - - taz_skims: Union[str, TAZ_Settings] = None - """Instructions for how to load and pre-process skim matrices. - - If given as a string, it is interpreted as the location for OMX file(s), - either as a single file or as a glob-matching pattern for multiple files. - The time period for the matrix must be represented at the end of the matrix - name and be seperated by a double_underscore (e.g. `BUS_IVT__AM` indicates base - skim BUS_IVT with a time period of AM. - - Alternatively, this can be given as a nested dictionary defined via the - TAZ_Settings class, which allows for ZARR transformation and pre-processing. - """ - - skim_time_periods: dict - """time period upper bound values and labels - - * ``time_window`` - total duration (in minutes) of the modeled time span (Default: 1440 minutes (24 hours)) - * ``period_minutes`` - length of time (in minutes) each model time period represents. Must be whole factor of ``time_window``. (Default: 60 minutes) - * ``periods`` - Breakpoints that define the aggregate periods for skims and assignment - * ``labels`` - Labels to define names for aggregate periods for skims and assignment - """ - - read_skim_cache: bool = False - """Read cached skims (using numpy memmap) from output directory. - - Reading from memmap is much faster than omx, but the memmap is a huge - uncompressed file. - """ - - write_skim_cache: bool = False - """Write memmapped cached skims to output directory. - - This is needed if you want to use the cached skims to speed up subsequent - runs. 
- """ - - cache_dir: str = None - """alternate dir to read/write cache files (defaults to output_dir)""" diff --git a/activitysim/core/configuration/__init__.py b/activitysim/core/configuration/__init__.py index 5cdd2f69c4..58c0a9690f 100644 --- a/activitysim/core/configuration/__init__.py +++ b/activitysim/core/configuration/__init__.py @@ -1,4 +1,6 @@ # flake8: noqa +from __future__ import annotations +from .filesystem import FileSystem from .network import * from .top import * diff --git a/activitysim/core/configuration/base.py b/activitysim/core/configuration/base.py index d6c5a3ed99..05733ea7e2 100644 --- a/activitysim/core/configuration/base.py +++ b/activitysim/core/configuration/base.py @@ -1,8 +1,32 @@ +from __future__ import annotations + from typing import Any, Union # noqa: F401 +from activitysim.core import configuration + try: from pydantic import BaseModel as PydanticBase except ModuleNotFoundError: class PydanticBase: pass + + +class PydanticReadable(PydanticBase): + @classmethod + def read_settings_file( + cls, + filesystem: "configuration.FileSystem", + file_name, + mandatory=True, + include_stack=False, + configs_dir_list=None, + ) -> PydanticReadable: + # pass through to read_settings_file, requires validator_class and provides type hinting for IDE's + return filesystem.read_settings_file( + file_name, + mandatory, + include_stack, + configs_dir_list, + validator_class=cls, + ) diff --git a/activitysim/core/configuration/filesystem.py b/activitysim/core/configuration/filesystem.py new file mode 100644 index 0000000000..d247747069 --- /dev/null +++ b/activitysim/core/configuration/filesystem.py @@ -0,0 +1,828 @@ +from __future__ import annotations + +import glob +import logging +import os +import struct +import time +from pathlib import Path + +import numba +import platformdirs +import yaml +from pydantic import DirectoryPath, validator + +from activitysim.core.configuration.base import PydanticBase +from activitysim.core.exceptions import SettingsFileNotFoundError +from activitysim.core.util import parse_suffix_args, suffix_tables_in_settings + +logger = logging.getLogger(__name__) + + +class FileSystem(PydanticBase, validate_assignment=True): + """ + Manage finding and loading files for ActivitySim's command line interface. + """ + + working_dir: DirectoryPath = None + """ + Name of the working directory. + + All other directories (configs, data, output, cache), when given as relative + paths, are assumed to be relative to this working directory. If it is not + provided, the usual Python current working directory is used. + """ + + configs_dir: tuple[Path, ...] = ("configs",) + """ + Name[s] of the config directory. + """ + + @validator("configs_dir") + def configs_dirs_must_exist(cls, configs_dir, values): + working_dir = values.get("working_dir", None) or Path.cwd() + for c in configs_dir: + c_full = working_dir.joinpath(c) + if not c_full.exists(): + raise ValueError(f"config directory {c_full} does not exist") + return configs_dir + + data_dir: tuple[Path, ...] = ("data",) + """ + Name of the data directory. + """ + + @validator("data_dir") + def data_dirs_must_exist(cls, data_dir, values): + working_dir = values.get("working_dir", None) or Path.cwd() + for d in data_dir: + d_full = working_dir.joinpath(d) + if not d_full.exists(): + raise ValueError(f"data directory {d_full} does not exist") + return data_dir + + output_dir: Path = "output" + """ + Name of the output directory. + + This directory will be created on access if it does not exist. 
+ """ + + profile_dir: Path = None + """ + Name of the output directory for pyinstrument profiling files. + + If not given, a unique time-stamped directory will be created inside + the usual output directory. + """ + + cache_dir: Path = None + """ + Name of the output directory for general cache files. + + If not given, a directory named "cache" will be created inside + the usual output directory. + """ + + sharrow_cache_dir: Path = None + """ + Name of the output directory for sharrow cache files. + + If not given, a directory named "__sharrowcache__" will be created inside + the general cache directory. + """ + + settings_file_name: str = "settings.yaml" + + pipeline_file_name: str = "pipeline" + """ + The name for the base pipeline file or directory. + """ + + @classmethod + def parse_args(cls, args): + self = cls() + + def _parse_arg(name, x): + v = getattr(args, x, None) + if v is not None: + setattr(self, name, v) + + _parse_arg("working_dir", "working_dir") + _parse_arg("settings_file_name", "settings_file") + _parse_arg("configs_dir", "config") + _parse_arg("data_dir", "data") + _parse_arg("output_dir", "output") + + return self + + def get_working_subdir(self, subdir) -> Path: + if self.working_dir: + return self.working_dir.joinpath(subdir) + else: + return Path(subdir) + + def get_output_dir(self, subdir=None) -> Path: + """ + Get an output directory, creating it if needed. + + Parameters + ---------- + subdir : Path-like, optional + If given, get this subdirectory of the output_dir. + + Returns + ------- + Path + """ + out = self.get_working_subdir(self.output_dir) + if subdir is not None: + out = out.joinpath(subdir) + if not out.exists(): + out.mkdir(parents=True) + return out + + def get_output_file_path(self, file_name) -> Path: + return self.get_output_dir().joinpath(file_name) + + def get_pipeline_filepath(self) -> Path: + """ + Get the complete path to the pipeline file or directory. + + Returns + ------- + Path + """ + return self.get_output_dir().joinpath(self.pipeline_file_name) + + def get_profiling_file_path(self, file_name) -> Path: + """ + Get the complete path to a profile output file. + + Parameters + ---------- + file_name : str + Base name of the profiling output file. + + Returns + ------- + Path + """ + if self.profile_dir is None: + profile_dir = self.get_output_dir( + time.strftime("profiling--%Y-%m-%d--%H-%M-%S") + ) + profile_dir.mkdir(parents=True, exist_ok=True) + self.profile_dir = profile_dir + return self.profile_dir.joinpath(file_name) + + def get_log_file_path(self, file_name) -> Path: + """ + Get the complete path to a log file. + + Parameters + ---------- + file_name : str + Base name of the log file. + + Returns + ------- + Path + """ + + output_dir = self.get_output_dir() + + # - check if running asv and if so, log to commit-specific subfolder + asv_commit = os.environ.get("ASV_COMMIT", None) + if asv_commit: + output_dir = os.path.join(output_dir, f"log-{asv_commit}") + os.makedirs(output_dir, exist_ok=True) + + # - check for optional log subfolder + if os.path.exists(os.path.join(output_dir, "log")): + output_dir = os.path.join(output_dir, "log") + + file_path = os.path.join(output_dir, file_name) + + return Path(file_path) + + def get_trace_file_path( + self, file_name, tail=None, trace_dir=None, create_dirs=True, file_type=None + ): + """ + Get the complete path to a trace file. + + Parameters + ---------- + file_name : str + Base name of the trace file. + tail : str or False, optional + Add this suffix to filenames. 
If not given, a quasi-random short + string is derived from the current time. Set to `False` to omit + the suffix entirely. Having a unique suffix makes it easier to + open multiple comparable trace files side-by-side in Excel, which + doesn't allow identically named files to be open simultaneously. + Omitting the suffix can be valuable for using automated tools to + find file differences across many files simultaneously. + trace_dir : path-like, optional + Construct the trace file path within this directory. If not + provided (typically for normal operation) the "trace" sub-directory + of the normal output directory given by `get_output_dir` is used. + The option to give a different location is primarily used to + conduct trace file validation testing. + create_dirs : bool, default True + If the path to the containing directory of the trace file does not + yet exist, create it. + file_type : str, optional + If provided, ensure that the generated file path has this extension. + + Returns + ------- + Path + """ + if trace_dir is None: + output_dir = self.get_output_dir() + + # - check for trace subfolder, create it if missing + trace_dir = output_dir.joinpath("trace") + if not trace_dir.exists(): + trace_dir.mkdir(parents=True) + + if tail is None: + # construct a unique tail string from the time + # this is a convenience for opening multiple similarly named trace files + tail = ( + "-" + + hex(struct.unpack(" Path: + """ + Get the cache directory, creating it if needed. + + The cache directory is used to store: + - skim memmaps created by skim+dict_factories + - tvpb tap_tap table cache + - pre-compiled sharrow modules + + + Parameters + ---------- + subdir : Path-like, optional + If given, get this subdirectory of the output_dir. + + Returns + ------- + Path + """ + if self.cache_dir is None: + out = self.get_output_dir("cache") + else: + out = self.get_working_subdir(self.cache_dir) + if subdir is not None: + out = out.joinpath(subdir) + if not out.exists(): + out.mkdir(parents=True) + + # create a git-ignore in the cache dir if it does not exist. + # this helps prevent accidentally committing cache contents to git + gitignore = out.joinpath(".gitignore") + if not gitignore.exists(): + gitignore.write_text("/**") + + return out + + def get_sharrow_cache_dir(self) -> Path: + """ + Get the sharrow cache directory, creating it if needed. + + The sharrow cache directory is used to store only sharrow's cache + of pre-compiled functions. + + Returns + ------- + Path + """ + if self.sharrow_cache_dir is None: + out = self.get_cache_dir("__sharrowcache__") + else: + out = self.get_working_subdir(self.sharrow_cache_dir) + if not out.exists(): + out.mkdir(parents=True) + + # create a git-ignore in the sharrow cache dir if it does not exist. + # this helps prevent accidentally committing cache contents to git + gitignore = out.joinpath(".gitignore") + if not gitignore.exists(): + gitignore.write_text("/**") + + return out + + def persist_sharrow_cache(self) -> None: + """ + Change the sharrow cache directory to a persistent, user-global location. + + The change is made in-place to `sharrow_cache_dir` for this object. The + location for the cache is selected by `platformdirs.user_cache_dir`. + An extra directory layer based on the current numba version is also added + to the cache directory, which allows for different sets of cache files to + co-exist for different version of numba (i.e. different conda envs). 
+ This location is not configurable -- to select a different location, + change the value of `FileSystem.sharrow_cache_dir` itself. + + See Also + -------- + FileSystem.sharrow_cache_dir + """ + self.sharrow_cache_dir = Path( + platformdirs.user_cache_dir(appname="ActivitySim") + ).joinpath(f"numba-{numba.__version__}") + self.sharrow_cache_dir.mkdir(parents=True, exist_ok=True) + + def _cascading_input_file_path( + self, file_name, dir_list_injectable_name, mandatory=True, allow_glob=False + ) -> Path: + """ + Find the first matching file among a group of directories. + + Parameters + ---------- + file_name : Path-like + The name of the file to match. + dir_list_injectable_name : {'configs_dir', 'data_dir'} + The group of directories to search. + mandatory : bool, default True + Raise a FileNotFoundError if no match is found. If set to False, + this method returns None when there is no match. + allow_glob : bool, default False + Allow glob-style matches. + + Returns + ------- + Path or None + """ + if dir_list_injectable_name == "configs_dir": + dir_paths = self.get_configs_dir() + elif dir_list_injectable_name == "data_dir": + dir_paths = self.get_data_dir() + else: + dir_paths = getattr(self, dir_list_injectable_name) + dir_paths = [dir_paths] if isinstance(dir_paths, str) else dir_paths + + file_path = None + if file_name is not None: + for dir in dir_paths: + p = os.path.join(dir, file_name) + if os.path.isfile(p): + file_path = p + break + + if allow_glob and len(glob.glob(p)) > 0: + file_path = p + break + + if mandatory and not file_path: + raise FileNotFoundError( + "file_path %s: file '%s' not in %s" + % (dir_list_injectable_name, file_name, [str(i) for i in dir_paths]) + ) + + return Path(file_path) if file_path else None + + def expand_input_file_list(self, input_files) -> list[Path]: + """ + expand a list of file names by resolving any glob patterns + """ + + # be nice and accept a string as well as a list of strings + if isinstance(input_files, (str, Path)): + input_files = [Path(input_files)] + else: + input_files = [Path(i) for i in input_files] + + expanded_files = [] + ungroked_files = 0 + + for file_name in input_files: + file_name = self.get_data_file_path(file_name, allow_glob=True) + + if file_name.is_file(): + expanded_files.append(file_name) + continue + + if file_name.is_dir(): + logger.warning( + "WARNING: expand_input_file_list skipping directory: " + f"(use glob instead): {file_name}", + ) + ungroked_files += 1 + continue + + # - not an exact match, could be a glob pattern + logger.debug(f"expand_input_file_list trying {file_name} as glob") + globbed_files = glob.glob(str(file_name)) + for globbed_file in globbed_files: + if os.path.isfile(globbed_file) or os.path.islink(globbed_file): + expanded_files.append(Path(globbed_file)) + else: + logger.warning( + "WARNING: expand_input_file_list skipping: " + f"(does not grok) {file_name}" + ) + ungroked_files += 1 + + if len(globbed_files) == 0: + logger.warning( + f"WARNING: expand_input_file_list file/glob not found: {file_name}", + ) + + assert ungroked_files == 0, f"{ungroked_files} ungroked file names" + + return sorted(expanded_files) + + def get_configs_dir(self) -> tuple[Path]: + """ + Get the configs directories. + + Returns + ------- + tuple[Path] + """ + return tuple(self.get_working_subdir(i) for i in self.configs_dir) + + def get_config_file_path(self, file_name, mandatory=True, allow_glob=False) -> Path: + """ + Find the first matching file among config directories.
+ + Parameters + ---------- + file_name : Path-like + The name of the file to match. + mandatory : bool, default True + Raise a FileNotFoundError if no match is found. If set to False, + this method returns None when there is no match. + allow_glob : bool, default False + Allow glob-style matches. + + Returns + ------- + Path or None + """ + return self._cascading_input_file_path( + file_name, "configs_dir", mandatory, allow_glob + ) + + def get_data_dir(self) -> tuple[Path]: + """ + Get the data directories. + + Returns + ------- + tuple[Path] + """ + return tuple(self.get_working_subdir(i) for i in self.data_dir) + + def get_data_file_path( + self, file_name, mandatory=True, allow_glob=False, alternative_suffixes=() + ) -> Path: + """ + Find the first matching file among data directories. + + Parameters + ---------- + file_name : Path-like + The name of the file to match. + mandatory : bool, default True + Raise a FileNotFoundError if no match is found. If set to False, + this method returns None when there is no match. + allow_glob : bool, default False + Allow glob-style matches. + alternative_suffixes : Iterable[str], optional + Other file suffixes to search for, if the expected filename is + not found. This allows, for example, the data files to be stored + as compressed csv ("*.csv.gz") without changing the config files. + + Returns + ------- + Path or None + """ + try: + return self._cascading_input_file_path( + file_name, "data_dir", mandatory, allow_glob + ) + except FileNotFoundError: + if not allow_glob: + file_name = Path(file_name) + for alt in alternative_suffixes: + alt_file = self._cascading_input_file_path( + file_name.with_suffix(alt), "data_dir", mandatory=False + ) + if alt_file: + return alt_file + raise + + def open_log_file(self, file_name, mode, header=None, prefix=False): + if prefix: + file_name = f"{prefix}-{file_name}" + file_path = self.get_log_file_path(file_name) + + want_header = header and not os.path.exists(file_path) + + f = open(file_path, mode) + + if want_header: + assert mode in [ + "a", + "w", + ], f"open_log_file: header requested but mode was {mode}" + print(header, file=f) + + return f + + def read_settings_file( + self, + file_name, + mandatory=True, + include_stack=False, + configs_dir_list=None, + validator_class=None, + ): + """ + Load settings from one or more yaml files. + + This method will look for first occurrence of a yaml file named + in the directories in configs_dir list, and + read settings from that yaml file. + + Settings file may contain directives that affect which file settings + are returned: + + - inherit_settings (boolean) + If found and set to true, this method will backfill settings + in the current file with values from the next settings file + in configs_dir list (if any) + - include_settings: string + Read settings from specified include_file in place of the current + file. To avoid confusion, this directive must appear ALONE in the + target file, without any additional settings or directives. + + Parameters + ---------- + file_name : str + mandatory : boolean, default True + If true, raise SettingsFileNotFoundError if no matching settings file + is found in any config directory, otherwise this method will return + an empty dict or an all-default instance of the validator class. + include_stack : boolean or list + Only used for recursive calls, provides a list of files included + so far to detect and prevent cycles. 
+ validator_class : type[pydantic.BaseModel], optional + This model is used to validate the loaded settings. + + Returns + ------- + dict or validator_class + """ + + def backfill_settings(settings, backfill): + new_settings = backfill.copy() + new_settings.update(settings) + return new_settings + + if configs_dir_list is None: + configs_dir_list = self.get_configs_dir() + assert len(configs_dir_list) == len( + set(configs_dir_list) + ), f"repeating file names not allowed in config_dir list: {configs_dir_list}" + + args = parse_suffix_args(file_name) + file_name = args.filename + + assert isinstance(args.ROOTS, list) + assert (args.SUFFIX is not None and args.ROOTS) or ( + args.SUFFIX is None and not args.ROOTS + ), ( + "Expected to find both 'ROOTS' and 'SUFFIX' in %s, missing one" + % args.filename + ) + + if not file_name.lower().endswith(".yaml"): + file_name = "%s.yaml" % (file_name,) + + inheriting = False + settings = {} + if isinstance(include_stack, list): + source_file_paths = include_stack.copy() + else: + source_file_paths = [] + for dir in configs_dir_list: + file_path = os.path.join(dir, file_name) + if os.path.exists(file_path): + if inheriting: + # we must be inheriting + logger.debug( + "inheriting additional settings for %s from %s" + % (file_name, file_path) + ) + inheriting = True + + assert ( + file_path not in source_file_paths + ), f"read_settings_file - recursion in reading 'file_path' after loading: {source_file_paths}" + + with open(file_path) as f: + s = yaml.load(f, Loader=yaml.SafeLoader) + if s is None: + s = {} + + settings = backfill_settings(settings, s) + + # maintain a list of files we read from to improve error message when an expected setting is not found + source_file_paths += [file_path] + + include_file_name = s.get("include_settings", False) + if include_file_name: + # FIXME - prevent users from creating borgesian garden of branching paths? + # There is a lot of opportunity for confusion if this feature were over-used + # Maybe we insist that a file with an include directive is the 'end of the road' + # essentially the current settings file is an alias for the included file + if len(s) > 1: + logger.error( + "'include_settings' must appear alone in settings file." + ) + additional_settings = list( + set(s.keys()).difference({"include_settings"}) + ) + logger.error( + f"Unexpected additional settings: {additional_settings}" + ) + raise RuntimeError( + "'include_settings' must appear alone in settings file."
+ ) + + logger.debug( + "including settings for %s from %s" + % (file_name, include_file_name) + ) + + # recursive call to read included file INSTEAD of the file with include_settings specified + s, source_file_paths = self.read_settings_file( + include_file_name, + mandatory=True, + include_stack=source_file_paths, + ) + + # FIXME backfill with the included file + settings = backfill_settings(settings, s) + + # we are done as soon as we read one file successfully + # unless inherit_settings is set to true in this file + + if not s.get("inherit_settings", False): + break + + # if inheriting, continue and backfill settings from the next existing settings file configs_dir_list + + inherit_settings = s.get("inherit_settings") + if isinstance(inherit_settings, str): + inherit_file_name = inherit_settings + assert ( + os.path.join(dir, inherit_file_name) not in source_file_paths + ), f"circular inheritance of {inherit_file_name}: {source_file_paths}: " + # make a recursive call to switch inheritance chain to specified file + + logger.debug( + "inheriting additional settings for %s from %s" + % (file_name, inherit_file_name) + ) + s, source_file_paths = self.read_settings_file( + inherit_file_name, + mandatory=True, + include_stack=source_file_paths, + configs_dir_list=configs_dir_list, + ) + + # backfill with the inherited file + settings = backfill_settings(settings, s) + break # break the current inheritance chain (not as bad luck as breaking a chain-letter chain?...) + + if len(source_file_paths) > 0: + settings["source_file_paths"] = source_file_paths + + if mandatory and not settings: + raise SettingsFileNotFoundError(file_name, configs_dir_list) + + # Adds proto_ suffix for disaggregate accessibilities + if args.SUFFIX is not None and args.ROOTS: + settings = suffix_tables_in_settings(settings, args.SUFFIX, args.ROOTS) + + # we don't want to actually have inherit_settings as a setting + settings.pop("inherit_settings", None) + + if validator_class is not None: + settings = validator_class.parse_obj(settings) + + if include_stack: + # if we were called recursively, return an updated list of source_file_paths + return settings, source_file_paths + + else: + return settings + + def read_model_settings( + self, + file_name, + mandatory=False, + ): + # in the legacy implementation, this function has a default mandatory=False + return self.read_settings_file(file_name, mandatory=mandatory) + + def read_model_spec(self, file_name: str): + from activitysim.core import simulate + + return simulate.read_model_spec(self, file_name) + + def read_model_coefficients(self, model_settings=None, file_name=None): + from activitysim.core import simulate + + return simulate.read_model_coefficients( + self, model_settings=model_settings, file_name=file_name + ) + + def get_segment_coefficients(self, model_settings, segment_name): + from activitysim.core import simulate + + return simulate.get_segment_coefficients(self, model_settings, segment_name) diff --git a/activitysim/core/configuration/network.py b/activitysim/core/configuration/network.py index c047f117da..44b13da764 100644 --- a/activitysim/core/configuration/network.py +++ b/activitysim/core/configuration/network.py @@ -1,4 +1,14 @@ -from .base import Any, PydanticBase, Union +from __future__ import annotations + +from pathlib import Path +from typing import Literal + +from activitysim.core.configuration.base import ( + Any, + PydanticBase, + PydanticReadable, + Union, +) class DigitalEncoding(PydanticBase): @@ -137,7 +147,16 @@ class
TAZ_Settings(PydanticBase): """ -class NetworkSettings(PydanticBase): +class MazToMazSettings(PydanticBase, extra="forbid"): + tables: list[str] + + max_blend_distance: dict[str, float] = None + + blend_distance_skim_name: str = None + """The name of the skim table used to blend distances for MAZs.""" + + +class NetworkSettings(PydanticReadable, extra="forbid"): """ Network level of service and skims settings @@ -145,6 +164,9 @@ class NetworkSettings(PydanticBase): usually called ``network_los.yaml``. """ + name: str = None + """Name of this network, not used for anything?""" + zone_system: int """Which zone system type is used. @@ -153,14 +175,14 @@ class NetworkSettings(PydanticBase): * 3 - MAZ, TAZ, and TAP """ - taz_skims: Union[str, TAZ_Settings] = None + taz_skims: Union[str, list[str], TAZ_Settings] = None """Instructions for how to load and pre-process skim matrices. - If given as a string, it is interpreted as the location for OMX file(s), - either as a single file or as a glob-matching pattern for multiple files. - The time period for the matrix must be represented at the end of the matrix - name and be seperated by a double_underscore (e.g. `BUS_IVT__AM` indicates base - skim BUS_IVT with a time period of AM. + If given as a string or a list of strings, it is interpreted as the location + for OMX file(s), either as a single file or as a glob-matching pattern for + multiple files. The time period for the matrix must be represented at the end + of the matrix name and be separated by a double_underscore (e.g. `BUS_IVT__AM` + indicates base skim BUS_IVT with a time period of AM). Alternatively, this can be given as a nested dictionary defined via the TAZ_Settings class, which allows for ZARR transformation and pre-processing. @@ -189,5 +211,65 @@ class NetworkSettings(PydanticBase): runs. """ - cache_dir: str = None + network_cache_dir: str = None """alternate dir to read/write cache files (defaults to output_dir)""" + + #### 2 ZONE #### + + maz: str = None + """Filename for the MAZ data file. + + This file should contain the MAZ ID, TAZ, and land use and other MAZ attributes + """ + + maz_to_maz: MazToMazSettings = None + """Settings to manage maz-to-maz level of service in 2- and 3-zone models.""" + + #### 3 ZONE #### + + tap: str = None + """Filename for the TAP data file. + + This file should contain the TAP ID and other TAP attributes + """ + + maz_to_tap: dict[str, Any] = None + """Settings to manage maz-to-tap level of service in 3-zone models.""" + + demographic_segments: Any = None + + tap_skims: Union[str, list[str]] = None + + tap_lines: str = None + """TAP lines filename.""" + + TVPB_SETTINGS: Any = None + + rebuild_tvpb_cache: bool = True + """ + rebuild and overwrite existing pre-computed TAP to TAP utilities cache + """ + + trace_tvpb_cache_as_csv: bool = False + """Write a CSV version of TVPB cache for tracing + + Not currently implemented.""" + + skim_dict_factory: Literal[ + "NumpyArraySkimFactory", + "MemMapSkimFactory", + ] = "NumpyArraySkimFactory" + """The skim dict factory to use. + + The MemMapSkimFactory is strictly experimental. + """ + + source_file_paths: list[Path] = None + """ + A list of source files from which these settings were loaded. + + This value should not be set by the user within the YAML settings files, + instead it is populated as those files are loaded. It is primarily + provided for debugging purposes, and does not actually affect the operation + of the model.
+ """ diff --git a/activitysim/core/configuration/top.py b/activitysim/core/configuration/top.py index f956bb6ea0..3fd38baee1 100644 --- a/activitysim/core/configuration/top.py +++ b/activitysim/core/configuration/top.py @@ -1,4 +1,9 @@ -from .base import PydanticBase, Union +from __future__ import annotations + +from pathlib import Path +from typing import Any, Literal + +from activitysim.core.configuration.base import PydanticBase, Union class InputTable(PydanticBase): @@ -9,14 +14,14 @@ class InputTable(PydanticBase): tablename: str """Name of the injected table""" - filename: str = None + filename: Path = None """ Name of the CSV or HDF5 file to read. If not provided, defaults to `input_store` """ - index_col: str = None + index_col: Union[str, None] = "NOTSET" """table column to use for the index""" rename_columns: dict[str, str] = None @@ -65,9 +70,23 @@ class InputTable(PydanticBase): and retained. """ + drop_columns: list[str] = None + """ + Columns to drop once read in to memory. + + Save only the columns needed for modeling or analysis to save on memory + and file I/O. If not given, all columns in the input file will be read + and retained. + """ + h5_tablename: str = None """table name if reading from HDF5 and different from `tablename`""" + dtypes: dict[str, str] = None + """ + dtypes for loaded columns + """ + class OutputTable(PydanticBase): tablename: str @@ -126,8 +145,17 @@ class OutputTables(PydanticBase): """ -class MultiprocessStepSlice(PydanticBase): - """Instructions on how to slice tables for each subprocess.""" +class MultiprocessStepSlice(PydanticBase, extra="forbid"): + """ + Instructions on how to slice tables for each subprocess. + + .. versionchanged:: 1.3 + + In ActivitySim versions 1.2 and earlier, slicing instructions for + multiprocess steps allowed for an "except" instruction, which has + been renamed to be "exclude" to avoid problems from using a reserved + Python keyword. + """ tables: list[str] """ @@ -147,12 +175,16 @@ class MultiprocessStepSlice(PydanticBase): names as the persons table. """ - exclude: Union[bool, str, list[str]] + exclude: Union[bool, str, list[str]] = None """ Optional list of tables not to slice even if they have a sliceable index name. Or set to `True` or "*" to exclude all tables not explicitly listed in `tables`. + + Note in ActivitySim versions 1.2 and earlier, this option was named "except" + instead of "exclude", but that is a reserved python keyword and cannot be + used as a Pydantic field name. """ @@ -178,8 +210,10 @@ class MultiprocessStep(PydanticBase): slice: MultiprocessStepSlice = None """Instructions on how to slice tables for each subprocess.""" + chunk_size: int = None + -class Settings(PydanticBase): +class Settings(PydanticBase, extra="allow", validate_assignment=True): """ The overall settings for the ActivitySim model system. @@ -192,7 +226,7 @@ class Settings(PydanticBase): the model. """ - models: list[str] + models: list[str] = None """ list of model steps to run - auto ownership, tour frequency, etc. @@ -210,13 +244,13 @@ class Settings(PydanticBase): half the number of available CPU cores, plus 1. 
""" - multiprocess_steps: list[MultiprocessStep] + multiprocess_steps: list[MultiprocessStep] = None """A list of multiprocess steps.""" resume_after: str = None """to resume running the data pipeline after the last successful checkpoint""" - input_table_list: list[InputTable] + input_table_list: list[InputTable] = None """list of table names, indices, and column re-maps for each table in `input_store`""" input_store: str = None @@ -235,21 +269,23 @@ class Settings(PydanticBase): If omitted or set to 0, ActivitySim will simulate all households. """ - trace_hh_id: Union[int, list] = None + trace_hh_id: int = None """ - Trace household id(s) + Trace this household id If omitted, no tracing is written out """ - trace_od: list[int] = None + trace_od: tuple[int, int] = None """ Trace origin, destination pair in accessibility calculation If omitted, no tracing is written out. """ - chunk_training_mode: str = None + chunk_training_mode: Literal[ + "disabled", "training", "production", "adaptive" + ] = "disabled" """ The method to use for chunk training. @@ -257,18 +293,71 @@ class Settings(PydanticBase): See :ref:`chunk_size` for more details. """ - chunk_size: int = None + chunk_size: int = 0 """ Approximate amount of RAM to allocate to ActivitySim for batch processing. See :ref:`chunk_size` for more details. """ - chunk_method: str = None + chunk_method: Literal[ + "bytes", + "uss", + "hybrid_uss", + "rss", + "hybrid_rss", + ] = "hybrid_uss" """ Memory use measure to use for chunking. - See :ref:`chunk_size`. + The following methods are supported to calculate memory overhead when chunking + is enabled: + + * "bytes" + expected rowsize based on actual size (as reported by numpy and + pandas) of explicitly allocated data this can underestimate overhead due + to transient data requirements of operations (e.g. merge, sort, transpose). + * "uss" + expected rowsize based on change in (unique set size) (uss) both as + a result of explicit data allocation, and readings by MemMonitor sniffer + thread that measures transient uss during time-consuming numpy and pandas + operations. + * "hybrid_uss" + hybrid_uss avoids problems with pure uss, especially with + small chunk sizes (e.g. initial training chunks) as numpy may recycle + cached blocks and show no increase in uss even though data was allocated + and logged. + * "rss" + like uss, but for resident set size (rss), which is the portion of + memory occupied by a process that is held in RAM. + * "hybrid_rss" + like hybrid_uss, but for rss + + RSS is reported by :py:meth:`psutil.Process.memory_info` and USS is reported by + :py:meth:`psutil.Process.memory_full_info`. USS is the memory which is private to + a process and which would be freed if the process were terminated. This is + the metric that most closely matches the rather vague notion of memory + "in use" (the meaning of which is difficult to pin down in operating systems + with virtual memory where memory can (but sometimes can't) be swapped or + mapped to disk. Previous testing found `hybrid_uss` performs best and is most + reliable and is therefore the default. + + For more, see :ref:`chunk_size`. + """ + + keep_chunk_logs: bool = True + """ + Whether to keep chunk logs when deleting other files. + """ + + default_initial_rows_per_chunk: int = 100 + """ + Default number of rows to use in initial chunking. 
+ """ + + min_available_chunk_ratio: float = 0.05 + """ + minimum fraction of total chunk_size to reserve for adaptive chunking """ checkpoints: Union[bool, list] = True @@ -280,6 +369,11 @@ class Settings(PydanticBase): list of models to checkpoint. """ + checkpoint_format: Literal["hdf", "parquet"] = "parquet" + """ + Storage format to use when saving checkpoint files. + """ + check_for_variability: bool = False """ Debugging feature to find broken model specifications. @@ -414,13 +508,14 @@ class Settings(PydanticBase): This is generally a developer-only feature and not needed for regular usage of ActivitySim. - The data tables are written out before any annotation steps, but after - initial processing (renaming, filtering columns, recoding). + The data tables are written out to `/raw_tables` before any + annotation steps, but after initial processing (renaming, filtering columns, + recoding). """ disable_destination_sampling: bool = False - want_dest_choice_presampling: bool = False + want_dest_choice_presampling: bool = True testing_fail_trip_destination: bool = False @@ -439,7 +534,7 @@ class Settings(PydanticBase): developer-only feature for testing and development. """ - recode_pipeline_columns: bool = True + recode_pipeline_columns: bool = False """ Apply recoding instructions on input and final output for pipeline tables. @@ -457,3 +552,93 @@ class Settings(PydanticBase): """ keep_mem_logs: bool = False + + pipeline_complib: str = "NOTSET" + """ + Compression library to use when storing pipeline tables in an HDF5 file. + + .. versionadded:: 1.3 + """ + + treat_warnings_as_errors: bool = False + """ + Treat most warnings as errors. + + Use of this setting is not recommended outside of rigorous testing regimes. + + .. versionadded:: 1.3 + """ + + log_settings: tuple[str] = ( + "households_sample_size", + "chunk_size", + "chunk_method", + "chunk_training_mode", + "multiprocess", + "num_processes", + "resume_after", + "trace_hh_id", + "memory_profile", + "instrument", + ) + """ + Setting to log on startup. + """ + + hh_ids: Path = None + """ + Load only the household ids given in this file. + + The file need only contain the desired households ids, nothing else. + If given as a relative path (or just a file name), both the data and + config directories are searched, in that order, for the matching file. + """ + + source_file_paths: list[Path] = None + """ + A list of source files from which these settings were loaded. + + This value should not be set by the user within the YAML settings files, + instead it is populated as those files are loaded. It is primarily + provided for debugging purposes, and does not actually affect the operation + of the model. + """ + + inherit_settings: Union[bool, Path] = None + """ + Instruction on if and how to find other files that can provide settings. + + When this value is True, all config directories are searched in order for + additional files with the same filename. If other files are found they + are also loaded, but only settings values that are not already explicitly + set are applied. Alternatives, set this to a different file name, in which + case settings from that other file are loaded (again, backfilling unset + values only). Once the settings files are loaded, this value does not + have any other effect on the operation of the model(s). 
+ """ + + rng_base_seed: Union[int, None] = 0 + """Base seed for pseudo-random number generator.""" + + duplicate_step_execution: Literal["error", "allow"] = "error" + """ + How activitysim should handle attempts to re-run a step with the same name. + + .. versionadded:: 1.3 + + * "error" + Attempts to re-run a step that has already been run and + checkpointed will raise a `RuntimeError`, halting model execution. + This is the default if no value is given. + * "allow" + Attempts to re-run a step are allowed, potentially overwriting + the results from the previous time that step was run. + """ + + other_settings: dict[str, Any] = None + + def _get_attr(self, attr): + try: + return getattr(self, attr) + except AttributeError: + return self.other_settings.get(attr) diff --git a/activitysim/abm/models/util/estimation.py b/activitysim/core/estimation.py similarity index 93% rename from activitysim/abm/models/util/estimation.py rename to activitysim/core/estimation.py index 6a5dbadf1f..8077e33180 100644 --- a/activitysim/abm/models/util/estimation.py +++ b/activitysim/core/estimation.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import logging import os @@ -8,8 +9,7 @@ import pandas as pd import yaml -from activitysim.abm.models.util import canonical_ids as cid -from activitysim.core import config, simulate +from activitysim.core import simulate, workflow from activitysim.core.util import reindex logger = logging.getLogger("estimation") @@ -29,11 +29,14 @@ def unlink_files(directory_path, file_types=("csv", "yaml")): print(e) -class Estimator(object): - def __init__(self, bundle_name, model_name, estimation_table_recipes): +class Estimator: + def __init__( + self, state: workflow.State, bundle_name, model_name, estimation_table_recipes + ): logger.info("Initialize Estimator for'%s'" % (model_name,)) + self.state = state self.bundle_name = bundle_name self.model_name = model_name self.settings_name = model_name @@ -50,7 +53,8 @@ def __init__(self, bundle_name, model_name, estimation_table_recipes): if self.bundle_name != self.model_name: # kind of inelegant to always delete these, but ok as they are redundantly recreated for each sub model unlink_files( - self.output_directory(bundle_directory=True), file_types=("csv", "yaml") + self.output_directory(bundle_directory=True), + file_types=("csv", "yaml"), ) # FIXME - not required? 
@@ -125,7 +129,8 @@ def output_directory(self, bundle_directory=False): assert self.model_name is not None dir = os.path.join( - config.output_file_path("estimation_data_bundle"), self.bundle_name + self.state.filesystem.get_output_dir("estimation_data_bundle"), + self.bundle_name, ) if bundle_directory: @@ -277,7 +282,9 @@ def write_coefficients( assert file_name is not None if coefficients_df is None: - coefficients_df = simulate.read_model_coefficients(file_name=file_name) + coefficients_df = self.state.filesystem.read_model_coefficients( + file_name=file_name + ) # preserve original config file name base_file_name = os.path.basename(file_name) @@ -288,7 +295,9 @@ def write_coefficients( def write_coefficients_template(self, model_settings): assert self.estimating - coefficients_df = simulate.read_model_coefficient_template(model_settings) + coefficients_df = simulate.read_model_coefficient_template( + self.state.filesystem, model_settings + ) tag = "coefficients_template" self.write_table(coefficients_df, tag, append=False) @@ -317,7 +326,7 @@ def copy_model_settings( self, settings_file_name, tag="model_settings", bundle_directory=False ): - input_path = config.base_settings_file_path(settings_file_name) + input_path = self.state.filesystem.get_config_file_path(settings_file_name) output_path = self.output_file_path(tag, "yaml", bundle_directory) @@ -445,7 +454,7 @@ def write_spec( assert file_name is None file_name = model_settings[tag] - input_path = config.config_file_path(file_name) + input_path = self.state.filesystem.get_config_file_path(file_name) table_name = tag # more readable than full spec file_name output_path = self.output_file_path(table_name, "csv", bundle_directory) @@ -462,15 +471,21 @@ def __init__(self): self.model_estimation_table_types = {} self.estimating = {} - def initialize_settings(self): + def initialize_settings(self, state): # FIXME - can't we just initialize in init and handle no-presence of settings file as not enabled if self.settings_initialized: return assert not self.settings_initialized - settings = config.read_model_settings(ESTIMATION_SETTINGS_FILE_NAME) - self.enabled = settings.get("enable", "True") + settings = state.filesystem.read_model_settings( + ESTIMATION_SETTINGS_FILE_NAME, mandatory=False + ) + if not settings: + # if the model settings file is not found, we are not in estimation mode. 
+ self.enabled = False + else: + self.enabled = settings.get("enable", "True") self.bundles = settings.get("bundles", []) self.model_estimation_table_types = settings.get( @@ -488,7 +503,7 @@ def initialize_settings(self): table_name, ESTIMATION_SETTINGS_FILE_NAME, ) - file_path = config.data_file_path( + file_path = state.filesystem.get_data_file_path( table_info["file_name"], mandatory=True ) assert os.path.exists( @@ -507,21 +522,25 @@ def initialize_settings(self): self.settings_initialized = True - def begin_estimation(self, model_name, bundle_name=None): + def begin_estimation( + self, state: workflow.State, model_name: str, bundle_name=None + ) -> Estimator | None: """ begin estimating of model_name is specified as model to estimate, otherwise return False Parameters ---------- - model_name + state : workflow.State + model_name : str + bundle_name : str, optional Returns ------- - + Estimator or None """ # load estimation settings file if not self.settings_initialized: - self.initialize_settings() + self.initialize_settings(state) # global estimation setting if not self.enabled: @@ -558,6 +577,7 @@ def begin_estimation(self, model_name, bundle_name=None): ) self.estimating[model_name] = Estimator( + state, bundle_name, model_name, estimation_table_recipes=self.estimation_table_recipes[ diff --git a/activitysim/core/exceptions.py b/activitysim/core/exceptions.py new file mode 100644 index 0000000000..29d8f03a1a --- /dev/null +++ b/activitysim/core/exceptions.py @@ -0,0 +1,58 @@ +from __future__ import annotations + + +class PipelineError(ValueError): + """General class for errors in using a Pipeline.""" + + +class StateAccessError(PipelineError): + """Error trying to access a pipeline feature that is not yet initialized.""" + + +class TableTypeError(TypeError): + """Unable to return data in the format requested.""" + + +class DuplicateWorkflowNameError(ValueError): + """More than one workflow function is defined with the same name""" + + +class DuplicateWorkflowTableError(ValueError): + """More than one loadable table is defined with the same name""" + + +class DuplicateLoadableObjectError(ValueError): + """More than one loadable object is defined with the same name""" + + +class SettingsFileNotFoundError(FileNotFoundError): + def __init__(self, file_name, configs_dir): + self.file_name = file_name + self.configs_dir = configs_dir + + def __str__(self): + return repr(f"Settings file '{self.file_name}' not found in {self.configs_dir}") + + +class CheckpointFileNotFoundError(FileNotFoundError): + """The checkpoints file is not found.""" + + +class CheckpointNameNotFoundError(KeyError): + """The checkpoint_name is not found.""" + + +class TableNameNotFound(KeyError): + """The table_name is not found.""" + + +class MissingNameError(KeyError): + """The name is not found.""" + + +class ReadOnlyError(IOError): + """This object is read-only.""" + + +class MissingInputTableDefinition(RuntimeError): + """An input table definition was expected but not found.""" diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py index 728b6d440b..b57eca94cf 100644 --- a/activitysim/core/expressions.py +++ b/activitysim/core/expressions.py @@ -1,18 +1,16 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging -from activitysim.core import assign, config, inject, simulate, tracing -from activitysim.core.util import ( - assign_in_place, - parse_suffix_args, - suffix_expressions_df_str, -) +from . 
import assign, config, simulate, tracing, workflow +from .util import assign_in_place, parse_suffix_args, suffix_expressions_df_str logger = logging.getLogger(__name__) -def compute_columns(df, model_settings, locals_dict={}, trace_label=None): +def compute_columns(state, df, model_settings, locals_dict={}, trace_label=None): """ Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals @@ -40,7 +38,9 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): if isinstance(model_settings, str): model_settings_name = model_settings - model_settings = config.read_model_settings("%s.yaml" % model_settings) + model_settings = state.filesystem.read_model_settings( + "%s.yaml" % model_settings + ) assert model_settings, "Found no model settings for %s" % model_settings_name else: model_settings_name = "dict" @@ -80,7 +80,7 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): ) expressions_spec = assign.read_assignment_spec( - config.config_file_path(expressions_spec_name) + state.filesystem.get_config_file_path(expressions_spec_name), ) if suffix is not None and roots: @@ -90,7 +90,7 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): "Expected to find some assignment expressions in %s" % expressions_spec_name ) - tables = {t: inject.get_table(t).to_frame() for t in helper_table_names} + tables = {t: state.get_dataframe(t) for t in helper_table_names} # if df was passed in, df might be a slice, or any other table, but DF is it's local alias assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name @@ -99,30 +99,44 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): # be nice and also give it to them as df? tables["df"] = df - _locals_dict = assign.local_utilities() + _locals_dict = assign.local_utilities(state) _locals_dict.update(locals_dict) _locals_dict.update(tables) # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES? 
- if config.setting("sharrow", False): - _locals_dict["skim_dict"] = inject.get_injectable("skim_dataset_dict", None) - else: - _locals_dict["skim_dict"] = inject.get_injectable("skim_dict", None) + try: + if state.settings.sharrow: + from activitysim.core.flow import skim_dataset_dict # noqa F401 + from activitysim.core.skim_dataset import skim_dataset # noqa F401 + + _locals_dict["skim_dict"] = state.get_injectable("skim_dataset_dict") + else: + _locals_dict["skim_dict"] = state.get_injectable("skim_dict") + except FileNotFoundError: + pass # maybe we don't even need the skims results, trace_results, trace_assigned_locals = assign.assign_variables( - expressions_spec, df, _locals_dict, trace_rows=tracing.trace_targets(df) + state, + expressions_spec, + df, + _locals_dict, + trace_rows=state.tracing.trace_targets(df), ) if trace_results is not None: - tracing.trace_df(trace_results, label=trace_label, slicer="NONE") + state.tracing.trace_df(trace_results, label=trace_label, slicer="NONE") if trace_assigned_locals: - tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label) + state.tracing.write_csv( + trace_assigned_locals, file_name="%s_locals" % trace_label + ) return results -def assign_columns(df, model_settings, locals_dict={}, trace_label=None): +def assign_columns( + state: workflow.State, df, model_settings, locals_dict=None, trace_label=None +): """ Evaluate expressions in context of df and assign resulting target columns to df @@ -131,11 +145,13 @@ def assign_columns(df, model_settings, locals_dict={}, trace_label=None): Parameters - same as for compute_columns except df must not be None Returns - nothing since we modify df in place """ + if locals_dict is None: + locals_dict = {} assert df is not None assert model_settings is not None - results = compute_columns(df, model_settings, locals_dict, trace_label) + results = compute_columns(state, df, model_settings, locals_dict, trace_label) assign_in_place(df, results) @@ -145,13 +161,17 @@ def assign_columns(df, model_settings, locals_dict={}, trace_label=None): # ################################################################################################## -def annotate_preprocessors(df, locals_dict, skims, model_settings, trace_label): +def annotate_preprocessors( + state: workflow.State, df, locals_dict, skims, model_settings, trace_label +): locals_d = {} locals_d.update(locals_dict) locals_d.update(skims) preprocessor_settings = model_settings.get("preprocessor", []) + if preprocessor_settings is None: + preprocessor_settings = [] if not isinstance(preprocessor_settings, list): assert isinstance(preprocessor_settings, dict) preprocessor_settings = [preprocessor_settings] @@ -161,6 +181,7 @@ def annotate_preprocessors(df, locals_dict, skims, model_settings, trace_label): for model_settings in preprocessor_settings: results = compute_columns( + state, df=df, model_settings=model_settings, locals_dict=locals_d, diff --git a/activitysim/core/fast_mapping.py b/activitysim/core/fast_mapping.py index 5c3a4fd109..04c5f91739 100644 --- a/activitysim/core/fast_mapping.py +++ b/activitysim/core/fast_mapping.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numba as nb import numpy as np import pandas as pd diff --git a/activitysim/core/flow.py b/activitysim/core/flow.py index 88189ae76f..6d1e8e2579 100644 --- a/activitysim/core/flow.py +++ b/activitysim/core/flow.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import contextlib import glob import logging @@ -11,11 +13,11 @@ import numpy as 
np import pandas as pd -from .. import __version__ -from ..core import tracing -from . import config, inject -from .simulate_consts import SPEC_EXPRESSION_NAME, SPEC_LABEL_NAME -from .timetable import ( +import activitysim.core.skim_dataset # noqa: F401 +from activitysim import __version__ +from activitysim.core import tracing, workflow +from activitysim.core.simulate_consts import SPEC_EXPRESSION_NAME, SPEC_LABEL_NAME +from activitysim.core.timetable import ( sharrow_tt_adjacent_window_after, sharrow_tt_adjacent_window_before, sharrow_tt_max_time_block_available, @@ -133,7 +135,13 @@ def only_simple(x, exclude_keys=()): def get_flow( - spec, local_d, trace_label=None, choosers=None, interacts=None, zone_layer=None + state, + spec, + local_d, + trace_label=None, + choosers=None, + interacts=None, + zone_layer=None, ): extra_vars = only_simple(local_d) orig_col_name = local_d.get("orig_col_name", None) @@ -161,6 +169,7 @@ def get_flow( else: aux_vars = {} flow = new_flow( + state, spec, extra_vars, orig_col_name, @@ -208,7 +217,7 @@ def should_invalidate_cache_file(cache_filename, *source_filenames): return False -def scan_for_unused_names(tokens): +def scan_for_unused_names(state: workflow.State, tokens): """ Scan all spec files to find unused skim variable names. @@ -220,11 +229,11 @@ def scan_for_unused_names(tokens): ------- Set[str] """ - configs_dir_list = inject.get_injectable("configs_dir") + configs_dir_list = state.filesystem.get_configs_dir() configs_dir_list = ( [configs_dir_list] if isinstance(configs_dir_list, str) else configs_dir_list ) - assert isinstance(configs_dir_list, list) + assert isinstance(configs_dir_list, (list, tuple)) for directory in configs_dir_list: logger.debug(f"scanning for unused skims in {directory}") @@ -242,14 +251,15 @@ def scan_for_unused_names(tokens): return tokens -@inject.injectable(cache=True) -def skim_dataset_dict(skim_dataset): +@workflow.cached_object +def skim_dataset_dict(state: workflow.State, skim_dataset): from .skim_dataset import SkimDataset return SkimDataset(skim_dataset) def skims_mapping( + state: workflow.State, orig_col_name, dest_col_name, timeframe="tour", @@ -263,7 +273,7 @@ def skims_mapping( logger.info(f"- dest_col_name: {dest_col_name}") logger.info(f"- stop_col_name: {stop_col_name}") logger.info(f"- primary_origin_col_name: {primary_origin_col_name}") - skim_dataset = inject.get_injectable("skim_dataset") + skim_dataset = state.get_injectable("skim_dataset") if zone_layer == "maz" or zone_layer is None: odim = "omaz" if "omaz" in skim_dataset.dims else "otaz" ddim = "dmaz" if "dmaz" in skim_dataset.dims else "dtaz" @@ -435,6 +445,7 @@ def skims_mapping( def new_flow( + state: workflow.State, spec, extra_vars, orig_col_name, @@ -512,13 +523,10 @@ def new_flow( else: chooser_cols = list(choosers.columns) - cache_dir = os.path.join( - config.get_cache_dir(), - "__sharrowcache__", - ) - os.makedirs(cache_dir, exist_ok=True) + cache_dir = state.filesystem.get_sharrow_cache_dir() logger.debug(f"flow.cache_dir: {cache_dir}") skims_mapping_ = skims_mapping( + state, orig_col_name, dest_col_name, timeframe, @@ -719,6 +727,7 @@ def size_terms_on_flow(locals_d): def apply_flow( + state, spec, choosers, locals_d=None, @@ -773,6 +782,7 @@ def apply_flow( with logtime("apply_flow"): try: flow = get_flow( + state, spec, locals_d, trace_label, diff --git a/activitysim/core/inject.py b/activitysim/core/inject.py deleted file mode 100644 index 208a5658fd..0000000000 --- a/activitysim/core/inject.py +++ /dev/null @@ -1,250 +0,0 @@ -# 
ActivitySim -# See full license in LICENSE.txt. -import logging -import warnings - -from orca import orca - -_DECORATED_STEPS = {} -_DECORATED_TABLES = {} -_DECORATED_COLUMNS = {} -_DECORATED_INJECTABLES = {} -_BROADCASTS = [] - - -# we want to allow None (any anyting else) as a default value, so just choose an improbable string -_NO_DEFAULT = "throw error if missing" - -logger = logging.getLogger(__name__) - - -def step(): - def decorator(func): - name = func.__name__ - - logger.debug("inject step %s" % name) - - assert not _DECORATED_STEPS.get(name, False), ( - "step '%s' already decorated." % name - ) - if _DECORATED_STEPS.get(name, False): - warnings.warn( - f"step {name!r} already exists, ignoring default implementation." - ) - else: - _DECORATED_STEPS[name] = func - orca.add_step(name, func) - - return func - - return decorator - - -def custom_step(): - """ - This decorator allows custom steps to potentially overload existing steps. - """ - - def decorator(func): - name = func.__name__ - - logger.debug("inject step %s" % name) - - if _DECORATED_STEPS.get(name, False): - warnings.warn(f"step {name!r} already exists, overwriting it.") - _DECORATED_STEPS[name] = func - - orca.add_step(name, func) - - return func - - return decorator - - -def table(): - def decorator(func): - name = func.__name__ - - logger.debug("inject table %s" % name) - - assert not _DECORATED_TABLES.get(name, False), ( - "table '%s' already decorated." % name - ) - _DECORATED_TABLES[name] = func - - orca.add_table(name, func) - - return func - - return decorator - - -def injectable(cache=False, override=False): - def decorator(func): - name = func.__name__ - - logger.debug("inject injectable %s" % name) - - # insist on explicit override to ensure multiple definitions occur in correct order - assert override or not _DECORATED_INJECTABLES.get(name, False), ( - "injectable '%s' already defined. not overridden" % name - ) - - _DECORATED_INJECTABLES[name] = {"func": func, "cache": cache} - - orca.add_injectable(name, func, cache=cache) - - return func - - return decorator - - -def merge_tables(target, tables, columns=None): - return orca.merge_tables(target, tables, columns) - - -def add_step(name, func): - return orca.add_step(name, func) - - -def add_table(table_name, table, replace=False): - """ - Add new table and raise assertion error if the table already exists. - Silently replace if replace=True. - """ - if ( - not replace - and orca.is_table(table_name) - and orca.table_type(table_name) == "dataframe" - ): - logger.warning("inject add_table replacing existing table %s" % table_name) - assert False - - # FIXME - should add table.copy() instead, so it can't be modified behind our back? - return orca.add_table(table_name, table, cache=False) - - -# fixme remove? 
-def add_column(table_name, column_name, column, cache=False): - return orca.add_column(table_name, column_name, column, cache=cache) - - -def add_injectable(name, injectable, cache=False): - return orca.add_injectable(name, injectable, cache=cache) - - -def broadcast( - cast, onto, cast_on=None, onto_on=None, cast_index=False, onto_index=False -): - _BROADCASTS.append( - [ - (cast, onto), - dict( - cast_on=cast_on, - onto_on=onto_on, - cast_index=cast_index, - onto_index=onto_index, - ), - ] - ) - return orca.broadcast( - cast, - onto, - cast_on=cast_on, - onto_on=onto_on, - cast_index=cast_index, - onto_index=onto_index, - ) - - -def get_table(name, default=_NO_DEFAULT): - - if orca.is_table(name) or default == _NO_DEFAULT: - return orca.get_table(name) - else: - return default - - -def is_injectable(name): - - return orca.is_injectable(name) - - -def get_injectable(name, default=_NO_DEFAULT): - - if is_injectable(name) or default == _NO_DEFAULT: - return orca.get_injectable(name) - else: - return default - - -def remove_injectable(name): - - orca._INJECTABLES.pop(name, None) - - -def reinject_decorated_tables(steps=False): - """ - reinject the decorated tables (and columns) - - This function can be used to completely reset the global state for - ActivitySim. - """ - - logger.info("reinject_decorated_tables") - - # need to clear any non-decorated tables that were added during the previous run - orca._TABLES.clear() - orca._COLUMNS.clear() - orca._TABLE_CACHE.clear() - orca._COLUMN_CACHE.clear() - if steps: - orca._STEPS.clear() - orca._BROADCASTS.clear() - - for name, func in _DECORATED_TABLES.items(): - logger.debug("reinject decorated table %s" % name) - orca.add_table(name, func) - - for column_key, args in _DECORATED_COLUMNS.items(): - table_name, column_name = column_key - logger.debug("reinject decorated column %s.%s" % (table_name, column_name)) - orca.add_column(table_name, column_name, args["func"], cache=args["cache"]) - - for name, args in _DECORATED_INJECTABLES.items(): - logger.debug("reinject decorated injectable %s" % name) - orca.add_injectable(name, args["func"], cache=args["cache"]) - - if steps: - for name, func in _DECORATED_STEPS.items(): - logger.debug("reinject decorated step %s" % name) - orca.add_step(name, func) - for arg, kwarg in _BROADCASTS: - orca.broadcast(*arg, **kwarg) - - -def clear_cache(): - return orca.clear_cache() - - -def set_step_args(args=None): - - assert isinstance(args, dict) or args is None - orca.add_injectable("step_args", args) - - -def get_step_arg(arg_name, default=_NO_DEFAULT): - - args = orca.get_injectable("step_args") - - assert isinstance(args, dict) - if arg_name not in args and default == _NO_DEFAULT: - raise "step arg '%s' not found and no default" % arg_name - - return args.get(arg_name, default) - - -def dump_state(): - - print("_DECORATED_STEPS", list(_DECORATED_STEPS.keys())) - print("orca._STEPS", list(orca._STEPS.keys())) diff --git a/activitysim/core/input.py b/activitysim/core/input.py index 41bfdc1c07..51730fcf02 100644 --- a/activitysim/core/input.py +++ b/activitysim/core/input.py @@ -1,23 +1,27 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations import logging import os -import warnings import pandas as pd -from activitysim.core import config, inject, util +from activitysim.core import util, workflow +from activitysim.core.configuration import InputTable +from activitysim.core.exceptions import MissingInputTableDefinition logger = logging.getLogger(__name__) def canonical_table_index_name(table_name): - table_index_names = inject.get_injectable("canonical_table_index_names", None) + from activitysim.abm.models.util import canonical_ids + + table_index_names = canonical_ids.CANONICAL_TABLE_INDEX_NAMES return table_index_names and table_index_names.get(table_name, None) -def read_input_table(tablename, required=True): +def read_input_table(state: workflow.State, tablename, required=True): """Reads input table name and returns cleaned DataFrame. Uses settings found in input_table_list in global settings file @@ -25,24 +29,28 @@ def read_input_table(tablename, required=True): Parameters ---------- tablename : string + settings : State Returns ------- pandas DataFrame """ - table_list = config.setting("input_table_list") - assert table_list is not None, "no input_table_list found in settings" + table_list = state.settings.input_table_list + if required and table_list is None: + raise AssertionError("no input_table_list found in settings") + if not required and table_list is None: + return None table_info = None for info in table_list: - if info["tablename"] == tablename: + if info.tablename == tablename: table_info = info if table_info is not None: - df = read_from_table_info(table_info) + df = read_from_table_info(table_info, state) else: if required: - raise RuntimeError( + raise MissingInputTableDefinition( f"could not find info for for tablename {tablename} in settings file" ) df = None @@ -50,7 +58,7 @@ def read_input_table(tablename, required=True): return df -def read_from_table_info(table_info): +def read_from_table_info(table_info: InputTable, state): """ Read input text files and return cleaned up DataFrame. 
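+
+    A minimal sketch of the kind of ``input_table_list`` entry this function
+    consumes (illustrative table and column names; the field keys mirror the
+    ``InputTable`` attributes referenced below)::
+
+        - tablename: households
+          filename: households.csv
+          index_col: household_id
+          rename_columns:
+            HHID: household_id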
@@ -65,28 +73,23 @@ def read_from_table_info(table_info): +--------------+----------------------------------------------------------+ | filename | name of csv file to read (in data_dir) | +--------------+----------------------------------------------------------+ - | column_map | list of input columns to rename from_name: to_name | - +--------------+----------------------------------------------------------+ | index_col | name of column to set as dataframe index column | +--------------+----------------------------------------------------------+ - | drop_columns | list of column names of columns to drop | - +--------------+----------------------------------------------------------+ | h5_tablename | name of target table in HDF5 file | +--------------+----------------------------------------------------------+ """ - input_store = config.setting("input_store", None) - create_input_store = config.setting("create_input_store", default=False) - - tablename = table_info.get("tablename") - data_filename = table_info.get("filename", input_store) - h5_tablename = table_info.get("h5_tablename") or tablename - drop_columns = table_info.get("drop_columns", None) - column_map = table_info.get("column_map", None) - keep_columns = table_info.get("keep_columns", None) - rename_columns = table_info.get("rename_columns", None) - recode_columns = table_info.get("recode_columns", None) - csv_dtypes = table_info.get("dtypes", {}) + input_store = state.settings.input_store + create_input_store = state.settings.create_input_store + + tablename = table_info.tablename + data_filename = table_info.filename or input_store + h5_tablename = table_info.h5_tablename or tablename + keep_columns = table_info.keep_columns + drop_columns = table_info.drop_columns + rename_columns = table_info.rename_columns + recode_columns = table_info.recode_columns + csv_dtypes = table_info.dtypes or {} # don't require a redundant index_col directive for canonical tables # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably, @@ -94,17 +97,17 @@ def read_from_table_info(table_info): canonical_index_col = canonical_table_index_name(tablename) # if there is an explicit index_col entry in table_info - if "index_col" in table_info: + if table_info.index_col != "NOTSET": # honor explicit index_col unless it conflicts with canonical name - index_col = table_info["index_col"] + index_col = table_info.index_col if canonical_index_col: if index_col: # if there is a non-empty index_col directive, it should be for canonical_table_index_name assert ( index_col == canonical_index_col - ), f"{tablename} index_col {table_info.get('index_col')} should be {index_col}" + ), f"{tablename} index_col {table_info.index_col} should be {index_col}" else: logger.info( f"Not assigning canonical index_col {tablename}.{canonical_index_col} " @@ -120,45 +123,39 @@ def read_from_table_info(table_info): assert tablename is not None, "no tablename provided" assert data_filename is not None, "no input file provided" - data_file_path = config.data_file_path(data_filename) + data_file_path = state.filesystem.get_data_file_path( + data_filename, alternative_suffixes=(".csv.gz", ".parquet") + ) - df = _read_input_file( - data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes + df = read_input_file( + str(data_file_path), h5_tablename=h5_tablename, csv_dtypes=csv_dtypes ) # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values)) - logger.debug("raw %s table size: %s" % (tablename, 
util.df_size(df))) + logger.debug(f"raw {tablename} table size: {util.df_size(df)}") if create_input_store: - h5_filepath = config.output_file_path("input_data.h5") - logger.info("writing %s to %s" % (h5_tablename, h5_filepath)) - df.to_hdf(h5_filepath, key=h5_tablename, mode="a") - - csv_dir = config.output_file_path("input_data") - if not os.path.exists(csv_dir): - os.makedirs(csv_dir) # make directory if needed - df.to_csv(os.path.join(csv_dir, "%s.csv" % tablename), index=False) + raise NotImplementedError("the input store functionality has been disabled") + # h5_filepath = state.get_output_file_path("input_data.h5") + # logger.info("writing %s to %s" % (h5_tablename, h5_filepath)) + # df.to_hdf(h5_filepath, key=h5_tablename, mode="a") + # + # csv_dir = state.get_output_file_path("input_data") + # if not os.path.exists(csv_dir): + # os.makedirs(csv_dir) # make directory if needed + # df.to_csv(os.path.join(csv_dir, "%s.csv" % tablename), index=False) if drop_columns: logger.debug("dropping columns: %s" % drop_columns) df.drop(columns=drop_columns, inplace=True, errors="ignore") - if column_map: - warnings.warn( - "table_inf option 'column_map' renamed 'rename_columns'" - "Support for 'column_map' will be removed in future versions.", - FutureWarning, - ) - logger.debug("renaming columns: %s" % column_map) - df.rename(columns=column_map, inplace=True) - # rename columns first, so keep_columns can be a stable list of expected/required columns if rename_columns: logger.debug("renaming columns: %s" % rename_columns) df.rename(columns=rename_columns, inplace=True) # recode columns, can simplify data structure - if recode_columns and config.setting("recode_pipeline_columns", True): + if recode_columns and state.settings.recode_pipeline_columns: for colname, recode_instruction in recode_columns.items(): logger.info(f"recoding column {colname}: {recode_instruction}") if recode_instruction == "zero-based": @@ -177,10 +174,10 @@ def read_from_table_info(table_info): # We need to keep track if we have recoded the land_use # table's index to zero-based, as we need to disable offset # processing for legacy skim access. - config.override_setting("offset_preprocessing", True) + state.settings.offset_preprocessing = True else: source_table, lookup_col = recode_instruction.split(".") - parent_table = inject.get_table(source_table) + parent_table = state.get_dataframe(source_table) try: map_col = parent_table[f"_original_{lookup_col}"] except KeyError: @@ -228,14 +225,19 @@ def read_from_table_info(table_info): not df.columns.duplicated().any() ), f"duplicate columns names in {tablename}: {duplicate_column_names}" - logger.debug("%s table columns: %s" % (tablename, df.columns.values)) - logger.debug("%s table size: %s" % (tablename, util.df_size(df))) - logger.debug("%s index name: %s" % (tablename, df.index.name)) + logger.debug(f"{tablename} table columns: {df.columns.values}") + logger.debug(f"{tablename} table size: {util.df_size(df)}") + logger.debug(f"{tablename} index name: {df.index.name}") return df -def _read_input_file(filepath, h5_tablename=None, csv_dtypes=None): +def read_input_file(filepath: str, h5_tablename: str = None, csv_dtypes=None): + """ + Read data to a pandas DataFrame, inferring file type from filename extension. 
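+
+    Per the checks below, supported extensions are ``.csv`` / ``.csv.gz``,
+    ``.h5`` (which additionally requires ``h5_tablename``), and ``.parquet``.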
+ """ + + filepath = str(filepath) assert os.path.exists(filepath), "input file not found: %s" % filepath if filepath.endswith(".csv") or filepath.endswith(".csv.gz"): @@ -243,12 +245,15 @@ def _read_input_file(filepath, h5_tablename=None, csv_dtypes=None): if filepath.endswith(".h5"): assert h5_tablename is not None, "must provide a tablename to read HDF5 table" - logger.info("reading %s table from %s" % (h5_tablename, filepath)) + logger.info(f"reading {h5_tablename} table from {filepath}") return pd.read_hdf(filepath, h5_tablename) - raise IOError( + if filepath.endswith(".parquet"): + return pd.read_parquet(filepath) + + raise OSError( "Unsupported file type: %s. " - "ActivitySim supports CSV and HDF5 files only" % filepath + "ActivitySim supports CSV, HDF5, and Parquet files only" % filepath ) diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index 62cabd861f..966f4a7d82 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -1,12 +1,20 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from . import chunk, config, interaction_simulate, logit, pipeline, tracing -from .simulate import set_skim_wrapper_targets +from activitysim.core import ( + chunk, + interaction_simulate, + logit, + simulate, + tracing, + workflow, +) logger = logging.getLogger(__name__) @@ -14,6 +22,7 @@ def make_sample_choices( + state: workflow.State, choosers, probs, alternatives, @@ -22,6 +31,7 @@ def make_sample_choices( alt_col_name, allow_zero_probs, trace_label, + chunk_sizer, ): """ @@ -61,13 +71,13 @@ def make_sample_choices( choosers = choosers[~zero_probs] # get sample_size rands for each chooser - rands = pipeline.get_rn_generator().random_for_df(probs, n=sample_size) + rands = state.get_rn_generator().random_for_df(probs, n=sample_size) # transform as we iterate over alternatives # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr # i.e rands[i] is a 2-D array of one alt choice rand for each chooser # rands = rands.T #.reshape(sample_size, -1, 1) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "rands", rands) # TODO: is `sample_choices_maker` more efficient? 
The order of samples changes, might change repro-randoms from .choosing import sample_choices_maker_preserve_ordering @@ -78,8 +88,8 @@ def make_sample_choices( alternatives.index.values, ) - chunk.log_df(trace_label, "choices_array", choices_array) - chunk.log_df(trace_label, "choice_probs_array", choice_probs_array) + chunk_sizer.log_df(trace_label, "choices_array", choices_array) + chunk_sizer.log_df(trace_label, "choice_probs_array", choice_probs_array) # explode to one row per chooser.index, alt_zone_id choices_df = pd.DataFrame( @@ -91,22 +101,23 @@ def make_sample_choices( } ) - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) del choices_array - chunk.log_df(trace_label, "choices_array", None) + chunk_sizer.log_df(trace_label, "choices_array", None) del rands - chunk.log_df(trace_label, "rands", None) + chunk_sizer.log_df(trace_label, "rands", None) del choice_probs_array - chunk.log_df(trace_label, "choice_probs_array", None) + chunk_sizer.log_df(trace_label, "choice_probs_array", None) # handing this off to caller - chunk.log_df(trace_label, "choices_df", None) + chunk_sizer.log_df(trace_label, "choices_df", None) return choices_df def _interaction_sample( + state: workflow.State, choosers, alternatives, spec, @@ -118,6 +129,7 @@ def _interaction_sample( locals_d=None, trace_label=None, zone_layer=None, + chunk_sizer=None, ): """ Run a MNL simulation in the situation in which alternatives must @@ -174,7 +186,7 @@ def _interaction_sample( number of duplicate picks for chooser, alt """ - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = state.tracing.has_trace_targets(choosers) trace_ids = None trace_rows = None num_choosers = len(choosers.index) @@ -182,8 +194,10 @@ def _interaction_sample( assert num_choosers > 0 if have_trace_targets: - tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) - tracing.trace_df( + state.tracing.trace_df( + choosers, tracing.extend_trace_label(trace_label, "choosers") + ) + state.tracing.trace_df( alternatives, tracing.extend_trace_label(trace_label, "alternatives"), slicer="NONE", @@ -201,7 +215,7 @@ def _interaction_sample( chooser_index_id = interaction_simulate.ALT_CHOOSER_ID if log_alt_losers else None - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = state.settings.sharrow # - cross join choosers and alternatives (cartesian product) # for every chooser, there will be a row for each alternative @@ -216,6 +230,7 @@ def _interaction_sample( interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( + state, spec, choosers, locals_d, @@ -226,7 +241,7 @@ def _interaction_sample( extra_data=alternatives, zone_layer=zone_layer, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) if sharrow_enabled == "test" or True: interaction_utilities_sh, trace_eval_results_sh = ( interaction_utilities, @@ -234,18 +249,19 @@ def _interaction_sample( ) if not sharrow_enabled or (sharrow_enabled == "test"): interaction_df = logit.interaction_dataset( + state, choosers, alternatives, sample_size=alternative_count, chooser_index_id=chooser_index_id, ) - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) assert alternative_count == len(interaction_df.index) / len(choosers.index) if skims is not None: - 
set_skim_wrapper_targets(interaction_df, skims) + simulate.set_skim_wrapper_targets(interaction_df, skims) # evaluate expressions from the spec multiply by coefficients and sum # spec is df with one row per spec expression and one col with utility coefficient @@ -253,11 +269,11 @@ def _interaction_sample( # utilities has utility value for element in the cross product of choosers and alternatives # interaction_utilities is a df with one utility column and one row per row in interaction_df if have_trace_targets: - trace_rows, trace_ids = tracing.interaction_trace_rows( + trace_rows, trace_ids = state.tracing.interaction_trace_rows( interaction_df, choosers, alternative_count ) - tracing.trace_df( + state.tracing.trace_df( interaction_df[trace_rows], tracing.extend_trace_label(trace_label, "interaction_df"), slicer="NONE", @@ -271,6 +287,7 @@ def _interaction_sample( interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( + state, spec, interaction_df, locals_d, @@ -280,12 +297,12 @@ def _interaction_sample( log_alt_losers=log_alt_losers, zone_layer=zone_layer, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) # ########### HWM - high water mark (point of max observed memory usage) del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) if sharrow_enabled == "test": try: @@ -338,7 +355,7 @@ def _interaction_sample( raise if have_trace_targets and trace_ids is not None: - tracing.trace_interaction_eval_results( + state.tracing.trace_interaction_eval_results( trace_eval_results, trace_ids, tracing.extend_trace_label(trace_label, "eval"), @@ -346,7 +363,7 @@ def _interaction_sample( if have_trace_targets and trace_rows is not None: try: - tracing.trace_df( + state.tracing.trace_df( interaction_utilities[trace_rows], tracing.extend_trace_label(trace_label, "interaction_utilities"), slicer="NONE", @@ -355,7 +372,9 @@ def _interaction_sample( except ValueError: pass - tracing.dump_df(DUMP, interaction_utilities, trace_label, "interaction_utilities") + state.tracing.dump_df( + DUMP, interaction_utilities, trace_label, "interaction_utilities" + ) # reshape utilities (one utility column and one row per row in interaction_utilities) # to a dataframe with one row per chooser and one column per alternative @@ -363,35 +382,36 @@ def _interaction_sample( interaction_utilities.values.reshape(len(choosers), alternative_count), index=choosers.index, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) del interaction_utilities - chunk.log_df(trace_label, "interaction_utilities", None) + chunk_sizer.log_df(trace_label, "interaction_utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( utilities, tracing.extend_trace_label(trace_label, "utils"), column_labels=["alternative", "utility"], ) - tracing.dump_df(DUMP, utilities, trace_label, "utilities") + state.tracing.dump_df(DUMP, utilities, trace_label, "utilities") # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative probs = logit.utils_to_probs( + state, utilities, allow_zero_probs=allow_zero_probs, trace_label=trace_label, trace_choosers=choosers, ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) 
del utilities - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( probs, tracing.extend_trace_label(trace_label, "probs"), column_labels=["alternative", "probability"], @@ -420,6 +440,7 @@ def _interaction_sample( return choices_df else: choices_df = make_sample_choices( + state, choosers, probs, alternatives, @@ -428,12 +449,13 @@ def _interaction_sample( alt_col_name, allow_zero_probs=allow_zero_probs, trace_label=trace_label, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) # pick_count and pick_dup # pick_count is number of duplicate picks @@ -450,15 +472,15 @@ def _interaction_sample( # drop the duplicates choices_df = choices_df[~choices_df["pick_dup"]] del choices_df["pick_dup"] - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) # set index after groupby so we can trace on it choices_df.set_index(choosers.index.name, inplace=True) - tracing.dump_df(DUMP, choices_df, trace_label, "choices_df") + state.tracing.dump_df(DUMP, choices_df, trace_label, "choices_df") if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices_df, tracing.extend_trace_label(trace_label, "sampled_alternatives"), transpose=False, @@ -467,7 +489,7 @@ def _interaction_sample( # don't need this after tracing del choices_df["rand"] - chunk.log_df(trace_label, "choices_df", choices_df) + chunk_sizer.log_df(trace_label, "choices_df", choices_df) # - NARROW choices_df["prob"] = choices_df["prob"].astype(np.float32) @@ -478,6 +500,7 @@ def _interaction_sample( def interaction_sample( + state, choosers, alternatives, spec, @@ -561,11 +584,15 @@ def interaction_sample( sample_size = min(sample_size, len(alternatives.index)) result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label, chunk_tag - ): + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, choosers, trace_label, chunk_tag): choices = _interaction_sample( + state, chooser_chunk, alternatives, spec=spec, @@ -577,13 +604,14 @@ def interaction_sample( locals_d=locals_d, trace_label=chunk_trace_label, zone_layer=zone_layer, + chunk_sizer=chunk_sizer, ) if choices.shape[0] > 0: # might not be any if allow_zero_probs result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 13e65f3848..274940b30c 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -1,17 +1,20 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from . 
import chunk, interaction_simulate, logit, tracing -from .simulate import set_skim_wrapper_targets +from activitysim.core import chunk, interaction_simulate, logit, tracing, workflow +from activitysim.core.simulate import set_skim_wrapper_targets logger = logging.getLogger(__name__) def _interaction_sample_simulate( + state: workflow.State, choosers, alternatives, spec, @@ -26,8 +29,9 @@ def _interaction_sample_simulate( trace_choice_name, estimator, skip_choice=False, + *, + chunk_sizer: chunk.ChunkSizer, ): - """ Run a MNL simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -96,11 +100,13 @@ def _interaction_sample_simulate( alternatives.index[last_repeat] ) - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = state.tracing.has_trace_targets(choosers) if have_trace_targets: - tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) - tracing.trace_df( + state.tracing.trace_df( + choosers, tracing.extend_trace_label(trace_label, "choosers") + ) + state.tracing.trace_df( alternatives, tracing.extend_trace_label(trace_label, "alternatives"), transpose=False, @@ -135,12 +141,14 @@ def _interaction_sample_simulate( interaction_simulate.ALT_CHOOSER_ID ] = interaction_df.index.values - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) if have_trace_targets: - trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, choosers) + trace_rows, trace_ids = state.tracing.interaction_trace_rows( + interaction_df, choosers + ) - tracing.trace_df( + state.tracing.trace_df( interaction_df, tracing.extend_trace_label(trace_label, "interaction_df"), transpose=False, @@ -160,6 +168,7 @@ def _interaction_sample_simulate( interaction_utilities, trace_eval_results, ) = interaction_simulate.eval_interaction_utilities( + state, spec, interaction_df, locals_d, @@ -168,19 +177,19 @@ def _interaction_sample_simulate( estimator=estimator, log_alt_losers=log_alt_losers, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) if have_trace_targets: - tracing.trace_interaction_eval_results( + state.tracing.trace_interaction_eval_results( trace_eval_results, trace_ids, tracing.extend_trace_label(trace_label, "eval"), ) - tracing.trace_df( + state.tracing.trace_df( interaction_utilities, tracing.extend_trace_label(trace_label, "interaction_utilities"), transpose=False, @@ -195,7 +204,7 @@ def _interaction_sample_simulate( sample_counts = ( interaction_utilities.groupby(interaction_utilities.index).size().values ) - chunk.log_df(trace_label, "sample_counts", sample_counts) + chunk_sizer.log_df(trace_label, "sample_counts", sample_counts) # max number of alternatvies for any chooser max_sample_count = sample_counts.max() @@ -210,28 +219,28 @@ def _interaction_sample_simulate( inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts) del sample_counts - chunk.log_df(trace_label, "sample_counts", None) + chunk_sizer.log_df(trace_label, "sample_counts", None) # insert the zero-prob utilities to pad each alternative set to same size padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999) - chunk.log_df(trace_label, "padded_utilities", 
padded_utilities) + chunk_sizer.log_df(trace_label, "padded_utilities", padded_utilities) del inserts del interaction_utilities - chunk.log_df(trace_label, "interaction_utilities", None) + chunk_sizer.log_df(trace_label, "interaction_utilities", None) # reshape to array with one row per chooser, one column per alternative padded_utilities = padded_utilities.reshape(-1, max_sample_count) # convert to a dataframe with one row per chooser and one column per alternative utilities_df = pd.DataFrame(padded_utilities, index=choosers.index) - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) del padded_utilities - chunk.log_df(trace_label, "padded_utilities", None) + chunk_sizer.log_df(trace_label, "padded_utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( utilities_df, tracing.extend_trace_label(trace_label, "utilities"), column_labels=["alternative", "utility"], @@ -240,24 +249,25 @@ def _interaction_sample_simulate( # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative probs = logit.utils_to_probs( + state, utilities_df, allow_zero_probs=allow_zero_probs, trace_label=trace_label, trace_choosers=choosers, ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) if want_logsums: logsums = logit.utils_to_logsums( utilities_df, allow_zero_probs=allow_zero_probs ) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) del utilities_df - chunk.log_df(trace_label, "utilities_df", None) + chunk_sizer.log_df(trace_label, "utilities_df", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( probs, tracing.extend_trace_label(trace_label, "probs"), column_labels=["alternative", "probability"], @@ -277,14 +287,14 @@ def _interaction_sample_simulate( # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample positions, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=choosers + state, probs, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "positions", positions) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "positions", positions) + chunk_sizer.log_df(trace_label, "rands", rands) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) # shouldn't have chosen any of the dummy pad utilities assert positions.max() < max_sample_count @@ -299,25 +309,25 @@ def _interaction_sample_simulate( # create a series with index from choosers and the index of the chosen alternative choices = pd.Series(choices, index=choosers.index) - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) if allow_zero_probs and zero_probs.any() and zero_prob_choice_val is not None: # FIXME this is kind of gnarly, patch choice for zero_probs choices.loc[zero_probs] = zero_prob_choice_val if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, tracing.extend_trace_label(trace_label, "choices"), columns=[None, trace_choice_name], ) - tracing.trace_df( + state.tracing.trace_df( rands, tracing.extend_trace_label(trace_label, "rands"), columns=[None, "rand"], ) if want_logsums: - tracing.trace_df( + 
state.tracing.trace_df( logsums, tracing.extend_trace_label(trace_label, "logsum"), columns=[None, "logsum"], @@ -327,15 +337,16 @@ def _interaction_sample_simulate( choices = choices.to_frame("choice") choices["logsum"] = logsums - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) # handing this off to our caller - chunk.log_df(trace_label, "choices", None) + chunk_sizer.log_df(trace_label, "choices", None) return choices def interaction_sample_simulate( + state: workflow.State, choosers, alternatives, spec, @@ -353,7 +364,6 @@ def interaction_sample_simulate( estimator=None, skip_choice=False, ): - """ Run a simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -421,11 +431,12 @@ def interaction_sample_simulate( chooser_chunk, alternative_chunk, chunk_trace_label, + chunk_sizer, ) in chunk.adaptive_chunked_choosers_and_alts( - choosers, alternatives, chunk_size, trace_label, chunk_tag + state, choosers, alternatives, trace_label, chunk_tag, chunk_size=chunk_size ): - choices = _interaction_sample_simulate( + state, chooser_chunk, alternative_chunk, spec, @@ -440,11 +451,12 @@ def interaction_sample_simulate( trace_choice_name, estimator, skip_choice, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, f"result_list", result_list) + chunk_sizer.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index 38075e5ad0..88dbfc73d2 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -1,15 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import time from builtins import zip from collections import OrderedDict from datetime import timedelta +from typing import Mapping import numpy as np import pandas as pd -from . import chunk, config, logit, simulate, tracing +from . 
import chunk, config, logit, simulate, tracing, workflow logger = logging.getLogger(__name__) @@ -19,6 +22,7 @@ def eval_interaction_utilities( + state, spec, df, locals_d, @@ -72,7 +76,7 @@ def eval_interaction_utilities( trace_label = tracing.extend_trace_label(trace_label, "eval_interaction_utils") logger.info("Running eval_interaction_utilities on %s rows" % df.shape[0]) - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = state.settings.sharrow if locals_d is not None and locals_d.get("_sharrow_skip", False): sharrow_enabled = False @@ -84,8 +88,7 @@ def eval_interaction_utilities( trace_eval_results = None - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(state, trace_label) as chunk_sizer: assert len(spec.columns) == 1 # avoid altering caller's passed-in locals_d parameter (they may be looping) @@ -101,7 +104,6 @@ def eval_interaction_utilities( locals_d["df"] = df if sharrow_enabled: - from .flow import apply_flow spec_sh = spec.copy() @@ -170,6 +172,7 @@ def replace_in_index_level(mi, level, *repls): timelogger.mark("sharrow preamble", True, logger, trace_label) sh_util, sh_flow = apply_flow( + state, spec_sh, df, locals_d, @@ -178,12 +181,12 @@ def replace_in_index_level(mi, level, *repls): zone_layer=zone_layer, ) if sh_util is not None: - chunk.log_df(trace_label, "sh_util", sh_util) + chunk_sizer.log_df(trace_label, "sh_util", sh_util) utilities = pd.DataFrame( {"utility": sh_util.reshape(-1)}, index=df.index if extra_data is None else None, ) - chunk.log_df(trace_label, "sh_util", None) # hand off to caller + chunk_sizer.log_df(trace_label, "sh_util", None) # hand off to caller timelogger.mark("sharrow flow", True, logger, trace_label) else: @@ -211,14 +214,14 @@ def to_series(x): else: trace_eval_results = None - check_for_variability = config.setting("check_for_variability") + check_for_variability = state.settings.check_for_variability # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously utilities = pd.DataFrame({"utility": 0.0}, index=df.index) - chunk.log_df(trace_label, "eval.utilities", utilities) + chunk_sizer.log_df(trace_label, "eval.utilities", utilities) no_variability = has_missing_vals = 0 @@ -252,17 +255,15 @@ def to_series(x): for expr, label, coefficient in zip(exprs, labels, spec.iloc[:, 0]): try: - # - allow temps of form _od_DIST@od_skim['DIST'] if expr.startswith("_"): - target = expr[: expr.index("@")] rhs = expr[expr.index("@") + 1 :] v = to_series(eval(rhs, globals(), locals_d)) # update locals to allows us to ref previously assigned targets locals_d[target] = v - chunk.log_df( + chunk_sizer.log_df( trace_label, target, v ) # track temps stored in locals @@ -304,12 +305,10 @@ def to_series(x): utility = (v * coefficient).astype("float") if log_alt_losers: - assert ALT_CHOOSER_ID in df max_utils_by_chooser = utility.groupby(df[ALT_CHOOSER_ID]).max() if (max_utils_by_chooser < simulate.ALT_LOSER_UTIL).any(): - losers = max_utils_by_chooser[ max_utils_by_chooser < simulate.ALT_LOSER_UTIL ] @@ -328,7 +327,6 @@ def to_series(x): utilities.utility.values[:] += utility if trace_eval_results is not None: - # expressions should have been uniquified when spec was read # (though we could do it here if need be...) 
# expr = assign.uniquify_key(trace_eval_results, expr, template="{} # ({})") @@ -342,12 +340,19 @@ def to_series(x): trace_eval_results[k] = v[trace_rows] * coefficient del v - # chunk.log_df(trace_label, 'v', None) + # chunk_sizer.log_df(trace_label, 'v', None) except Exception as err: logger.exception( f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}" ) + if isinstance( + err, AssertionError + ) and "od pairs not in skim" in str(err): + logger.warning( + f"recode_pipeline_columns is set to {state.settings.recode_pipeline_columns}, " + f"you may want to check this" + ) raise err if estimator: @@ -379,11 +384,15 @@ def to_series(x): trace_eval_results = pd.concat( [df[trace_rows], trace_eval_results], axis=1 ) - chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) + chunk_sizer.log_df( + trace_label, "eval.trace_eval_results", trace_eval_results + ) - chunk.log_df(trace_label, "v", None) - chunk.log_df(trace_label, "eval.utilities", None) # out of out hands... - chunk.log_df(trace_label, "eval.trace_eval_results", None) + chunk_sizer.log_df(trace_label, "v", None) + chunk_sizer.log_df( + trace_label, "eval.utilities", None + ) # out of out hands... + chunk_sizer.log_df(trace_label, "eval.trace_eval_results", None) timelogger.mark("regular interact flow", True, logger, trace_label) else: @@ -441,7 +450,9 @@ def to_series(x): trace_eval_results.index = df[trace_rows].index except ValueError: pass - chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) + chunk_sizer.log_df( + trace_label, "eval.trace_eval_results", trace_eval_results + ) else: # in test mode, trace from non-sharrow exists trace_eval_results = pd.concat( @@ -456,7 +467,9 @@ def to_series(x): axis=1, ) trace_eval_results.index = df[trace_rows].index - chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) + chunk_sizer.log_df( + trace_label, "eval.trace_eval_results", trace_eval_results + ) # sh_utility_fat1 = np.dot(sh_utility_fat, spec.values) # sh_utility_fat2 = sh_flow.dot( @@ -469,7 +482,6 @@ def to_series(x): timelogger.mark("sharrow interact trace", True, logger, trace_label) if sharrow_enabled == "test": - try: if sh_util is not None: np.testing.assert_allclose( @@ -572,16 +584,18 @@ def to_series(x): def _interaction_simulate( - choosers, - alternatives, - spec, + state: workflow.State, + choosers: pd.DataFrame, + alternatives: pd.DataFrame, + spec: pd.DataFrame, skims=None, - locals_d=None, + locals_d: Mapping = None, sample_size=None, trace_label=None, trace_choice_name=None, log_alt_losers=False, estimator=None, + chunk_sizer=None, ): """ Run a MNL simulation in the situation in which alternatives must @@ -632,11 +646,13 @@ def _interaction_simulate( """ trace_label = tracing.extend_trace_label(trace_label, "interaction_simulate") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = state.tracing.has_trace_targets(choosers) if have_trace_targets: - tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) - tracing.trace_df( + state.tracing.trace_df( + choosers, tracing.extend_trace_label(trace_label, "choosers") + ) + state.tracing.trace_df( alternatives, tracing.extend_trace_label(trace_label, "alternatives"), slicer="NONE", @@ -667,7 +683,7 @@ def _interaction_simulate( alt_index_id = estimator.get_alt_id() if estimator else None chooser_index_id = ALT_CHOOSER_ID if log_alt_losers else None - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = 
state.settings.sharrow interaction_utilities = None if locals_d is not None and locals_d.get("_sharrow_skip", False): @@ -686,6 +702,7 @@ def _interaction_simulate( trace_rows = trace_ids = None interaction_utilities, trace_eval_results = eval_interaction_utilities( + state, spec, choosers, locals_d, @@ -699,7 +716,7 @@ def _interaction_simulate( # set this index here as this is how later code extracts the chosen alt id's interaction_utilities.index = np.tile(alternatives.index, len(choosers)) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) # mem.trace_memory_info(f"{trace_label}.init interaction_utilities sh", force_garbage_collect=True) if sharrow_enabled == "test" or True: interaction_utilities_sh, trace_eval_results_sh = ( @@ -717,15 +734,15 @@ def _interaction_simulate( or (sharrow_enabled == "test") or interaction_utilities is None ): - interaction_df = logit.interaction_dataset( + state, choosers, alternatives, sample_size, alt_index_id=alt_index_id, chooser_index_id=chooser_index_id, ) - chunk.log_df(trace_label, "interaction_df", interaction_df) + chunk_sizer.log_df(trace_label, "interaction_df", interaction_df) if skims is not None: simulate.set_skim_wrapper_targets(interaction_df, skims) @@ -736,11 +753,11 @@ def _interaction_simulate( # utilities has utility value for element in the cross product of choosers and alternatives # interaction_utilities is a df with one utility column and one row per row in model_design if have_trace_targets: - trace_rows, trace_ids = tracing.interaction_trace_rows( + trace_rows, trace_ids = state.tracing.interaction_trace_rows( interaction_df, choosers, sample_size ) - tracing.trace_df( + state.tracing.trace_df( interaction_df[trace_rows], tracing.extend_trace_label(trace_label, "interaction_df"), slicer="NONE", @@ -750,6 +767,7 @@ def _interaction_simulate( trace_rows = trace_ids = None interaction_utilities, trace_eval_results = eval_interaction_utilities( + state, spec, interaction_df, locals_d, @@ -758,23 +776,23 @@ def _interaction_simulate( estimator=estimator, log_alt_losers=log_alt_losers, ) - chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + chunk_sizer.log_df(trace_label, "interaction_utilities", interaction_utilities) # mem.trace_memory_info(f"{trace_label}.init interaction_utilities", force_garbage_collect=True) # print(f"interaction_df {interaction_df.shape}") # print(f"interaction_utilities {interaction_utilities.shape}") del interaction_df - chunk.log_df(trace_label, "interaction_df", None) + chunk_sizer.log_df(trace_label, "interaction_df", None) if have_trace_targets: - tracing.trace_interaction_eval_results( + state.tracing.trace_interaction_eval_results( trace_eval_results, trace_ids, tracing.extend_trace_label(trace_label, "eval"), ) - tracing.trace_df( + state.tracing.trace_df( interaction_utilities[trace_rows], tracing.extend_trace_label(trace_label, "interaction_utils"), slicer="NONE", @@ -787,29 +805,29 @@ def _interaction_simulate( interaction_utilities.values.reshape(len(choosers), sample_size), index=choosers.index, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( utilities, tracing.extend_trace_label(trace_label, "utils"), column_labels=["alternative", "utility"], ) - tracing.dump_df(DUMP, utilities, trace_label, "utilities") + state.tracing.dump_df(DUMP, 
utilities, trace_label, "utilities") # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative probs = logit.utils_to_probs( - utilities, trace_label=trace_label, trace_choosers=choosers + state, utilities, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( probs, tracing.extend_trace_label(trace_label, "probs"), column_labels=["alternative", "probability"], @@ -819,10 +837,10 @@ def _interaction_simulate( # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample positions, rands = logit.make_choices( - probs, trace_label=trace_label, trace_choosers=choosers + state, probs, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "positions", positions) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "positions", positions) + chunk_sizer.log_df(trace_label, "rands", rands) # need to get from an integer offset into the alternative sample to the alternative index # that is, we want the index value of the row that is offset by rows into the @@ -834,15 +852,15 @@ def _interaction_simulate( # create a series with index from choosers and the index of the chosen alternative choices = pd.Series(choices, index=choosers.index) - chunk.log_df(trace_label, "choices", choices) + chunk_sizer.log_df(trace_label, "choices", choices) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, tracing.extend_trace_label(trace_label, "choices"), columns=[None, trace_choice_name], ) - tracing.trace_df( + state.tracing.trace_df( rands, tracing.extend_trace_label(trace_label, "rands"), columns=[None, "rand"], @@ -852,6 +870,7 @@ def _interaction_simulate( def interaction_simulate( + state, choosers, alternatives, spec, @@ -864,7 +883,6 @@ def interaction_simulate( trace_choice_name=None, estimator=None, ): - """ Run a simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -919,11 +937,14 @@ def interaction_simulate( assert len(choosers) > 0 result_list = [] - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label - ): - + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, choosers, trace_label): choices = _interaction_simulate( + state, chooser_chunk, alternatives, spec, @@ -934,11 +955,12 @@ def interaction_simulate( trace_choice_name=trace_choice_name, log_alt_losers=log_alt_losers, estimator=estimator, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index ff3128450d..273e17c2d6 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -1,13 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import logging from builtins import object import numpy as np import pandas as pd -from . import config, pipeline, tracing -from .choosing import choice_maker +from activitysim.core import tracing, workflow +from activitysim.core.choosing import choice_maker logger = logging.getLogger(__name__) @@ -19,7 +21,13 @@ def report_bad_choices( - bad_row_map, df, trace_label, msg, trace_choosers=None, raise_error=True + state: workflow.State, + bad_row_map, + df, + trace_label, + msg, + trace_choosers=None, + raise_error=True, ): """ @@ -59,7 +67,7 @@ def report_bad_choices( if trace_label: logger.info("dumping %s" % trace_label) - tracing.write_csv(df[:MAX_DUMP], file_name=trace_label, transpose=False) + state.tracing.write_csv(df[:MAX_DUMP], file_name=trace_label, transpose=False) # log the indexes of the first MAX_DUMP offending rows for idx in df.index[:MAX_PRINT].values: @@ -116,6 +124,7 @@ def utils_to_logsums(utils, exponentiated=False, allow_zero_probs=False): def utils_to_probs( + state: workflow.State, utils, trace_label=None, exponentiated=False, @@ -130,7 +139,7 @@ def utils_to_probs( utils : pandas.DataFrame Rows should be choosers and columns should be alternatives. - trace_label : str + trace_label : str, optional label for tracing bad utility or probability values exponentiated : bool @@ -180,6 +189,7 @@ def utils_to_probs( zero_probs = arr_sum == 0.0 if zero_probs.any(): report_bad_choices( + state, zero_probs, utils, trace_label=tracing.extend_trace_label(trace_label, "zero_prob_utils"), @@ -190,6 +200,7 @@ def utils_to_probs( inf_utils = np.isinf(arr_sum) if inf_utils.any(): report_bad_choices( + state, inf_utils, utils, trace_label=tracing.extend_trace_label(trace_label, "inf_exp_utils"), @@ -214,7 +225,13 @@ def utils_to_probs( return probs -def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=False): +def make_choices( + state: workflow.State, + probs: pd.DataFrame, + trace_label: str = None, + trace_choosers=None, + allow_bad_probs=False, +) -> tuple[pd.Series, pd.Series]: """ Make choices for each chooser from among a set of alternatives. 
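+
+    A minimal usage sketch, assuming ``state`` is a ``workflow.State`` and
+    ``probs`` holds row-normalized probabilities (one row per chooser, one
+    column per alternative)::
+
+        positions, rands = make_choices(state, probs)
+
+    ``positions`` reports each chosen alternative as a column index into
+    ``probs`` and ``rands`` holds the uniform draws used to make the choices.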
@@ -252,6 +269,7 @@ def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=F if bad_probs.any() and not allow_bad_probs: report_bad_choices( + state, bad_probs, probs, trace_label=tracing.extend_trace_label(trace_label, "bad_probs"), @@ -259,7 +277,7 @@ def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=F trace_choosers=trace_choosers, ) - rands = pipeline.get_rn_generator().random_for_df(probs) + rands = state.get_rn_generator().random_for_df(probs) choices = pd.Series(choice_maker(probs.values, rands), index=probs.index) @@ -269,7 +287,12 @@ def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=F def interaction_dataset( - choosers, alternatives, sample_size=None, alt_index_id=None, chooser_index_id=None + state: workflow.State, + choosers, + alternatives, + sample_size=None, + alt_index_id=None, + chooser_index_id=None, ): """ Combine choosers and alternatives into one table for the purposes @@ -309,7 +332,7 @@ def interaction_dataset( alts_idx = np.arange(numalts) if sample_size < numalts: - sample = pipeline.get_rn_generator().choice_for_df( + sample = state.get_rn_generator().choice_for_df( choosers, alts_idx, sample_size, replace=False ) else: diff --git a/activitysim/core/los.py b/activitysim/core/los.py index c1c3a84517..9d21360983 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -1,16 +1,21 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import os import warnings +from pathlib import Path +from typing import Any import numpy as np import pandas as pd +from pydantic import ValidationError -from activitysim.core import skim_dataset # noqa: F401 -from activitysim.core import config, inject, pathbuilder, skim_dictionary, tracing, util +from activitysim.core import config, input, pathbuilder, skim_dictionary, tracing, util from activitysim.core.cleaning import recode_based_on_table +from activitysim.core.configuration.network import NetworkSettings, TAZ_Settings from activitysim.core.skim_dict_factory import MemMapSkimFactory, NumpyArraySkimFactory from activitysim.core.skim_dictionary import NOT_IN_SKIM_ZONE_ID @@ -70,8 +75,8 @@ class Network_LOS(object): tap_tap_uid: TapTapUidCalculator """ - def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): - + def __init__(self, state, los_settings_file_name=LOS_SETTINGS_FILE_NAME): + self.state = state # Note: we require all skims to be of same dtype so they can share buffer - is that ok? # fixme is it ok to require skims be all the same type? if so, is this the right choice? 
self.skim_dtype_name = "float32" @@ -93,6 +98,7 @@ def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): self.los_settings_file_name = los_settings_file_name self.load_settings() + self.sharrow_enabled = state.settings.sharrow # dependency injection of skim factory (of type specified in skim_dict_factory setting) skim_dict_factory_name = self.setting("skim_dict_factory") @@ -117,8 +123,16 @@ def rebuild_tvpb_cache(self): ), f"Should not even be asking about rebuild_tvpb_cache if not THREE_ZONE" return self.setting("rebuild_tvpb_cache") - def setting(self, keys, default=""): + def get_network_cache_dir(self) -> Path: + if self.los_settings.network_cache_dir: + result = self.state.filesystem.get_working_subdir( + self.los_settings.network_cache_dir + ) + result.mkdir(parents=True, exist_ok=True) + return result + return self.state.filesystem.get_cache_dir() + def setting(self, keys, default: Any = ""): # if they dont specify a default, check the default defaults default = ( DEFAULT_SETTINGS.get(keys, "") @@ -130,20 +144,29 @@ def setting(self, keys, default=""): key_list = keys.split(".") s = self.los_settings for key in key_list[:-1]: - s = s.get(key) - if default == "": - assert isinstance( - s, dict - ), f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" + if isinstance(s, dict): + s = s.get(key, None) + else: + s = getattr(s, key, None) + if default == "" and s is None: + raise ValueError( + f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" + ) + # assert isinstance( + # s, dict + # ), f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" key = key_list[-1] # last key if default == "": - assert ( - key in s - ), f"Expected setting {keys} not found in in {LOS_SETTINGS_FILE_NAME}" + if isinstance(s, dict): + assert ( + key in s + ), f"Expected setting {keys} not found in in {LOS_SETTINGS_FILE_NAME}" + else: + assert hasattr(s, key) if isinstance(s, dict): return s.get(key, default) else: - return default + return getattr(s, key, default) def load_settings(self): """ @@ -151,52 +174,19 @@ def load_settings(self): """ try: - self.los_settings = config.read_settings_file( - self.los_settings_file_name, mandatory=True - ) - except config.SettingsFileNotFound as e: - - print( - f"los_settings_file_name {self.los_settings_file_name} not found - trying global settings" - ) - print(f"skims_file: {config.setting('skims_file')}") - print(f"skim_time_periods: {config.setting('skim_time_periods')}") - print(f"source_file_paths: {config.setting('source_file_paths')}") - print( - f"inject.get_injectable('configs_dir') {inject.get_injectable('configs_dir')}" + self.los_settings = NetworkSettings.read_settings_file( + self.state.filesystem, + file_name=self.los_settings_file_name, + mandatory=True, ) - - # look for legacy 'skims_file' setting in global settings file - if config.setting("skims_file"): - - warnings.warn( - "Support for 'skims_file' setting in global settings file will be removed." - "Use 'taz_skims' in network_los.yaml config file instead.", - FutureWarning, - ) - - # in which case, we also expect to find skim_time_periods in settings file - skim_time_periods = config.setting("skim_time_periods") - assert ( - skim_time_periods is not None - ), "'skim_time_periods' setting not found." - warnings.warn( - "Support for 'skim_time_periods' setting in global settings file will be removed." 
- "Put 'skim_time_periods' in network_los.yaml config file instead.", - FutureWarning, - ) - - self.los_settings = { - "taz_skims": config.setting("skims_file"), - "zone_system": ONE_ZONE, - "skim_time_periods": skim_time_periods, - } - - else: - raise e + except ValidationError as err: + err_msg = str(err) + print(err_msg) + raise + self.state.network_settings = self.los_settings # validate skim_time_periods - self.skim_time_periods = self.setting("skim_time_periods") + self.skim_time_periods = self.state.network_settings.skim_time_periods if "hours" in self.skim_time_periods: self.skim_time_periods["periods"] = self.skim_time_periods.pop("hours") warnings.warn( @@ -242,16 +232,24 @@ def load_skim_info(self): """ assert self.skim_dict_factory is not None # load taz skim_info - self.skims_info["taz"] = self.skim_dict_factory.load_skim_info("taz") + self.skims_info["taz"] = self.skim_dict_factory.load_skim_info( + self.state, "taz" + ) if self.zone_system == THREE_ZONE: # load tap skim_info - self.skims_info["tap"] = self.skim_dict_factory.load_skim_info("tap") + self.skims_info["tap"] = self.skim_dict_factory.load_skim_info( + self.state, "tap" + ) if self.zone_system == THREE_ZONE: # load this here rather than in load_data as it is required during multiprocessing to size TVPBCache - self.tap_df = pd.read_csv( - config.data_file_path(self.setting("tap"), mandatory=True) + self.tap_df = input.read_input_file( + self.state.filesystem.get_data_file_path( + self.setting("tap"), + mandatory=True, + alternative_suffixes=(".csv.gz", ".parquet"), + ) ).sort_values("TAP") self.tvpb = pathbuilder.TransitVirtualPathBuilder( self @@ -264,11 +262,14 @@ def load_data(self): # load maz tables if self.zone_system in [TWO_ZONE, THREE_ZONE]: - # maz file_name = self.setting("maz") - self.maz_taz_df = pd.read_csv( - config.data_file_path(file_name, mandatory=True) + self.maz_taz_df = input.read_input_file( + self.state.filesystem.get_data_file_path( + file_name, + mandatory=True, + alternative_suffixes=(".csv.gz", ".parquet"), + ) ) self.maz_taz_df = self.maz_taz_df[["MAZ", "TAZ"]].sort_values( by="MAZ" @@ -276,10 +277,10 @@ def load_data(self): # recode MAZs if needed self.maz_taz_df["MAZ"] = recode_based_on_table( - self.maz_taz_df["MAZ"], "land_use" + self.state, self.maz_taz_df["MAZ"], "land_use" ) self.maz_taz_df["TAZ"] = recode_based_on_table( - self.maz_taz_df["TAZ"], "land_use_taz" + self.state, self.maz_taz_df["TAZ"], "land_use_taz" ) self.maz_ceiling = self.maz_taz_df.MAZ.max() + 1 @@ -292,10 +293,30 @@ def load_data(self): else maz_to_maz_tables ) for file_name in maz_to_maz_tables: + df = input.read_input_file( + self.state.filesystem.get_data_file_path( + file_name, + mandatory=True, + alternative_suffixes=(".csv.gz", ".parquet"), + ) + ) - df = pd.read_csv(config.data_file_path(file_name, mandatory=True)) - - df["i"] = df.OMAZ * self.maz_ceiling + df.DMAZ + # recode MAZs if needed + df["OMAZ"] = recode_based_on_table(self.state, df["OMAZ"], "land_use") + df["DMAZ"] = recode_based_on_table(self.state, df["DMAZ"], "land_use") + + if self.maz_ceiling > (1 << 31): + raise ValueError("maz ceiling too high, will overflow int64") + elif self.maz_ceiling > 32767: + # too many MAZs, or un-recoded MAZ ID's that are too large + # will overflow a 32-bit index, so upgrade to 64bit. 
+ df["i"] = df.OMAZ.astype(np.int64) * np.int64( + self.maz_ceiling + ) + df.DMAZ.astype(np.int64) + else: + df["i"] = df.OMAZ.astype(np.int32) * np.int32( + self.maz_ceiling + ) + df.DMAZ.astype(np.int32) df.set_index("i", drop=True, inplace=True, verify_integrity=True) logger.debug( f"loading maz_to_maz table {file_name} with {len(df)} rows" @@ -315,38 +336,45 @@ def load_data(self): # load tap tables if self.zone_system == THREE_ZONE: - # tap_df should already have been loaded by load_skim_info because, # during multiprocessing, it is required by TapTapUidCalculator to size TVPBCache - # self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True)) + # self.tap_df = pd.read_csv(self.state.filesystem.get_data_file_path(self.setting('tap'), mandatory=True)) assert self.tap_df is not None # maz_to_tap_dfs - different sized sparse arrays with different columns, so we keep them seperate for mode, maz_to_tap_settings in self.setting("maz_to_tap").items(): - assert ( "table" in maz_to_tap_settings ), f"Expected setting maz_to_tap.{mode}.table not found in in {LOS_SETTINGS_FILE_NAME}" file_name = maz_to_tap_settings["table"] - df = pd.read_csv(config.data_file_path(file_name, mandatory=True)) + df = input.read_input_file( + self.state.filesystem.get_data_file_path( + file_name, + mandatory=True, + alternative_suffixes=(".csv.gz", ".parquet"), + ) + ) # recode MAZs if needed - df["MAZ"] = recode_based_on_table(df["MAZ"], "land_use") + df["MAZ"] = recode_based_on_table(self.state, df["MAZ"], "land_use") # trim tap set # if provided, use tap_line_distance_col together with tap_lines table to trim the near tap set # to only include the nearest tap to origin when more than one tap serves the same line distance_col = maz_to_tap_settings.get("tap_line_distance_col") if distance_col: - if self.tap_lines_df is None: # load tap_lines on demand (required if they specify tap_line_distance_col) tap_lines_file_name = self.setting( "tap_lines", ) - self.tap_lines_df = pd.read_csv( - config.data_file_path(tap_lines_file_name, mandatory=True) + self.tap_lines_df = input.read_input_file( + self.state.filesystem.get_data_file_path( + tap_lines_file_name, + mandatory=True, + alternative_suffixes=(".csv.gz", ".parquet"), + ) ) # csv file has one row per TAP with space-delimited list of lines served by that TAP @@ -402,7 +430,7 @@ def load_data(self): ) if TRACE_TRIMMED_MAZ_TO_TAP_TABLES: - tracing.write_csv( + self.state.tracing.write_csv( df, file_name=f"trimmed_{maz_to_tap_settings['table']}", transpose=False, @@ -423,11 +451,11 @@ def load_data(self): self.maz_to_tap_dfs[mode] = df # create taz skim dict - if not config.setting("sharrow", False): + if not self.sharrow_enabled: assert "taz" not in self.skim_dicts # If offset_preprocessing was completed, then TAZ values # will be pre-offset and there's no need to re-offset them. 
- if config.setting("offset_preprocessing", False): + if self.state.settings.offset_preprocessing: _override_offset_int = 0 else: _override_offset_int = None @@ -441,7 +469,7 @@ def load_data(self): # create MazSkimDict facade if self.zone_system in [TWO_ZONE, THREE_ZONE]: - if not config.setting("sharrow", False): + if not self.sharrow_enabled: # create MazSkimDict facade skim_dict # (must have already loaded dependencies: taz skim_dict, maz_to_maz_df, and maz_taz_df) assert "maz" not in self.skim_dicts @@ -461,7 +489,7 @@ def load_data(self): # create tap skim dict if self.zone_system == THREE_ZONE: - if not config.setting("sharrow", False): + if not self.sharrow_enabled: assert "tap" not in self.skim_dicts tap_skim_dict = self.create_skim_dict("tap") self.skim_dicts["tap"] = tap_skim_dict @@ -473,6 +501,18 @@ def load_data(self): else: self.skim_dicts["tap"] = self.get_skim_dict("tap") + # check that the number of rows in land_use_taz matches the number of zones in the skims + if "land_use_taz" in self.state: + skims = self.get_skim_dict("taz") + if hasattr(skims, "zone_ids"): # SkimDict + assert len(skims.zone_ids) == len( + self.state.get_dataframe("land_use_taz") + ) + else: # SkimDataset + assert len(skims.dataset.indexes["otaz"]) == len( + self.state.get_dataframe("land_use_taz") + ) + def create_skim_dict(self, skim_tag, _override_offset_int=None): """ Create a new SkimDict of type specified by skim_tag (e.g. 'taz', 'maz' or 'tap') @@ -501,11 +541,15 @@ def create_skim_dict(self, skim_tag, _override_offset_int=None): "taz" in self.skim_dicts ), f"create_skim_dict 'maz': backing taz skim_dict not in skim_dicts" taz_skim_dict = self.skim_dicts["taz"] - skim_dict = skim_dictionary.MazSkimDict("maz", self, taz_skim_dict) + skim_dict = skim_dictionary.MazSkimDict( + self.state, "maz", self, taz_skim_dict + ) else: skim_info = self.skims_info[skim_tag] skim_data = self.skim_dict_factory.get_skim_data(skim_tag, skim_info) - skim_dict = skim_dictionary.SkimDict(skim_tag, skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict( + self.state, skim_tag, skim_info, skim_data + ) logger.debug(f"create_skim_dict {skim_tag} omx_shape {skim_dict.omx_shape}") @@ -529,6 +573,8 @@ def omx_file_names(self, skim_tag): list of str """ file_names = self.setting(f"{skim_tag}_skims") + if isinstance(file_names, TAZ_Settings): + file_names = file_names.omx if isinstance(file_names, dict): for i in ("file", "files", "omx"): if i in file_names: @@ -551,11 +597,13 @@ def zarr_file_name(self, skim_tag): Returns ------- - list of str + str """ skim_setting = self.setting(f"{skim_tag}_skims") if isinstance(skim_setting, dict): return skim_setting.get("zarr", None) + elif isinstance(skim_setting, TAZ_Settings): + return skim_setting.zarr else: return None @@ -597,7 +645,7 @@ def multiprocess(self): ------- bool """ - is_multiprocess = config.setting("multiprocess", False) + is_multiprocess = self.state.settings.multiprocess return is_multiprocess def load_shared_data(self, shared_data_buffers): @@ -625,7 +673,7 @@ def load_shared_data(self, shared_data_buffers): if self.zone_system == THREE_ZONE: assert self.tvpb is not None - if self.rebuild_tvpb_cache and not config.setting("resume_after", None): + if self.rebuild_tvpb_cache and not self.state.settings.resume_after: # delete old cache at start of new run so that stale cache is not loaded by load_data_to_buffer # when singleprocess, this call is made (later in program flow) in the initialize_los step self.tvpb.tap_cache.cleanup() @@ -678,12 +726,12 @@ def 
get_skim_dict(self, skim_tag): ------- SkimDict or subclass (e.g. MazSkimDict) """ - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = self.sharrow_enabled if sharrow_enabled and skim_tag in ("taz", "maz"): - skim_dataset = inject.get_injectable("skim_dataset") # non-global import avoids circular references from .skim_dataset import SkimDataset + skim_dataset = self.state.get_injectable("skim_dataset") if skim_tag == "maz": return SkimDataset(skim_dataset) else: @@ -694,7 +742,7 @@ def get_skim_dict(self, skim_tag): del skim_dataset.attrs[f"dim_redirection_{dd}"] return SkimDataset(skim_dataset) elif sharrow_enabled and skim_tag in ("tap"): - tap_dataset = inject.get_injectable("tap_dataset") + tap_dataset = self.state.get_injectable("tap_dataset") from .skim_dataset import SkimDataset return SkimDataset(tap_dataset) @@ -797,7 +845,7 @@ def get_tappairs3d(self, otap, dtap, dim3, key): return s.values - def skim_time_period_label(self, time_period): + def skim_time_period_label(self, time_period, fillna=None): """ convert time period times to skim time period labels (e.g. 9 -> 'AM') @@ -835,24 +883,32 @@ def skim_time_period_label(self, time_period): )[0] - 1 ) - result = self.skim_time_periods["labels"][bin] + if fillna is not None: + default = self.skim_time_periods["labels"][fillna] + result = self.skim_time_periods["labels"].get(bin, default=default) + else: + result = self.skim_time_periods["labels"][bin] else: result = pd.cut( time_period, self.skim_time_periods["periods"], labels=self.skim_time_periods["labels"], ordered=False, - ).astype(str) + ) + if fillna is not None: + default = self.skim_time_periods["labels"][fillna] + result = result.fillna(default) + result = result.astype(str) return result - def get_tazs(self): + def get_tazs(self, state): # FIXME - should compute on init? if self.zone_system == ONE_ZONE: - tazs = inject.get_table("land_use").index.values + tazs = state.get_dataframe("land_use").index.values else: try: - land_use_taz = inject.get_table("land_use_taz").to_frame() + land_use_taz = state.get_dataframe("land_use_taz") except (RuntimeError, KeyError): # land_use_taz is missing, use fallback tazs = self.maz_taz_df.TAZ.unique() @@ -878,17 +934,15 @@ def get_taps(self): assert isinstance(taps, np.ndarray) return taps - @property - def get_maz_to_taz_series(self): + def get_maz_to_taz_series(self, state): """ pd.Series: Index is the MAZ, value is the corresponding TAZ """ - sharrow_enabled = config.setting("sharrow", False) - if sharrow_enabled: + if self.sharrow_enabled: # FIXME:SHARROW - this assumes that both MAZ and TAZ have been recoded to # zero-based indexes, but what if that was not done? # Should we check it and error out here or bravely march forward? - skim_dataset = inject.get_injectable("skim_dataset") + skim_dataset = state.get_injectable("skim_dataset") maz_to_taz = skim_dataset["_digitized_otaz_of_omaz"].to_series() else: maz_to_taz = self.maz_taz_df[["MAZ", "TAZ"]].set_index("MAZ").TAZ @@ -913,7 +967,7 @@ def map_maz_to_taz(self, s): input_was_series = False else: input_was_series = True - out = s.map(self.get_maz_to_taz_series) + out = s.map(self.get_maz_to_taz_series(self.state)) if np.issubdtype(out, np.floating): if out.isna().any(): raise KeyError("failed in mapping MAZ to TAZ") diff --git a/activitysim/core/mem.py b/activitysim/core/mem.py index ae832f250c..d17b84adc8 100644 --- a/activitysim/core/mem.py +++ b/activitysim/core/mem.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. 
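One behavioral addition in the Network_LOS changes above worth illustrating: skim_time_period_label now takes an optional fillna argument, an index into skim_time_periods["labels"], so times outside the configured periods fall back to that label instead of ending up as NaN. A rough sketch, with invented times and assuming `network_los` was built as above:

    import pandas as pd

    labels = network_los.skim_time_period_label(pd.Series([7, 12, 18]))
    labels_with_default = network_los.skim_time_period_label(
        pd.Series([7, 12, 99]),   # 99 is outside the configured periods (invented)
        fillna=0,                 # fall back to skim_time_periods["labels"][0]
    )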
+from __future__ import annotations import datetime import gc @@ -14,7 +15,7 @@ import pandas as pd import psutil -from activitysim.core import config, inject, util +from activitysim.core import config, util, workflow logger = logging.getLogger(__name__) @@ -44,28 +45,30 @@ def time_bin(timestamps): return pd.to_datetime(bin, unit="s", origin="unix") -def consolidate_logs(): +def consolidate_logs(state: workflow.State): """ Consolidate and aggregate subprocess mem logs """ - if not config.setting("multiprocess", False): + if not state.settings.multiprocess: return - delete_originals = not config.setting("keep_mem_logs", False) + delete_originals = not state.settings.keep_mem_logs omnibus_df = [] # for each multiprocess step - multiprocess_steps = config.setting("multiprocess_steps", []) + multiprocess_steps = state.settings.multiprocess_steps + if multiprocess_steps is not None: + multiprocess_steps = [i.dict() for i in multiprocess_steps] for step in multiprocess_steps: step_name = step.get("name", None) logger.debug(f"mem.consolidate_logs for step {step_name}") - glob_file_name = config.log_file_path( + glob_file_name = state.get_log_file_path( f"{step_name}*{MEM_LOG_FILE_NAME}", prefix=False ) - glob_files = glob.glob(glob_file_name) + glob_files = glob.glob(str(glob_file_name)) if not glob_files: continue @@ -126,7 +129,7 @@ def consolidate_logs(): util.delete_files(glob_files, f"mem.consolidate_logs.{step_name}") # write aggregate step log - output_path = config.log_file_path(f"mem_{step_name}.csv", prefix=False) + output_path = state.get_log_file_path(f"mem_{step_name}.csv", prefix=False) logger.debug( f"chunk.consolidate_logs writing step summary log for step {step_name} to {output_path}" ) @@ -138,7 +141,7 @@ def consolidate_logs(): omnibus_df = pd.concat(omnibus_df) omnibus_df = omnibus_df.sort_values("time") - output_path = config.log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) + output_path = state.get_log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) logger.debug(f"chunk.consolidate_logs writing omnibus log to {output_path}") omnibus_df.to_csv(output_path, mode="w", index=False) @@ -173,10 +176,13 @@ def log_global_hwm(): ) -def trace_memory_info(event, trace_ticks=0, force_garbage_collect=False): +def trace_memory_info(event, trace_ticks=0, force_garbage_collect=False, *, state): global MEM_TICK + if state is None: + raise ValueError("state cannot be None") + tick = time.time() if trace_ticks and (tick - MEM_TICK < trace_ticks): return @@ -235,9 +241,14 @@ def trace_memory_info(event, trace_ticks=0, force_garbage_collect=False): with mem_log_lock: MEM_LOG_HEADER = "process,pid,rss,full_rss,uss,event,children,time" - with config.open_log_file( - MEM_LOG_FILE_NAME, "a", header=MEM_LOG_HEADER, prefix=True - ) as log_file: + log_file = state.filesystem.open_log_file( + MEM_LOG_FILE_NAME, + "a", + header=MEM_LOG_HEADER, + prefix=state.get("log_file_prefix", None), + ) + + with log_file: print( f"{process_name}," f"{pid}," @@ -276,7 +287,7 @@ def get_rss(force_garbage_collect=False, uss=False): return info.rss, 0 -def shared_memory_size(data_buffers=None): +def shared_memory_size(data_buffers): """ return total size of the multiprocessing shared memory block in data_buffers @@ -288,7 +299,7 @@ def shared_memory_size(data_buffers=None): shared_size = 0 if data_buffers is None: - data_buffers = inject.get_injectable("data_buffers", {}) + data_buffers = {} for k, data_buffer in data_buffers.items(): if isinstance(data_buffer, str) and data_buffer.startswith("sh.Dataset:"): diff 
--git a/activitysim/core/memory_sidecar.py b/activitysim/core/memory_sidecar.py index b1d8816e36..7ad8d35ffd 100644 --- a/activitysim/core/memory_sidecar.py +++ b/activitysim/core/memory_sidecar.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datetime import os import time diff --git a/activitysim/core/mp_tasks.py b/activitysim/core/mp_tasks.py index ebbf1cb37d..7d1ffc30e3 100644 --- a/activitysim/core/mp_tasks.py +++ b/activitysim/core/mp_tasks.py @@ -1,5 +1,8 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + +import glob import importlib import logging import multiprocessing @@ -8,13 +11,21 @@ import time import traceback from collections import OrderedDict +from pathlib import Path import numpy as np import pandas as pd import yaml -from activitysim.core import config, inject, mem, pipeline, tracing, util -from activitysim.core.config import setting +from activitysim.core import config, mem, tracing, util, workflow +from activitysim.core.configuration import FileSystem, Settings +from activitysim.core.workflow.checkpoint import ( + CHECKPOINT_NAME, + CHECKPOINT_TABLE_NAME, + FINAL_CHECKPOINT_NAME, + NON_TABLE_COLUMNS, + ParquetStore, +) logger = logging.getLogger(__name__) @@ -196,15 +207,14 @@ # FIXME - pathological knowledge of abm.tables.skims and abm.tables.shadow_pricing (see note above) -def log(msg, level, write_to_log_file=True): - +def log(state: workflow.State, msg, level, write_to_log_file=True): process_name = multiprocessing.current_process().name if not write_to_log_file: print(f"############ mp_tasks - {process_name} - {msg}") if write_to_log_file: - with config.open_log_file("mp_tasks_log.txt", "a") as log_file: + with state.filesystem.open_log_file("mp_tasks_log.txt", "a") as log_file: print(f"mp_tasks - {process_name} - {msg}", file=log_file) if write_to_log_file: @@ -212,31 +222,30 @@ def log(msg, level, write_to_log_file=True): logger.log(level, msg) -def debug(msg, write_to_log_file=True): - log(msg, level=logging.DEBUG, write_to_log_file=write_to_log_file) +def debug(state: workflow.State, msg, write_to_log_file=True): + log(state, msg, level=logging.DEBUG, write_to_log_file=write_to_log_file) -def info(msg, write_to_log_file=True): - log(msg, level=logging.INFO, write_to_log_file=write_to_log_file) +def info(state: workflow.State, msg, write_to_log_file=True): + log(state, msg, level=logging.INFO, write_to_log_file=write_to_log_file) -def warning(msg, write_to_log_file=True): - log(msg, level=logging.WARNING, write_to_log_file=write_to_log_file) +def warning(state: workflow.State, msg, write_to_log_file=True): + log(state, msg, level=logging.WARNING, write_to_log_file=write_to_log_file) -def error(msg, write_to_log_file=True): - log(msg, level=logging.ERROR, write_to_log_file=write_to_log_file) +def error(state: workflow.State, msg, write_to_log_file=True): + log(state, msg, level=logging.ERROR, write_to_log_file=write_to_log_file) -def exception(msg, write_to_log_file=True): - +def exception(state: workflow.State, msg, write_to_log_file=True): process_name = multiprocessing.current_process().name if not write_to_log_file: print(f"mp_tasks - {process_name} - {msg}") print(f"---\n{traceback.format_exc()}---") - with config.open_log_file("mp_tasks_log.txt", "a") as log_file: + with state.filesystem.open_log_file("mp_tasks_log.txt", "a") as log_file: print(f"---\nmp_tasks - {process_name} - {msg}", file=log_file) traceback.print_exc(limit=10, file=log_file) print("---", file=log_file) @@ -273,28 +282,21 @@ def 
pipeline_table_keys(pipeline_store): """ - checkpoints = pipeline_store[pipeline.CHECKPOINT_TABLE_NAME] - - # don't currently need this capability... - # if checkpoint_name: - # # specified checkpoint row as series - # i = checkpoints[checkpoints[pipeline.CHECKPOINT_NAME] == checkpoint_name].index[0] - # checkpoint = checkpoints.loc[i] - # else: + checkpoints = pipeline_store[CHECKPOINT_TABLE_NAME] # last checkpoint row as series checkpoint = checkpoints.iloc[-1] - checkpoint_name = checkpoint.loc[pipeline.CHECKPOINT_NAME] + checkpoint_name = checkpoint.loc[CHECKPOINT_NAME] # series with table name as index and checkpoint_name as value - checkpoint_tables = checkpoint[~checkpoint.index.isin(pipeline.NON_TABLE_COLUMNS)] + checkpoint_tables = checkpoint[~checkpoint.index.isin(NON_TABLE_COLUMNS)] # omit dropped tables with empty checkpoint name checkpoint_tables = checkpoint_tables[checkpoint_tables != ""] # hdf5 key is / checkpoint_tables = { - table_name: pipeline.pipeline_table_key(table_name, checkpoint_name) + table_name: workflow.State.pipeline_table_key(None, table_name, checkpoint_name) for table_name, checkpoint_name in checkpoint_tables.items() } @@ -302,7 +304,51 @@ def pipeline_table_keys(pipeline_store): return checkpoint_name, checkpoint_tables -def build_slice_rules(slice_info, pipeline_tables): +def parquet_pipeline_table_keys(pipeline_path: Path): + """ + return dict of current (as of last checkpoint) pipeline tables + and their checkpoint-specific hdf5_keys + + This facilitates reading pipeline tables directly from a 'raw' open pandas.HDFStore without + opening it as a pipeline (e.g. when apportioning and coalescing pipelines) + + We currently only ever need to do this from the last checkpoint, so the ability to specify + checkpoint_name is not required, and thus omitted. 
+ + Returns + ------- + checkpoint_name : name of the checkpoint + checkpoint_tables : dict {: } + + """ + checkpoints = ParquetStore(pipeline_path).get_dataframe(CHECKPOINT_TABLE_NAME) + # pd.read_parquet( + # pipeline_path.joinpath(CHECKPOINT_TABLE_NAME, "None.parquet") + # ) + + # last checkpoint row as series + checkpoint = checkpoints.iloc[-1] + checkpoint_name = checkpoint.loc[CHECKPOINT_NAME] + + # series with table name as index and checkpoint_name as value + checkpoint_tables = checkpoint[~checkpoint.index.isin(NON_TABLE_COLUMNS)] + + # omit dropped tables with empty checkpoint name + checkpoint_tables = checkpoint_tables[checkpoint_tables != ""] + + # hdf5 key is / + checkpoint_tables = { + table_name: ParquetStore(pipeline_path) + ._store_table_path(table_name, checkpoint_name) + .relative_to(ParquetStore(pipeline_path)._directory) + for table_name, checkpoint_name in checkpoint_tables.items() + } + + # checkpoint name and series mapping table name to path for tables in that checkpoint + return checkpoint_name, checkpoint_tables + + +def build_slice_rules(state: workflow.State, slice_info, pipeline_tables): """ based on slice_info for current step from run_list, generate a recipe for slicing the tables in the pipeline (passed in tables parameter) @@ -383,6 +429,8 @@ def build_slice_rules(slice_info, pipeline_tables): slicer_table_names = slice_info["tables"] slicer_table_exceptions = slice_info.get("exclude", slice_info.get("except", [])) + if slicer_table_exceptions is None: + slicer_table_exceptions = [] primary_slicer = slicer_table_names[0] # - ensure that tables listed in slice_info appear in correct order and before any others @@ -400,7 +448,8 @@ def build_slice_rules(slice_info, pipeline_tables): # So don't change this behavior withoyt testing populationsim multiprocess! 
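build_slice_rules, which now also takes the State for logging, turns a step's slice_info into a per-table slicing recipe. An invented example of the shapes involved (table and column names are hypothetical, not taken from any particular model):

    # hypothetical slice_info from a multiprocess_steps entry
    slice_info = {
        "tables": ["households", "persons"],   # primary slicer listed first
        "exclude": ["land_use"],               # mirrored table, never sliced
    }
    # the resulting rules map each loaded pipeline table to roughly:
    #   households -> {"slice_by": "primary"}
    #   persons    -> {"slice_by": "column", "column": "household_id", "source": "households"}
    #   land_use   -> {"slice_by": None}       # copied as-is into every subprocess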
if slicer_table_exceptions is True: debug( - f"slice.except wildcard (True): excluding all tables not explicitly listed in slice.tables" + state, + f"slice.except wildcard (True): excluding all tables not explicitly listed in slice.tables", ) slicer_table_exceptions = [t for t in tables if t not in slicer_table_names] @@ -414,7 +463,6 @@ def build_slice_rules(slice_info, pipeline_tables): # build slice rules for loaded tables slice_rules = OrderedDict() for table_name, df in tables.items(): - rule = {} if table_name == primary_slicer: # slice primary apportion table @@ -454,14 +502,15 @@ def build_slice_rules(slice_info, pipeline_tables): for table_name, rule in slice_rules.items(): if rule["slice_by"] is not None: debug( - f"### table_name: {table_name} slice_rules: {slice_rules[table_name]}" + state, + f"### table_name: {table_name} slice_rules: {slice_rules[table_name]}", ) - debug(f"### slicer_ref_cols: {slicer_ref_cols}") + debug(state, f"### slicer_ref_cols: {slicer_ref_cols}") return slice_rules -def apportion_pipeline(sub_proc_names, step_info): +def apportion_pipeline(state: workflow.State, sub_proc_names, step_info): """ apportion pipeline for multiprocessing step @@ -472,7 +521,7 @@ def apportion_pipeline(sub_proc_names, step_info): Parameters ---------- - sub_proc_names : list of str + sub_proc_names : list[str] names of the sub processes to apportion step_info : dict step_info from multiprocess_steps for step we are apportioning pipeline tables for @@ -481,26 +530,28 @@ def apportion_pipeline(sub_proc_names, step_info): ------- creates apportioned pipeline files for each sub job """ - slice_info = step_info.get("slice", None) + if slice_info is None: + raise RuntimeError("missing slice_info.slice") multiprocess_step_name = step_info.get("name", None) - pipeline_file_name = inject.get_injectable("pipeline_file_name") + pipeline_file_name = state.get_injectable("pipeline_file_name") # ensure that if we are resuming, we don't apportion any tables from future model steps last_checkpoint_in_previous_multiprocess_step = step_info.get( "last_checkpoint_in_previous_multiprocess_step", None ) - assert last_checkpoint_in_previous_multiprocess_step is not None - pipeline.open_pipeline(resume_after=last_checkpoint_in_previous_multiprocess_step) + if last_checkpoint_in_previous_multiprocess_step is None: + raise RuntimeError("missing last_checkpoint_in_previous_multiprocess_step") + state.checkpoint.restore(resume_after=last_checkpoint_in_previous_multiprocess_step) # ensure all tables are in the pipeline - checkpointed_tables = pipeline.checkpointed_tables() + checkpointed_tables = state.checkpoint.list_tables() for table_name in slice_info["tables"]: if table_name not in checkpointed_tables: raise RuntimeError(f"slicer table {table_name} not found in pipeline") - checkpoints_df = pipeline.get_checkpoints() + checkpoints_df = state.checkpoint.get_inventory() # for the subprocess pipelines, keep only the last row of checkpoints and patch the last checkpoint name checkpoints_df = checkpoints_df.tail(1).copy() @@ -512,46 +563,120 @@ def apportion_pipeline(sub_proc_names, step_info): # patch last checkpoint name for all tables checkpoints_df[table_name] = checkpoint_name # load the dataframe - tables[table_name] = pipeline.get_table(table_name) + tables[table_name] = state.checkpoint.load_dataframe(table_name) - debug(f"loaded table {table_name} {tables[table_name].shape}") + debug(state, f"loaded table {table_name} {tables[table_name].shape}") - pipeline.close_pipeline() + 
state.checkpoint.close_store() # should only be one checkpoint (named ) assert len(checkpoints_df) == 1 # - build slice rules for loaded tables - slice_rules = build_slice_rules(slice_info, tables) + slice_rules = build_slice_rules(state, slice_info, tables) # - allocate sliced tables for each sub_proc num_sub_procs = len(sub_proc_names) for i in range(num_sub_procs): - # use well-known pipeline file name process_name = sub_proc_names[i] - pipeline_path = config.build_output_file_path( - pipeline_file_name, use_prefix=process_name + pipeline_path = state.get_output_file_path( + pipeline_file_name, prefix=process_name ) - # remove existing file - try: - os.unlink(pipeline_path) - except OSError: - pass + if state.settings.checkpoint_format == "hdf": + # remove existing file + try: + os.unlink(pipeline_path) + except OSError: + pass + + with pd.HDFStore(str(pipeline_path), mode="a") as pipeline_store: + # remember sliced_tables so we can cascade slicing to other tables + sliced_tables = {} + + # - for each table in pipeline + for table_name, rule in slice_rules.items(): + df = tables[table_name] + + if rule["slice_by"] is not None and num_sub_procs > len(df): + # almost certainly a configuration error + raise RuntimeError( + f"apportion_pipeline: multiprocess step {multiprocess_step_name} " + f"slice table {table_name} has fewer rows {df.shape} " + f"than num_processes ({num_sub_procs})." + ) + + if rule["slice_by"] == "primary": + # slice primary apportion table by num_sub_procs strides + # this hopefully yields a more random distribution + # (e.g.) households are ordered by size in input store + # we are assuming that the primary table index is unique + # otherwise we should slice by strides in df.index.unique + # we could easily work around this, but it seems likely this was an error on the user's part + assert not df.index.duplicated().any() + + primary_df = df[ + np.asanyarray(list(range(df.shape[0]))) % num_sub_procs == i + ] + sliced_tables[table_name] = primary_df + elif rule["slice_by"] == "index": + # slice a table with same index name as a known slicer + source_df = sliced_tables[rule["source"]] + sliced_tables[table_name] = df.loc[source_df.index] + elif rule["slice_by"] == "column": + # slice a table with a recognized slicer_column + source_df = sliced_tables[rule["source"]] + sliced_tables[table_name] = df[ + df[rule["column"]].isin(source_df.index) + ] + elif rule["slice_by"] is None: + # don't slice mirrored tables + sliced_tables[table_name] = df + else: + raise RuntimeError( + "Unrecognized slice rule '%s' for table %s" + % (rule["slice_by"], table_name) + ) - with pd.HDFStore(pipeline_path, mode="a") as pipeline_store: + # - write table to pipeline + hdf5_key = state.pipeline_table_key(table_name, checkpoint_name) + pipeline_store[hdf5_key] = sliced_tables[table_name] + + debug( + state, + f"writing checkpoints ({checkpoints_df.shape}) " + f"to {CHECKPOINT_TABLE_NAME} in {pipeline_path}", + ) + pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df + else: + # remove existing parquet files and directories + for pq_file in glob.glob(str(pipeline_path.joinpath("*", "*.parquet"))): + try: + os.unlink(pq_file) + except OSError: + pass + for pq_dir in glob.glob(str(pipeline_path.joinpath("*", "*"))): + try: + os.rmdir(pq_dir) + except OSError: + pass + for pq_dir in glob.glob(str(pipeline_path.joinpath("*"))): + try: + os.rmdir(pq_dir) + except OSError: + pass # remember sliced_tables so we can cascade slicing to other tables sliced_tables = {} + # YO pipeline_store + # - for 
each table in pipeline for table_name, rule in slice_rules.items(): - df = tables[table_name] if rule["slice_by"] is not None and num_sub_procs > len(df): - # almost certainly a configuration error raise RuntimeError( f"apportion_pipeline: multiprocess step {multiprocess_step_name} " @@ -592,17 +717,37 @@ def apportion_pipeline(sub_proc_names, step_info): ) # - write table to pipeline - hdf5_key = pipeline.pipeline_table_key(table_name, checkpoint_name) - pipeline_store[hdf5_key] = sliced_tables[table_name] + pipeline_path.joinpath(table_name).mkdir(parents=True, exist_ok=True) + + ParquetStore(pipeline_path).put( + table_name=table_name, + df=sliced_tables[table_name], + checkpoint_name=checkpoint_name, + ) + # + # sliced_tables[table_name].to_parquet( + # pipeline_path.joinpath(table_name, f"{checkpoint_name}.parquet") + # ) debug( + state, f"writing checkpoints ({checkpoints_df.shape}) " - f"to {pipeline.CHECKPOINT_TABLE_NAME} in {pipeline_path}" + f"to {CHECKPOINT_TABLE_NAME} in {pipeline_path}", + ) + pipeline_path.joinpath(CHECKPOINT_TABLE_NAME).mkdir( + parents=True, exist_ok=True + ) + ParquetStore(pipeline_path).put( + table_name=CHECKPOINT_TABLE_NAME, + df=checkpoints_df, + checkpoint_name=None, ) - pipeline_store[pipeline.CHECKPOINT_TABLE_NAME] = checkpoints_df + # checkpoints_df.to_parquet( + # pipeline_path.joinpath(CHECKPOINT_TABLE_NAME, f"None.parquet") + # ) -def coalesce_pipelines(sub_proc_names, slice_info): +def coalesce_pipelines(state: workflow.State, sub_proc_names, slice_info): """ Coalesce the data in the sub_processes apportioned pipelines back into a single pipeline @@ -613,7 +758,7 @@ def coalesce_pipelines(sub_proc_names, slice_info): Parameters ---------- - sub_proc_names : list of str + sub_proc_names : list[str] slice_info : dict slice_info from multiprocess_steps @@ -622,31 +767,37 @@ def coalesce_pipelines(sub_proc_names, slice_info): creates an omnibus pipeline with coalesced data from individual sub_proc pipelines """ - pipeline_file_name = inject.get_injectable("pipeline_file_name") + pipeline_file_name = state.get_injectable("pipeline_file_name") - debug(f"coalesce_pipelines to: {pipeline_file_name}") + debug(state, f"coalesce_pipelines to: {pipeline_file_name}") # - read all tables from first process pipeline # FIXME - note: assumes any new tables will be present in ALL subprocess pipelines tables = {} - pipeline_path = config.build_output_file_path( - pipeline_file_name, use_prefix=sub_proc_names[0] + pipeline_path = state.get_output_file_path( + pipeline_file_name, prefix=sub_proc_names[0] ) - with pd.HDFStore(pipeline_path, mode="r") as pipeline_store: + if state.settings.checkpoint_format == "hdf": + with pd.HDFStore(str(pipeline_path), mode="r") as pipeline_store: + # hdf5_keys is a dict mapping table_name to pipeline hdf5_key + checkpoint_name, hdf5_keys = pipeline_table_keys(pipeline_store) - # hdf5_keys is a dict mapping table_name to pipeline hdf5_key - checkpoint_name, hdf5_keys = pipeline_table_keys(pipeline_store) - - for table_name, hdf5_key in hdf5_keys.items(): - debug(f"loading table {table_name} {hdf5_key}") - tables[table_name] = pipeline_store[hdf5_key] + for table_name, hdf5_key in hdf5_keys.items(): + debug(state, f"loading table {table_name} {hdf5_key}") + tables[table_name] = pipeline_store[hdf5_key] + else: + checkpoint_name, hdf5_keys = parquet_pipeline_table_keys(pipeline_path) + pqstore = ParquetStore(pipeline_path, mode="r") + for table_name, parquet_path in hdf5_keys.items(): + debug(state, f"loading table {table_name} from 
{pqstore.filename}") + tables[table_name] = pqstore.get_dataframe(table_name) # slice.coalesce is an override list of omnibus tables created by subprocesses that should be coalesced, # whether or not they satisfy the slice rules. Ordinarily all tables qualify for slicing by the slice rules # will be coalesced, including any new tables created by the subprocess that have sliceable indexes or ref_cols. # Any other new tables that don't match the slice rules will be considered mirrored. This is usually the desired - # behavior, especially in activitysim abm models. However, if the "slice.except: True" wildcard is used, it + # behavior, especially in activitysim abm models. However, if the "slice.exclude: True" wildcard is used, it # prevents the inference for newly generated tables, and this directive permits explicit specification of # which new tables to coalesce. Populationsim uses this wildcard except directives to avoid having to list # many slice exceptions, and just lists weigh tables to coalesce. So don't change this behavior without testing @@ -663,7 +814,7 @@ def coalesce_pipelines(sub_proc_names, slice_info): # - use slice rules followed by apportion_pipeline to identify mirrored tables # (tables that are identical in every pipeline and so don't need to be concatenated) - slice_rules = build_slice_rules(slice_info, tables) + slice_rules = build_slice_rules(state, slice_info, tables) # table is mirrored if no slice rule or explicitly listed in slice_info.coalesce setting mirrored_table_names = [ @@ -674,43 +825,48 @@ def coalesce_pipelines(sub_proc_names, slice_info): mirrored_tables = {t: tables[t] for t in mirrored_table_names} omnibus_keys = {t: k for t, k in hdf5_keys.items() if t not in mirrored_table_names} - debug(f"coalesce_pipelines to: {pipeline_file_name}") - debug(f"mirrored_table_names: {mirrored_table_names}") - debug(f"omnibus_keys: {omnibus_keys}") + debug(state, f"coalesce_pipelines to: {pipeline_file_name}") + debug(state, f"mirrored_table_names: {mirrored_table_names}") + debug(state, f"omnibus_keys: {omnibus_keys}") # assemble lists of omnibus tables from all sub_processes omnibus_tables = {table_name: [] for table_name in omnibus_keys} for process_name in sub_proc_names: - pipeline_path = config.build_output_file_path( - pipeline_file_name, use_prefix=process_name + pipeline_path = state.get_output_file_path( + pipeline_file_name, prefix=process_name ) logger.info(f"coalesce pipeline {pipeline_path}") - with pd.HDFStore(pipeline_path, mode="r") as pipeline_store: + if state.settings.checkpoint_format == "hdf": + with pd.HDFStore(str(pipeline_path), mode="r") as pipeline_store: + for table_name, hdf5_key in omnibus_keys.items(): + omnibus_tables[table_name].append(pipeline_store[hdf5_key]) + else: + pqstore = ParquetStore(pipeline_path, mode="r") for table_name, hdf5_key in omnibus_keys.items(): - omnibus_tables[table_name].append(pipeline_store[hdf5_key]) + omnibus_tables[table_name].append(pqstore.get_dataframe(table_name)) # open pipeline, preserving existing checkpoints (so resume_after will work for prior steps) - pipeline.open_pipeline("_") + state.checkpoint.restore(resume_after="_") # - add mirrored tables to pipeline for table_name in mirrored_tables: df = mirrored_tables[table_name] - info(f"adding mirrored table {table_name} {df.shape}") - pipeline.replace_table(table_name, df) + info(state, f"adding mirrored table {table_name} {df.shape}") + state.add_table(table_name, df) # - concatenate omnibus tables and add them to pipeline for table_name in 
omnibus_tables: df = pd.concat(omnibus_tables[table_name], sort=False) - info(f"adding omnibus table {table_name} {df.shape}") - pipeline.replace_table(table_name, df) + info(state, f"adding omnibus table {table_name} {df.shape}") + state.add_table(table_name, df) - pipeline.add_checkpoint(checkpoint_name) + state.checkpoint.add(checkpoint_name) - pipeline.close_pipeline() + state.checkpoint.close_store() -def setup_injectables_and_logging(injectables, locutor=True): +def setup_injectables_and_logging(injectables, locutor: bool = True) -> workflow.State: """ Setup injectables (passed by parent process) within sub process @@ -728,22 +884,25 @@ def setup_injectables_and_logging(injectables, locutor=True): ------- injects injectables """ + state = workflow.State() + state = state.initialize_filesystem(**injectables) + state.settings = injectables.get("settings", Settings()) + # state.settings = Settings.parse_obj(injectables.get("settings_package", {})) # register abm steps and other abm-specific injectables # by default, assume we are running activitysim.abm # other callers (e.g. piopulationsim) will have to arrange to register their own steps and injectables # (presumably) in a custom run_simulation.py instead of using the 'activitysim run' command - if not inject.is_injectable("preload_injectables"): + if not "preload_injectables" in state: # register abm steps and other abm-specific injectables from activitysim import abm # noqa: F401 try: - for k, v in injectables.items(): - inject.add_injectable(k, v) + state.add_injectable(k, v) # re-import extension modules to register injectables - ext = inject.get_injectable("imported_extensions", default=()) + ext = state.get_injectable("imported_extensions", default=()) for e in ext: basepath, extpath = os.path.split(e) if not basepath: @@ -757,30 +916,34 @@ def setup_injectables_and_logging(injectables, locutor=True): finally: del sys.path[0] - inject.add_injectable("is_sub_task", True) - inject.add_injectable("locutor", locutor) + state.add_injectable("is_sub_task", True) + state.add_injectable("locutor", locutor) - config.filter_warnings() + config.filter_warnings(state) process_name = multiprocessing.current_process().name - inject.add_injectable("log_file_prefix", process_name) + state.add_injectable("log_file_prefix", process_name) except Exception as e: exception( + state, f"{type(e).__name__} exception while setting up injectables: {str(e)}", write_to_log_file=False, ) raise e try: - tracing.config_logger() + state.logging.config_logger() except Exception as e: - exception(f"{type(e).__name__} exception while configuring logger: {str(e)}") + exception( + state, f"{type(e).__name__} exception while configuring logger: {str(e)}" + ) raise e + return state -def adjust_chunk_size_for_shared_memory(chunk_size, data_buffers, num_processes): +def adjust_chunk_size_for_shared_memory(chunk_size, data_buffers, num_processes): # even if there is only one subprocess, # we are separate from parent who allocated the shared memory # so we still need to compensate for it @@ -815,7 +978,9 @@ def adjust_chunk_size_for_shared_memory(chunk_size, data_buffers, num_processes) return adjusted_chunk_size -def run_simulation(queue, step_info, resume_after, shared_data_buffer): +def run_simulation( + state: workflow.State, queue, step_info, resume_after, shared_data_buffer +): """ run step models as subtask @@ -844,53 +1009,54 @@ def run_simulation(queue, step_info, resume_after, shared_data_buffer): chunk_size, shared_data_buffer, num_processes ) - 
inject.add_injectable("data_buffers", shared_data_buffer) - inject.add_injectable("chunk_size", chunk_size) - inject.add_injectable("num_processes", num_processes) + state.add_injectable("data_buffers", shared_data_buffer) + state.add_injectable("chunk_size", chunk_size) + state.add_injectable("num_processes", num_processes) if resume_after: - info(f"resume_after {resume_after}") + info(state, f"resume_after {resume_after}") # if they specified a resume_after model, check to make sure it is checkpointed if ( resume_after != LAST_CHECKPOINT and resume_after - not in pipeline.get_checkpoints()[pipeline.CHECKPOINT_NAME].values + not in state.checkpoint.get_inventory()[CHECKPOINT_NAME].values ): # if not checkpointed, then fall back to last checkpoint - info(f"resume_after checkpoint '{resume_after}' not in pipeline.") + info(state, f"resume_after checkpoint '{resume_after}' not in pipeline.") resume_after = LAST_CHECKPOINT - pipeline.open_pipeline(resume_after) - last_checkpoint = pipeline.last_checkpoint() + state.checkpoint.restore(resume_after) + last_checkpoint = state.checkpoint.last_checkpoint.get(CHECKPOINT_NAME) if last_checkpoint in models: - info(f"Resuming model run list after {last_checkpoint}") + info(state, f"Resuming model run list after {last_checkpoint}") models = models[models.index(last_checkpoint) + 1 :] - assert inject.get_injectable("preload_injectables", None) + assert state.get_injectable("preload_injectables") t0 = tracing.print_elapsed_time() for model in models: - t1 = tracing.print_elapsed_time() try: - pipeline.run_model(model) + state.run.by_name(model) except Exception as e: - warning(f"{type(e).__name__} exception running {model} model: {str(e)}") + warning( + state, f"{type(e).__name__} exception running {model} model: {str(e)}" + ) raise e - tracing.log_runtime(model_name=model, start_time=t1) + state.run.log_runtime(model_name=model, start_time=t1) queue.put({"model": model, "time": time.time() - t1}) tracing.print_elapsed_time("run (%s models)" % len(models), t0) # add checkpoint with final tables even if not intermediate checkpointing checkpoint_name = step_info["name"] - pipeline.add_checkpoint(checkpoint_name) + state.checkpoint.add(checkpoint_name) - pipeline.close_pipeline() + state.checkpoint.close_store() """ @@ -898,7 +1064,9 @@ def run_simulation(queue, step_info, resume_after, shared_data_buffer): """ -def mp_run_simulation(locutor, queue, injectables, step_info, resume_after, **kwargs): +def mp_run_simulation( + locutor: bool, queue, injectables, step_info, resume_after, **kwargs +): """ mp entry point for run_simulation @@ -913,26 +1081,28 @@ def mp_run_simulation(locutor, queue, injectables, step_info, resume_after, **kw shared_data_buffers passed as kwargs to avoid picking dict """ - setup_injectables_and_logging(injectables, locutor=locutor) + state = setup_injectables_and_logging(injectables, locutor=locutor) debug( - f"mp_run_simulation {step_info['name']} locutor={inject.get_injectable('locutor', False)} " + state, + f"mp_run_simulation {step_info['name']} locutor={state.get_injectable('locutor', False)} ", ) try: - if step_info["num_processes"] > 1: pipeline_prefix = multiprocessing.current_process().name logger.debug(f"injecting pipeline_file_prefix '{pipeline_prefix}'") - inject.add_injectable("pipeline_file_prefix", pipeline_prefix) + state.add_injectable("pipeline_file_prefix", pipeline_prefix) shared_data_buffer = kwargs - run_simulation(queue, step_info, resume_after, shared_data_buffer) + run_simulation(state, queue, step_info, 
resume_after, shared_data_buffer) mem.log_global_hwm() # subprocess except Exception as e: - exception(f"{type(e).__name__} exception caught in mp_run_simulation: {str(e)}") + exception( + state, f"{type(e).__name__} exception caught in mp_run_simulation: {str(e)}" + ) raise e @@ -944,19 +1114,20 @@ def mp_apportion_pipeline(injectables, sub_proc_names, step_info): ---------- injectables : dict injectables from parent - sub_proc_names : list of str + sub_proc_names : list[str] names of the sub processes to apportion step_info : dict step_info for multiprocess_step we are apportioning """ - setup_injectables_and_logging(injectables) + state = setup_injectables_and_logging(injectables) try: - apportion_pipeline(sub_proc_names, step_info) + apportion_pipeline(state, sub_proc_names, step_info) except Exception as e: exception( - f"{type(e).__name__} exception caught in mp_apportion_pipeline: {str(e)}" + state, + f"{type(e).__name__} exception caught in mp_apportion_pipeline: {str(e)}", ) raise e @@ -976,20 +1147,22 @@ def mp_setup_skims(injectables, **kwargs): shared_data_buffers passed as kwargs to avoid picking dict """ - setup_injectables_and_logging(injectables) + state = setup_injectables_and_logging(injectables) - info("mp_setup_skims") + info(state, "mp_setup_skims") try: shared_data_buffer = kwargs - network_los_preload = inject.get_injectable("network_los_preload", None) + network_los_preload = state.get_injectable("network_los_preload", None) if network_los_preload is not None: network_los_preload.load_shared_data(shared_data_buffer) except Exception as e: - exception(f"{type(e).__name__} exception caught in mp_setup_skims: {str(e)}") + exception( + state, f"{type(e).__name__} exception caught in mp_setup_skims: {str(e)}" + ) raise e @@ -1001,19 +1174,20 @@ def mp_coalesce_pipelines(injectables, sub_proc_names, slice_info): ---------- injectables : dict injectables from parent - sub_proc_names : list of str + sub_proc_names : list[str] names of the sub processes to apportion slice_info : dict slice_info from multiprocess_steps """ - setup_injectables_and_logging(injectables) + state = setup_injectables_and_logging(injectables) try: - coalesce_pipelines(sub_proc_names, slice_info) + coalesce_pipelines(state, sub_proc_names, slice_info) except Exception as e: exception( - f"{type(e).__name__} exception caught in coalesce_pipelines: {str(e)}" + state, + f"{type(e).__name__} exception caught in coalesce_pipelines: {str(e)}", ) raise e @@ -1023,7 +1197,7 @@ def mp_coalesce_pipelines(injectables, sub_proc_names, slice_info): """ -def allocate_shared_skim_buffers(): +def allocate_shared_skim_buffers(state: workflow.State): """ This is called by the main process to allocate shared memory buffer to share with subprocs @@ -1035,9 +1209,9 @@ def allocate_shared_skim_buffers(): """ - info("allocate_shared_skim_buffer") + info(state, "allocate_shared_skim_buffer") - network_los = inject.get_injectable("network_los_preload", None) + network_los = state.get_injectable("network_los_preload", None) if network_los is not None: skim_buffers = network_los.allocate_shared_skim_buffers() else: @@ -1046,7 +1220,7 @@ def allocate_shared_skim_buffers(): return skim_buffers -def allocate_shared_shadow_pricing_buffers(): +def allocate_shared_shadow_pricing_buffers(state: workflow.State): """ This is called by the main process to allocate memory buffer to share with subprocs @@ -1055,9 +1229,9 @@ def allocate_shared_shadow_pricing_buffers(): multiprocessing.RawArray """ - 
info("allocate_shared_shadow_pricing_buffers") + info(state, "allocate_shared_shadow_pricing_buffers") - shadow_pricing_info = inject.get_injectable("shadow_pricing_info", None) + shadow_pricing_info = state.get_injectable("shadow_pricing_info", None) if shadow_pricing_info is not None: from activitysim.abm.tables import shadow_pricing @@ -1071,7 +1245,7 @@ def allocate_shared_shadow_pricing_buffers(): return shadow_pricing_buffers -def allocate_shared_shadow_pricing_buffers_choice(): +def allocate_shared_shadow_pricing_buffers_choice(state): """ This is called by the main process to allocate memory buffer to share with subprocs @@ -1080,9 +1254,9 @@ def allocate_shared_shadow_pricing_buffers_choice(): multiprocessing.RawArray """ - info("allocate_shared_shadow_pricing_buffers_choice") + info(state, "allocate_shared_shadow_pricing_buffers_choice") - shadow_pricing_choice_info = inject.get_injectable( + shadow_pricing_choice_info = state.get_injectable( "shadow_pricing_choice_info", None ) @@ -1090,7 +1264,9 @@ def allocate_shared_shadow_pricing_buffers_choice(): from activitysim.abm.tables import shadow_pricing shadow_pricing_buffers_choice = ( - shadow_pricing.buffers_for_shadow_pricing_choice(shadow_pricing_choice_info) + shadow_pricing.buffers_for_shadow_pricing_choice( + state, shadow_pricing_choice_info + ) ) else: shadow_pricing_buffers_choice = {} @@ -1099,6 +1275,7 @@ def allocate_shared_shadow_pricing_buffers_choice(): def run_sub_simulations( + state: workflow.State, injectables, shared_data_buffers, step_info, @@ -1138,7 +1315,7 @@ def run_sub_simulations( Returns ------- - completed : list of str + completed : list[str] names of sub_processes that completed successfully """ @@ -1149,11 +1326,12 @@ def log_queued_messages(): msg = queue.get(block=False) model_name = msg["model"] info( - f"{process.name} {model_name} : {tracing.format_elapsed_time(msg['time'])}" + state, + f"{process.name} {model_name} : {tracing.format_elapsed_time(msg['time'])}", ) - mem.trace_memory_info(f"{process.name}.{model_name}.completed") + state.trace_memory_info(f"{process.name}.{model_name}.completed") - def check_proc_status(): + def check_proc_status(state: workflow.State): # we want to drop 'completed' breadcrumb when it happens, lest we terminate # if fail_fast flag is set raise for p in procs: @@ -1162,31 +1340,41 @@ def check_proc_status(): elif p.exitcode == 0: # completed successfully if p.name not in completed: - info(f"process {p.name} completed") + info(state, f"process {p.name} completed") completed.add(p.name) - drop_breadcrumb(step_name, "completed", list(completed)) - mem.trace_memory_info(f"{p.name}.completed") + drop_breadcrumb(state, step_name, "completed", list(completed)) + state.trace_memory_info(f"{p.name}.completed") else: # process failed if p.name not in failed: - warning(f"process {p.name} failed with exitcode {p.exitcode}") + warning( + state, f"process {p.name} failed with exitcode {p.exitcode}" + ) failed.add(p.name) - mem.trace_memory_info(f"{p.name}.failed") + state.trace_memory_info(f"{p.name}.failed") if fail_fast: - warning(f"fail_fast terminating remaining running processes") + warning( + state, f"fail_fast terminating remaining running processes" + ) for op in procs: if op.exitcode is None: try: - info(f"terminating process {op.name}") + info(state, f"terminating process {op.name}") op.terminate() except Exception as e: - info(f"error terminating process {op.name}: {e}") + info( + state, + f"error terminating process {op.name}: {e}", + ) raise RuntimeError("Process 
%s failed" % (p.name,)) step_name = step_info["name"] t0 = tracing.print_elapsed_time() - info(f"run_sub_simulations step {step_name} models resume_after {resume_after}") + info( + state, + f"run_sub_simulations step {step_name} models resume_after {resume_after}", + ) # if resuming and some processes completed successfully in previous run if previously_completed: @@ -1200,7 +1388,8 @@ def check_proc_status(): name for name in process_names if name not in previously_completed ] info( - f"step {step_name}: skipping {len(previously_completed)} previously completed subprocedures" + state, + f"step {step_name}: skipping {len(previously_completed)} previously completed subprocedures", ) else: # if we are resuming after a specific model, then force all subprocesses to run @@ -1217,7 +1406,7 @@ def check_proc_status(): completed = set(previously_completed) failed = set([]) # so we can log process failure first time it happens - drop_breadcrumb(step_name, "completed", list(completed)) + drop_breadcrumb(state, step_name, "completed", list(completed)) for i, process_name in enumerate(process_names): q = multiprocessing.Queue() @@ -1231,11 +1420,11 @@ def check_proc_status(): resume_after=resume_after, ) - # debug(f"create_process {process_name} target={mp_run_simulation}") + # debug(state, f"create_process {process_name} target={mp_run_simulation}") # for k in args: - # debug(f"create_process {process_name} arg {k}={args[k]}") + # debug(state, f"create_process {process_name} arg {k}={args[k]}") # for k in shared_data_buffers: - # debug(f"create_process {process_name} shared_data_buffers {k}={shared_data_buffers[k]}") + # debug(state, f"create_process {process_name} shared_data_buffers {k}={shared_data_buffers[k]}") p = multiprocessing.Process( target=mp_run_simulation, @@ -1255,7 +1444,7 @@ def check_proc_status(): # - start processes for i, p in zip(list(range(num_simulations)), procs): - info(f"start process {p.name}") + info(state, f"start process {p.name}") p.start() """ @@ -1265,8 +1454,8 @@ def check_proc_status(): OSError: [WinError 1450] Insufficient system resources exist to complete the requested service. Judging by the commented-out assert, this (or a related) issue may have been around in some form for a while. 
- def __setstate__(self, state): - self.size, self.name = self._state = state + def __setstate__(self, state_): + self.size, self.name = self._state = state_ # Reopen existing mmap self.buffer = mmap.mmap(-1, self.size, tagname=self.name) # XXX Temporarily preventing buildbot failures while determining @@ -1276,32 +1465,32 @@ def __setstate__(self, state): if sys.platform == "win32": time.sleep(1) - mem.trace_memory_info(f"{p.name}.start") + state.trace_memory_info(f"{p.name}.start") while multiprocessing.active_children(): # log queued messages as they are received log_queued_messages() # monitor sub process status and drop breadcrumbs or fail_fast as they terminate - check_proc_status() + check_proc_status(state) # monitor memory usage - mem.trace_memory_info( + state.trace_memory_info( "run_sub_simulations.idle", trace_ticks=mem.MEM_PARENT_TRACE_TICK_LEN ) time.sleep(1) # clean up any messages or breadcrumbs that occurred while we slept log_queued_messages() - check_proc_status() + check_proc_status(state) # no need to join() explicitly since multiprocessing.active_children joins completed procs for p in procs: assert p.exitcode is not None if p.exitcode: - error(f"Process %s failed with exitcode {p.exitcode}") + error(state, f"Process %s failed with exitcode {p.exitcode}") assert p.name in failed else: - info(f"Process {p.name} completed with exitcode {p.exitcode}") + info(state, f"Process {p.name} completed with exitcode {p.exitcode}") assert p.name in completed t0 = tracing.print_elapsed_time("run_sub_simulations step %s" % step_name, t0) @@ -1309,7 +1498,7 @@ def __setstate__(self, state): return list(completed) -def run_sub_task(p): +def run_sub_task(state: workflow.State, p): """ Run process p synchroneously, @@ -1319,15 +1508,15 @@ def run_sub_task(p): ---------- p : multiprocessing.Process """ - info(f"#run_model running sub_process {p.name}") + info(state, f"#run_model running sub_process {p.name}") - mem.trace_memory_info(f"{p.name}.start") + state.trace_memory_info(f"{p.name}.start") t0 = tracing.print_elapsed_time() p.start() while multiprocessing.active_children(): - mem.trace_memory_info( + state.trace_memory_info( "run_sub_simulations.idle", trace_ticks=mem.MEM_PARENT_TRACE_TICK_LEN ) time.sleep(1) @@ -1336,16 +1525,16 @@ def run_sub_task(p): # p.join() t0 = tracing.print_elapsed_time("#run_model sub_process %s" % p.name, t0) - # info(f'{p.name}.exitcode = {p.exitcode}') + # info(state, f'{p.name}.exitcode = {p.exitcode}') - mem.trace_memory_info(f"run_model {p.name} completed") + state.trace_memory_info(f"run_model {p.name} completed") if p.exitcode: - error(f"Process {p.name} returned exitcode {p.exitcode}") + error(state, f"Process {p.name} returned exitcode {p.exitcode}") raise RuntimeError("Process %s returned exitcode %s" % (p.name, p.exitcode)) -def drop_breadcrumb(step_name, crumb, value=True): +def drop_breadcrumb(state: workflow.State, step_name, crumb, value=True): """ Add (crumb: value) to specified step in breadcrumbs and flush breadcrumbs to file run can be resumed with resume_after @@ -1366,13 +1555,13 @@ def drop_breadcrumb(step_name, crumb, value=True): ------- """ - breadcrumbs = inject.get_injectable("breadcrumbs", OrderedDict()) + breadcrumbs = state.get_injectable("breadcrumbs", OrderedDict()) breadcrumbs.setdefault(step_name, {"name": step_name})[crumb] = value - inject.add_injectable("breadcrumbs", breadcrumbs) - write_breadcrumbs(breadcrumbs) + state.add_injectable("breadcrumbs", breadcrumbs) + write_breadcrumbs(state, breadcrumbs) -def 
run_multiprocess(injectables): +def run_multiprocess(state: workflow.State, injectables): """ run the steps in run_list, possibly resuming after checkpoint specified by resume_after @@ -1402,9 +1591,9 @@ def run_multiprocess(injectables): dict of values to inject in sub-processes """ - mem.trace_memory_info("run_multiprocess.start") + state.trace_memory_info("run_multiprocess.start") - run_list = get_run_list() + run_list = get_run_list(state) if not run_list["multiprocess"]: raise RuntimeError( @@ -1415,77 +1604,77 @@ def run_multiprocess(injectables): old_breadcrumbs = run_list.get("breadcrumbs", {}) # raise error if any sub-process fails without waiting for others to complete - fail_fast = setting("fail_fast") - info(f"run_multiprocess fail_fast: {fail_fast}") + fail_fast = state.settings.fail_fast + info(state, f"run_multiprocess fail_fast: {fail_fast}") def skip_phase(phase): skip = old_breadcrumbs and old_breadcrumbs.get(step_name, {}).get(phase, False) if skip: - info(f"Skipping {step_name} {phase}") + info(state, f"Skipping {step_name} {phase}") return skip def find_breadcrumb(crumb, default=None): return old_breadcrumbs.get(step_name, {}).get(crumb, default) - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = state.settings.sharrow # - allocate shared data shared_data_buffers = {} - mem.trace_memory_info("allocate_shared_skim_buffer.before") + state.trace_memory_info("allocate_shared_skim_buffer.before") t0 = tracing.print_elapsed_time() if not sharrow_enabled: - shared_data_buffers.update(allocate_shared_skim_buffers()) + shared_data_buffers.update(allocate_shared_skim_buffers(state)) t0 = tracing.print_elapsed_time("allocate shared skim buffer", t0) - mem.trace_memory_info("allocate_shared_skim_buffer.completed") + state.trace_memory_info("allocate_shared_skim_buffer.completed") # combine shared_skim_buffer and shared_shadow_pricing_buffer in shared_data_buffer t0 = tracing.print_elapsed_time() - shared_data_buffers.update(allocate_shared_shadow_pricing_buffers()) + shared_data_buffers.update(allocate_shared_shadow_pricing_buffers(state)) t0 = tracing.print_elapsed_time("allocate shared shadow_pricing buffer", t0) - mem.trace_memory_info("allocate_shared_shadow_pricing_buffers.completed") + state.trace_memory_info("allocate_shared_shadow_pricing_buffers.completed") # combine shared_shadow_pricing_buffers to pool choices across all processes t0 = tracing.print_elapsed_time() - shared_data_buffers.update(allocate_shared_shadow_pricing_buffers_choice()) + shared_data_buffers.update(allocate_shared_shadow_pricing_buffers_choice(state)) t0 = tracing.print_elapsed_time("allocate shared shadow_pricing choice buffer", t0) - mem.trace_memory_info("allocate_shared_shadow_pricing_buffers_choice.completed") + state.trace_memory_info("allocate_shared_shadow_pricing_buffers_choice.completed") + start_time = time.time() if sharrow_enabled: - start_time = time.time() shared_data_buffers["skim_dataset"] = "sh.Dataset:skim_dataset" # Loading skim_dataset must be done in the main process, not a subprocess, # so that this min process can hold on to the shared memory and then cleanly # release it on exit. - from . import flow # make injectable known # noqa: F401 + from . 
import flow, skim_dataset # make injectables known # noqa: F401 - inject.get_injectable("skim_dataset") + state.get_injectable("skim_dataset") tracing.print_elapsed_time("setup skim_dataset", t0) - mem.trace_memory_info("skim_dataset.completed") + state.trace_memory_info("skim_dataset.completed") # - mp_setup_skims else: # not sharrow_enabled if len(shared_data_buffers) > 0: start_time = time.time() run_sub_task( + state, multiprocessing.Process( target=mp_setup_skims, name="mp_setup_skims", args=(injectables,), kwargs=shared_data_buffers, - ) + ), ) tracing.print_elapsed_time("setup shared_data_buffers", t0) - mem.trace_memory_info("mp_setup_skims.completed") - tracing.log_runtime("mp_setup_skims", start_time=start_time, force=True) + state.trace_memory_info("mp_setup_skims.completed") + state.run.log_runtime("mp_setup_skims", start_time=start_time, force=True) # - for each step in run list for step_info in run_list["multiprocess_steps"]: - step_name = step_info["name"] num_processes = step_info["num_processes"] @@ -1500,16 +1689,17 @@ def find_breadcrumb(crumb, default=None): if not skip_phase("apportion") and num_processes > 1: start_time = time.time() run_sub_task( + state, multiprocessing.Process( target=mp_apportion_pipeline, name="%s_apportion" % step_name, args=(injectables, sub_proc_names, step_info), - ) + ), ) - tracing.log_runtime( + state.run.log_runtime( "%s_apportion" % step_name, start_time=start_time, force=True ) - drop_breadcrumb(step_name, "apportion") + drop_breadcrumb(state, step_name, "apportion") # - run_sub_simulations if not skip_phase("simulate"): @@ -1518,6 +1708,7 @@ def find_breadcrumb(crumb, default=None): previously_completed = find_breadcrumb("completed", default=[]) completed = run_sub_simulations( + state, injectables, shared_data_buffers, step_info, @@ -1532,33 +1723,34 @@ def find_breadcrumb(crumb, default=None): "%s processes failed in step %s" % (num_processes - len(completed), step_name) ) - drop_breadcrumb(step_name, "simulate") + drop_breadcrumb(state, step_name, "simulate") # - mp_coalesce_pipelines if not skip_phase("coalesce") and num_processes > 1: start_time = time.time() run_sub_task( + state, multiprocessing.Process( target=mp_coalesce_pipelines, name="%s_coalesce" % step_name, args=(injectables, sub_proc_names, slice_info), - ) + ), ) - tracing.log_runtime( + state.run.log_runtime( "%s_coalesce" % step_name, start_time=start_time, force=True ) - drop_breadcrumb(step_name, "coalesce") + drop_breadcrumb(state, step_name, "coalesce") # add checkpoint with final tables even if not intermediate checkpointing - if not pipeline.intermediate_checkpoint(): - pipeline.open_pipeline("_") - pipeline.add_checkpoint(pipeline.FINAL_CHECKPOINT_NAME) - pipeline.close_pipeline() + if not state.should_save_checkpoint(): + state.checkpoint.restore(resume_after="_") + state.checkpoint.add(FINAL_CHECKPOINT_NAME) + state.checkpoint.close_store() mem.log_global_hwm() # main process -def get_breadcrumbs(run_list): +def get_breadcrumbs(state: workflow.State, run_list): """ Read, validate, and annotate breadcrumb file from previous run @@ -1590,16 +1782,15 @@ def get_breadcrumbs(run_list): assert resume_after is not None # - read breadcrumbs file from previous run - breadcrumbs = read_breadcrumbs() + breadcrumbs = read_breadcrumbs(state) # - can't resume multiprocess without breadcrumbs file if not breadcrumbs: - error(f"empty breadcrumbs for resume_after '{resume_after}'") + error(state, f"empty breadcrumbs for resume_after '{resume_after}'") raise 
RuntimeError("empty breadcrumbs for resume_after '%s'" % resume_after) # if resume_after is specified by name if resume_after != LAST_CHECKPOINT: - # breadcrumbs for steps from previous run previous_steps = list(breadcrumbs.keys()) @@ -1616,7 +1807,7 @@ def get_breadcrumbs(run_list): resume_step_name = resume_step["name"] if resume_step_name not in previous_steps: - error(f"resume_after model '{resume_after}' not in breadcrumbs") + error(state, f"resume_after model '{resume_after}' not in breadcrumbs") raise RuntimeError( "resume_after model '%s' not in breadcrumbs" % resume_after ) @@ -1642,7 +1833,7 @@ def get_breadcrumbs(run_list): return breadcrumbs -def get_run_list(): +def get_run_list(state: workflow.State): """ validate and annotate run_list from settings @@ -1691,24 +1882,26 @@ def get_run_list(): validated and annotated run_list """ - models = setting("models", []) - multiprocess_steps = setting("multiprocess_steps", []) + models = state.settings.models + multiprocess_steps = state.settings.multiprocess_steps + if multiprocess_steps is not None: + multiprocess_steps = [i.dict() for i in multiprocess_steps] - resume_after = inject.get_injectable("resume_after", None) or setting( - "resume_after", None + resume_after = ( + state.get_injectable("resume_after", None) or state.settings.resume_after ) - multiprocess = inject.get_injectable("multiprocess", False) or setting( - "multiprocess", False + multiprocess = ( + state.get_injectable("multiprocess", False) or state.settings.multiprocess ) # default settings that can be overridden by settings in individual steps - global_chunk_size = setting("chunk_size", 0) or 0 - default_mp_processes = setting("num_processes", 0) or int( + global_chunk_size = state.settings.chunk_size + default_mp_processes = state.settings.num_processes or int( 1 + multiprocessing.cpu_count() / 2.0 ) if multiprocess and multiprocessing.cpu_count() == 1: - warning("Can't multiprocess because there is only 1 cpu") + warning(state, "Can't multiprocess because there is only 1 cpu") run_list = { "models": models, @@ -1725,7 +1918,6 @@ def get_run_list(): ) if multiprocess: - if not multiprocess_steps: raise RuntimeError( "multiprocess setting is %s but no multiprocess_steps setting" @@ -1758,7 +1950,7 @@ def get_run_list(): step_names.add(name) # - validate num_processes and assign default - num_processes = step.get("num_processes", 0) + num_processes = step.get("num_processes", 0) or 0 if not isinstance(num_processes, int) or num_processes < 0: raise RuntimeError( @@ -1766,14 +1958,18 @@ def get_run_list(): " in multiprocess_steps" % (num_processes, name) ) - if "slice" in step: + if "slice" in step and step["slice"] is not None: if num_processes == 0: - info(f"Setting num_processes = {num_processes} for step {name}") + info( + state, + f"Setting num_processes = {num_processes} for step {name}", + ) num_processes = default_mp_processes if num_processes > multiprocessing.cpu_count(): warning( + state, f"num_processes setting ({num_processes}) " - f"greater than cpu count ({ multiprocessing.cpu_count()})" + f"greater than cpu count ({ multiprocessing.cpu_count()})", ) else: if num_processes == 0: @@ -1864,7 +2060,7 @@ def get_run_list(): # - add resume breadcrumbs if resume_after: try: - breadcrumbs = get_breadcrumbs(run_list) + breadcrumbs = get_breadcrumbs(state, run_list) except IOError: # file does not exist, no resume_after is possible breadcrumbs = None resume_after = None @@ -1883,7 +2079,7 @@ def get_run_list(): # - write run list to output dir # use 
log_file_path so we use (optional) log subdir and prefix process name - with config.open_log_file("run_list.txt", "w") as f: + with state.filesystem.open_log_file("run_list.txt", "w") as f: print_run_list(run_list, f) return run_list @@ -1938,12 +2134,7 @@ def print_run_list(run_list, output_file=None): print(" ", v, file=output_file) -def breadcrumbs_file_path(): - # return path to breadcrumbs file in output_dir - return config.build_output_file_path("breadcrumbs.yaml") - - -def read_breadcrumbs(): +def read_breadcrumbs(state: workflow.State): """ Read breadcrumbs file from previous run @@ -1954,7 +2145,7 @@ def read_breadcrumbs(): ------- breadcrumbs : OrderedDict """ - file_path = breadcrumbs_file_path() + file_path = state.get_output_file_path("breadcrumbs.yaml") if not os.path.exists(file_path): raise IOError("Could not find saved breadcrumbs file '%s'" % file_path) with open(file_path, "r") as f: @@ -1964,7 +2155,7 @@ def read_breadcrumbs(): return breadcrumbs -def write_breadcrumbs(breadcrumbs): +def write_breadcrumbs(state: workflow.State, breadcrumbs): """ Write breadcrumbs file with execution history of multiprocess run @@ -1983,32 +2174,8 @@ def write_breadcrumbs(breadcrumbs): ---------- breadcrumbs : OrderedDict """ - with open(breadcrumbs_file_path(), "w") as f: + breadcrumbs_file_path = state.get_output_file_path("breadcrumbs.yaml") + with open(breadcrumbs_file_path, "w") as f: # write ordered dict as array breadcrumbs = [step for step in list(breadcrumbs.values())] yaml.dump(breadcrumbs, f) - - -def if_sub_task(if_is, if_isnt): - """ - select one of two values depending whether current process is primary process or subtask - - This is primarily intended for use in yaml files to select between (e.g.) logging levels - so main log file can display only warnings and errors from subtasks - - In yaml file, it can be used like this: - - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] - - - Parameters - ---------- - if_is : (any type) value to return if process is a subtask - if_isnt : (any type) value to return if process is not a subtask - - Returns - ------- - (any type) (one of parameters if_is or if_isnt) - """ - - return if_is if inject.get_injectable("is_sub_task", False) else if_isnt diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py index 1199f19c57..01635b0ed6 100644 --- a/activitysim/core/pathbuilder.py +++ b/activitysim/core/pathbuilder.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
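The pathbuilder.py hunks below repeat the refactor already applied to mp_tasks.py above: module-level inject/config lookups are replaced by an explicit workflow.State argument threaded through every call. A minimal sketch of that pattern, using a hypothetical stand-in State rather than ActivitySim's real class:

    from dataclasses import dataclass, field


    @dataclass
    class State:
        """Explicit container standing in for the old module-level injectables."""

        injectables: dict = field(default_factory=dict)

        def get_injectable(self, name, default=None):
            return self.injectables.get(name, default)

        def add_injectable(self, name, value):
            self.injectables[name] = value


    def drop_breadcrumb(state, step_name, crumb, value=True):
        # before: breadcrumbs = inject.get_injectable("breadcrumbs", OrderedDict())
        breadcrumbs = state.get_injectable("breadcrumbs", {})
        breadcrumbs.setdefault(step_name, {"name": step_name})[crumb] = value
        state.add_injectable("breadcrumbs", breadcrumbs)  # real code also writes breadcrumbs.yaml


    state = State()
    drop_breadcrumb(state, "households", "apportion")
    print(state.get_injectable("breadcrumbs"))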
+from __future__ import annotations + import logging import warnings from builtins import range @@ -10,14 +12,13 @@ from activitysim.core import ( assign, chunk, - config, expressions, - inject, logit, los, pathbuilder_cache, simulate, tracing, + workflow, ) from activitysim.core.pathbuilder_cache import memo from activitysim.core.util import reindex @@ -36,6 +37,7 @@ def compute_utilities( + state: workflow.State, network_los, model_settings, choosers, @@ -49,8 +51,7 @@ def compute_utilities( """ trace_label = tracing.extend_trace_label(trace_label, "compute_utils") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(state, trace_label) as chunk_sizer: logger.debug( f"{trace_label} Running compute_utilities with {choosers.shape[0]} choosers" ) @@ -59,7 +60,7 @@ def compute_utilities( locals_dict.update(model_constants) # we don't grok coefficients, but allow them to use constants in spec alt columns - spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + spec = state.filesystem.read_model_spec(file_name=model_settings["SPEC"]) for c in spec.columns: if c != simulate.SPEC_LABEL_NAME: spec[c] = spec[c].map(lambda s: model_constants.get(s, s)).astype(float) @@ -67,11 +68,11 @@ def compute_utilities( # - run preprocessor to annotate choosers preprocessor_settings = model_settings.get("PREPROCESSOR") if preprocessor_settings: - # don't want to alter caller's dataframe choosers = choosers.copy() expressions.assign_columns( + state, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, @@ -79,24 +80,25 @@ def compute_utilities( ) utilities = simulate.eval_utilities( + state, spec, choosers, locals_d=locals_dict, trace_all_rows=trace, trace_label=trace_label, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) return utilities -class TransitVirtualPathBuilder(object): +class TransitVirtualPathBuilder: """ Transit virtual path builder for three zone systems """ def __init__(self, network_los): - self.network_los = network_los self.uid_calculator = pathbuilder_cache.TapTapUidCalculator(network_los) @@ -112,7 +114,7 @@ def __init__(self, network_los): def trace_df(self, df, trace_label, extension): assert len(df) > 0 - tracing.trace_df( + self.network_los.state.tracing.trace_df( df, label=tracing.extend_trace_label(trace_label, extension), slicer="NONE", @@ -146,10 +148,9 @@ def units_for_recipe(self, recipe): def compute_maz_tap_utilities( self, recipe, maz_od_df, chooser_attributes, leg, mode, trace_label, trace ): - trace_label = tracing.extend_trace_label(trace_label, f"maz_tap_utils.{leg}") - with chunk.chunk_log(trace_label): + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: maz_tap_settings = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.maz_tap_settings.{mode}" ) @@ -192,11 +193,11 @@ def compute_maz_tap_utilities( for c in attribute_columns: utilities_df[c] = reindex(chooser_attributes[c], utilities_df["idx"]) - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) if self.units_for_recipe(recipe) == "utility": - utilities_df[leg] = compute_utilities( + self.network_los.state, self.network_los, maz_tap_settings, utilities_df, @@ -206,21 +207,27 @@ def compute_maz_tap_utilities( trace_column_names=["idx", maz_col, tap_col] if trace else None, ) - chunk.log_df(trace_label, "utilities_df", utilities_df) # annotated + chunk_sizer.log_df( + trace_label, "utilities_df", utilities_df + ) # annotated else: - assignment_spec = 
assign.read_assignment_spec( - file_name=config.config_file_path(maz_tap_settings["SPEC"]) + file_name=self.network_los.state.filesystem.get_config_file_path( + maz_tap_settings["SPEC"] + ) ) results, _, _ = assign.assign_variables( - assignment_spec, utilities_df, model_constants + self.network_los.state, + assignment_spec, + utilities_df, + model_constants, ) assert len(results.columns == 1) utilities_df[leg] = results - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) if trace: self.trace_df(utilities_df, trace_label, "utilities_df") @@ -233,7 +240,6 @@ def compute_maz_tap_utilities( def all_transit_paths( self, access_df, egress_df, chooser_attributes, trace_label, trace ): - trace_label = tracing.extend_trace_label(trace_label, "all_transit_paths") # deduped transit_df has one row per chooser for each boarding (btap) and alighting (atap) pair @@ -281,7 +287,7 @@ def compute_tap_tap_utilities( dataframe with 'idx' and 'omaz' columns egress_df: pandas.DataFrame dataframe with 'idx' and 'dmaz' columns - chooser_attributes: dict + chooser_attributes: pandas.DataFrame path_info trace_label: str trace: boolean @@ -295,8 +301,7 @@ def compute_tap_tap_utilities( trace_label = tracing.extend_trace_label(trace_label, "compute_tap_tap_utils") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: model_constants = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.CONSTANTS" ) @@ -309,7 +314,7 @@ def compute_tap_tap_utilities( access_df, egress_df, chooser_attributes, trace_label, trace ) # note: transit_df index is arbitrary - chunk.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) # FIXME some expressions may want to know access mode - locals_dict = path_info.copy() @@ -320,7 +325,6 @@ def compute_tap_tap_utilities( # deduplicate transit_df to unique_transit_df with memo("#TVPB compute_tap_tap_utilities deduplicate transit_df"): - attribute_segments = self.network_los.setting( "TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments" ) @@ -342,12 +346,12 @@ def compute_tap_tap_utilities( ) unique_transit_df.set_index("uid", inplace=True) - chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) + chunk_sizer.log_df(trace_label, "unique_transit_df", unique_transit_df) transit_df = transit_df[ ["idx", "btap", "atap", "uid"] ] # don't need chooser columns - chunk.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) logger.debug( f"#TVPB CACHE compute_tap_tap_utilities dedupe transit_df " @@ -361,6 +365,7 @@ def compute_tap_tap_utilities( with memo("#TVPB compute_tap_tap_utilities compute_utilities"): unique_utilities_df = compute_utilities( + self.network_los.state, self.network_los, tap_tap_settings, choosers=unique_transit_df, @@ -369,8 +374,10 @@ def compute_tap_tap_utilities( trace=trace, trace_column_names=chooser_columns if trace else None, ) - chunk.log_df(trace_label, "unique_utilities_df", unique_utilities_df) - chunk.log_df( + chunk_sizer.log_df( + trace_label, "unique_utilities_df", unique_utilities_df + ) + chunk_sizer.log_df( trace_label, "unique_transit_df", unique_transit_df ) # annotated @@ -384,15 +391,14 @@ def compute_tap_tap_utilities( how="left", ) self.trace_df(omnibus_df, trace_label, "unique_utilities_df") - chunk.log_df(trace_label, "omnibus_df", omnibus_df) + chunk_sizer.log_df(trace_label, "omnibus_df", 
omnibus_df) del omnibus_df - chunk.log_df(trace_label, "omnibus_df", None) + chunk_sizer.log_df(trace_label, "omnibus_df", None) assert num_unique_transit_rows == len(unique_utilities_df) # errcheck # redupe unique_transit_df back into transit_df with memo("#TVPB compute_tap_tap_utilities redupe transit_df"): - # idx = transit_df.index transit_df = pd.merge( transit_df, unique_utilities_df, left_on="uid", right_index=True @@ -402,7 +408,7 @@ def compute_tap_tap_utilities( # note: left merge on columns does not preserve index, # but transit_df index is arbitrary so no need to restore - chunk.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) for c in unique_utilities_df: assert ERR_CHECK and not transit_df[c].isnull().any() @@ -410,9 +416,9 @@ def compute_tap_tap_utilities( if len(unique_transit_df) > 0: # if all rows were cached, then unique_utilities_df is just a ref to cache del unique_utilities_df - chunk.log_df(trace_label, "unique_utilities_df", None) + chunk_sizer.log_df(trace_label, "unique_utilities_df", None) - chunk.log_df(trace_label, "transit_df", None) + chunk_sizer.log_df(trace_label, "transit_df", None) if trace: self.trace_df(transit_df, trace_label, "transit_df") @@ -453,14 +459,13 @@ def lookup_tap_tap_utilities( trace_label = tracing.extend_trace_label(trace_label, "lookup_tap_tap_utils") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: with memo("#TVPB CACHE lookup_tap_tap_utilities all_transit_paths"): transit_df = self.all_transit_paths( access_df, egress_df, chooser_attributes, trace_label, trace=False ) # note: transit_df index is arbitrary - chunk.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) if TRACE_COMPLEXITY: # diagnostic: log the omaz,dmaz pairs with the greatest number of virtual tap-tap paths @@ -496,7 +501,7 @@ def lookup_tap_tap_utilities( transit_df = transit_df[ ["idx", "btap", "atap"] ] # just needed chooser_columns for uid calculation - chunk.log_df(trace_label, "transit_df add uid index", transit_df) + chunk_sizer.log_df(trace_label, "transit_df add uid index", transit_df) with memo("#TVPB lookup_tap_tap_utilities reindex transit_df"): utilities = self.tap_cache.data @@ -508,7 +513,7 @@ def lookup_tap_tap_utilities( for c in self.uid_calculator.set_names: assert ERR_CHECK and not transit_df[c].isnull().any() - chunk.log_df(trace_label, "transit_df", None) + chunk_sizer.log_df(trace_label, "transit_df", None) return transit_df @@ -522,11 +527,9 @@ def compute_tap_tap_time( trace_label, trace, ): - trace_label = tracing.extend_trace_label(trace_label, "compute_tap_tap_time") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: model_constants = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.CONSTANTS" ) @@ -539,7 +542,7 @@ def compute_tap_tap_time( access_df, egress_df, chooser_attributes, trace_label, trace ) # note: transit_df index is arbitrary - chunk.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) # some expressions may want to know access mode - locals_dict = path_info.copy() @@ -547,12 +550,13 @@ def compute_tap_tap_time( locals_dict.update(model_constants) assignment_spec = assign.read_assignment_spec( - file_name=config.config_file_path(tap_tap_settings["SPEC"]) + file_name=self.network_los.state.filesystem.get_config_file_path( + 
tap_tap_settings["SPEC"] + ) ) DEDUPE = True if DEDUPE: - # assign uid for reduping max_atap = transit_df.atap.max() + 1 transit_df["uid"] = transit_df.btap * max_atap + transit_df.atap @@ -564,7 +568,7 @@ def compute_tap_tap_time( ["btap", "atap", "uid"] + chooser_attribute_columns, ] unique_transit_df.set_index("uid", inplace=True) - chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) + chunk_sizer.log_df(trace_label, "unique_transit_df", unique_transit_df) logger.debug( f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}" @@ -572,7 +576,10 @@ def compute_tap_tap_time( # assign_variables results, _, _ = assign.assign_variables( - assignment_spec, unique_transit_df, locals_dict + self.network_los.state, + assignment_spec, + unique_transit_df, + locals_dict, ) assert len(results.columns == 1) unique_transit_df["transit"] = results @@ -585,12 +592,12 @@ def compute_tap_tap_time( del transit_df["uid"] del unique_transit_df - chunk.log_df(trace_label, "transit_df", transit_df) - chunk.log_df(trace_label, "unique_transit_df", None) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "unique_transit_df", None) else: results, _, _ = assign.assign_variables( - assignment_spec, transit_df, locals_dict + self.network_los.state, assignment_spec, transit_df, locals_dict ) assert len(results.columns == 1) transit_df["transit"] = results @@ -603,7 +610,7 @@ def compute_tap_tap_time( transit_df.drop(columns=chooser_attributes.columns, inplace=True) - chunk.log_df(trace_label, "transit_df", None) + chunk_sizer.log_df(trace_label, "transit_df", None) if trace: self.trace_df(transit_df, trace_label, "transit_df") @@ -621,9 +628,7 @@ def compute_tap_tap( trace_label, trace, ): - if self.units_for_recipe(recipe) == "utility": - if not self.tap_cache.is_open: with memo("#TVPB compute_tap_tap tap_cache.open"): self.tap_cache.open() @@ -675,11 +680,9 @@ def best_paths( trace_label, trace=False, ): - trace_label = tracing.extend_trace_label(trace_label, "best_paths") - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: path_settings = self.network_los.setting( f"TVPB_SETTINGS.{recipe}.path_types.{path_type}" ) @@ -700,7 +703,7 @@ def best_paths( .merge(transit_df, on=["idx", "atap", "btap"], how="inner") ) - chunk.log_df(trace_label, "path_df", path_df) + chunk_sizer.log_df(trace_label, "path_df", path_df) # transit sets are the transit_df non-join columns transit_sets = [ @@ -770,8 +773,9 @@ def build_virtual_path( filter_targets=None, trace=False, override_choices=None, + *, + chunk_sizer, ): - trace_label = tracing.extend_trace_label(trace_label, "build_virtual_path") # Tracing is implemented as a seperate, second call that operates ONLY on filter_targets @@ -832,7 +836,7 @@ def build_virtual_path( "seq": range(len(orig)), } ) - chunk.log_df(trace_label, "maz_od_df", maz_od_df) + chunk_sizer.log_df(trace_label, "maz_od_df", maz_od_df) self.trace_maz_tap(maz_od_df, access_mode, egress_mode) # for location choice, there will be multiple alt dest rows per chooser and duplicate orig.index values @@ -861,7 +865,7 @@ def build_virtual_path( trace_label=trace_label, trace=trace, ) - chunk.log_df(trace_label, "access_df", access_df) + chunk_sizer.log_df(trace_label, "access_df", access_df) with memo("#TVPB build_virtual_path egress_df"): egress_df = self.compute_maz_tap_utilities( @@ -873,7 +877,7 @@ def build_virtual_path( trace_label=trace_label, trace=trace, ) - 
chunk.log_df(trace_label, "egress_df", egress_df) + chunk_sizer.log_df(trace_label, "egress_df", egress_df) # L200 will drop all rows if all trips are intra-tap. if np.array_equal(access_df["btap"].values, egress_df["atap"].values): @@ -893,7 +897,7 @@ def build_virtual_path( trace_label=trace_label, trace=trace, ) - chunk.log_df(trace_label, "transit_df", transit_df) + chunk_sizer.log_df(trace_label, "transit_df", transit_df) # Cannot trace if df is empty. Prob happened at L200 if len(transit_df) == 0: @@ -910,24 +914,23 @@ def build_virtual_path( trace_label, trace, ) - chunk.log_df(trace_label, "path_df", path_df) + chunk_sizer.log_df(trace_label, "path_df", path_df) # now that we have created path_df, we are done with the dataframes for the separate legs del access_df - chunk.log_df(trace_label, "access_df", None) + chunk_sizer.log_df(trace_label, "access_df", None) del egress_df - chunk.log_df(trace_label, "egress_df", None) + chunk_sizer.log_df(trace_label, "egress_df", None) del transit_df - chunk.log_df(trace_label, "transit_df", None) + chunk_sizer.log_df(trace_label, "transit_df", None) if units == "utility": - # logsums with memo("#TVPB build_virtual_path logsums"): # one row per seq with utilities in columns # path_num 0-based to aligh with logit.make_choices 0-based choice indexes path_df["path_num"] = path_df.groupby("seq").cumcount() - chunk.log_df(trace_label, "path_df", path_df) + chunk_sizer.log_df(trace_label, "path_df", path_df) utilities_df = ( path_df[["seq", "path_num", units]] @@ -946,7 +949,7 @@ def build_virtual_path( UNAVAILABLE ) # set utilities for missing paths to UNAVAILABLE - chunk.log_df(trace_label, "utilities_df", utilities_df) + chunk_sizer.log_df(trace_label, "utilities_df", utilities_df) with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. 
@@ -975,21 +978,24 @@ def build_virtual_path( np.nansum(np.exp(utilities_df.values), axis=1) == 0 ] zero_utilities_df.to_csv( - config.output_file_path("warning_utilities_df.csv"), + self.network_los.state.get_output_file_path( + "warning_utilities_df.csv" + ), index=True, ) if want_choices: - # orig index to identify appropriate random number channel to use making choices utilities_df.index = orig.index with memo("#TVPB build_virtual_path make_choices"): - probs = logit.utils_to_probs( - utilities_df, allow_zero_probs=True, trace_label=trace_label + self.network_los.state, + utilities_df, + allow_zero_probs=True, + trace_label=trace_label, ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) if trace: choices = override_choices @@ -1000,17 +1006,19 @@ def build_virtual_path( probs["choices"] = choices self.trace_df(probs, trace_label, "probs") else: - choices, rands = logit.make_choices( - probs, allow_bad_probs=True, trace_label=trace_label + self.network_los.state, + probs, + allow_bad_probs=True, + trace_label=trace_label, ) - chunk.log_df(trace_label, "rands", rands) + chunk_sizer.log_df(trace_label, "rands", rands) del rands - chunk.log_df(trace_label, "rands", None) + chunk_sizer.log_df(trace_label, "rands", None) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) # we need to get path_set, btap, atap from path_df row with same seq and path_num # drop seq join column, but keep path_num of choice to override_choices when tracing @@ -1031,19 +1039,18 @@ def build_virtual_path( logsum_df["logsum"] = logsums else: - assert len(logsums) == len(orig) logsum_df = pd.DataFrame({"logsum": logsums}, index=orig.index) - chunk.log_df(trace_label, "logsum_df", logsum_df) + chunk_sizer.log_df(trace_label, "logsum_df", logsum_df) del utilities_df - chunk.log_df(trace_label, "utilities_df", None) + chunk_sizer.log_df(trace_label, "utilities_df", None) if trace: self.trace_df(logsum_df, trace_label, "logsum_df") - chunk.log_df(trace_label, "logsum_df", logsum_df) + chunk_sizer.log_df(trace_label, "logsum_df", logsum_df) results = logsum_df else: @@ -1055,12 +1062,12 @@ def build_virtual_path( # zero-fill rows for O-D pairs where no best path exists because there was no tap-tap transit availability results = reindex(results, maz_od_df.idx).fillna(0.0) - chunk.log_df(trace_label, "results", results) + chunk_sizer.log_df(trace_label, "results", results) assert len(results) == len(orig) del path_df - chunk.log_df(trace_label, "path_df", None) + chunk_sizer.log_df(trace_label, "path_df", None) # diagnostic # maz_od_df['DIST'] = self.network_los.get_default_skim_dict().get('DIST').get(maz_od_df.omaz, maz_od_df.dmaz) @@ -1080,13 +1087,11 @@ def get_tvpb_logsum( recipe="tour_mode_choice", trace_label=None, ): - # assume they have given us a more specific name (since there may be more than one active wrapper) trace_label = trace_label or "get_tvpb_logsum" trace_label = tracing.extend_trace_label(trace_label, path_type) - with chunk.chunk_log(trace_label): - + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: logsum_df = self.build_virtual_path( recipe, path_type, @@ -1096,14 +1101,15 @@ def get_tvpb_logsum( demographic_segment, want_choices=want_choices, trace_label=trace_label, + chunk_sizer=chunk_sizer, ) - trace_hh_id = inject.get_injectable("trace_hh_id", None) + trace_hh_id = self.network_los.state.settings.trace_hh_id if (all(logsum_df["logsum"] == UNAVAILABLE)) or (len(logsum_df) == 
0): trace_hh_id = False if trace_hh_id: - filter_targets = tracing.trace_targets(orig) + filter_targets = self.network_los.state.tracing.trace_targets(orig) # choices from preceding run (because random numbers) override_choices = logsum_df["path_num"] if want_choices else None if filter_targets.any(): @@ -1119,19 +1125,19 @@ def get_tvpb_logsum( trace_label=trace_label, filter_targets=filter_targets, trace=True, + chunk_sizer=chunk_sizer, ) return logsum_df def get_tvpb_best_transit_time(self, orig, dest, tod): - # FIXME lots of pathological knowledge here as we are only called by accessibility directly from expressions trace_label = tracing.extend_trace_label("accessibility.tvpb_best_time", tod) recipe = "accessibility" path_type = "WTW" - with chunk.chunk_log(trace_label): + with chunk.chunk_log(self.network_los.state, trace_label) as chunk_sizer: result = self.build_virtual_path( recipe, path_type, @@ -1141,9 +1147,10 @@ def get_tvpb_best_transit_time(self, orig, dest, tod): demographic_segment=None, want_choices=False, trace_label=trace_label, + chunk_sizer=chunk_sizer, ) - trace_od = inject.get_injectable("trace_od", None) + trace_od = self.network_los.state.get_injectable("trace_od", None) if trace_od: filter_targets = (orig == trace_od[0]) & (dest == trace_od[1]) if filter_targets.any(): @@ -1158,6 +1165,7 @@ def get_tvpb_best_transit_time(self, orig, dest, tod): trace_label=trace_label, filter_targets=filter_targets, trace=True, + chunk_sizer=chunk_sizer, ) return result @@ -1173,7 +1181,6 @@ def wrap_logsum( trace_label=None, tag=None, ): - return TransitVirtualPathLogsumWrapper( self, orig_key, @@ -1204,7 +1211,6 @@ def __init__( trace_label, tag, ): - self.tvpb = pathbuilder assert hasattr(pathbuilder, "get_tvpb_logsum") @@ -1259,7 +1265,7 @@ def __getitem__(self, path_type): Parameters ---------- - key : hashable + path_type : hashable The key (identifier) for this skim object Returns @@ -1309,7 +1315,6 @@ def __getitem__(self, path_type): ) if (self.cache_choices) and (not all(logsum_df["logsum"] == UNAVAILABLE)): - # not tested on duplicate index because not currently needed # caching strategy does not require unique indexes but care would need to be taken to maintain alignment assert not orig.index.duplicated().any() diff --git a/activitysim/core/pathbuilder_cache.py b/activitysim/core/pathbuilder_cache.py index 5153cf1602..02f8db8dd0 100644 --- a/activitysim/core/pathbuilder_cache.py +++ b/activitysim/core/pathbuilder_cache.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations + import itertools import logging import multiprocessing @@ -11,7 +13,7 @@ import pandas as pd import psutil -from activitysim.core import config, inject, simulate, util +from activitysim.core import config, los, util logger = logging.getLogger(__name__) @@ -67,7 +69,7 @@ class TVPBCache(object): Transit virtual path builder cache for three zone systems """ - def __init__(self, network_los, uid_calculator, cache_tag): + def __init__(self, network_los: "los.Network_LOS", uid_calculator, cache_tag): # lightweight until opened @@ -83,12 +85,18 @@ def __init__(self, network_los, uid_calculator, cache_tag): @property def cache_path(self): file_type = "mmap" - return os.path.join(config.get_cache_dir(), f"{self.cache_tag}.{file_type}") + return os.path.join( + self.network_los.state.filesystem.get_cache_dir(), + f"{self.cache_tag}.{file_type}", + ) @property def csv_trace_path(self): file_type = "csv" - return os.path.join(config.get_cache_dir(), f"{self.cache_tag}.{file_type}") + return os.path.join( + self.network_los.state.filesystem.get_cache_dir(), + f"{self.cache_tag}.{file_type}", + ) def cleanup(self): """ @@ -105,7 +113,8 @@ def cleanup(self): while True: n += 1 candidate = os.path.join( - config.get_cache_dir(), f"{self.cache_tag}.{n}.mmap" + self.network_los.state.filesystem.get_cache_dir(), + f"{self.cache_tag}.{n}.mmap", ) if not os.path.isfile(candidate): self.cache_tag = f"{self.cache_tag}.{n}" @@ -307,7 +316,7 @@ def get_data_and_lock_from_buffers(self): ------- either multiprocessing.Array and lock or multiprocessing.RawArray and None according to RAWARRAY """ - data_buffers = inject.get_injectable("data_buffers", None) + data_buffers = self.network_los.state.get_injectable("data_buffers", None) assert self.cache_tag in data_buffers # internal error logger.debug(f"TVPBCache.get_data_and_lock_from_buffers") data_buffer = data_buffers[self.cache_tag] @@ -321,7 +330,7 @@ def get_data_and_lock_from_buffers(self): return data, lock -class TapTapUidCalculator(object): +class TapTapUidCalculator: """ Transit virtual path builder TAP to TAP unique ID calculator for three zone systems """ @@ -362,7 +371,11 @@ def __init__(self, network_los): spec_name = self.network_los.setting( f"TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.SPEC" ) - self.set_names = list(simulate.read_model_spec(file_name=spec_name).columns) + self.set_names = list( + self.network_los.state.filesystem.read_model_spec( + file_name=spec_name + ).columns + ) @property def fully_populated_shape(self): @@ -481,7 +494,7 @@ def each_scalar_attribute_combination(self): yield scalar_attributes - def scalar_attribute_combinations(self): + def scalar_attribute_combinations(self) -> pd.DataFrame: attribute_names = list(self.segmentation.keys()) attribute_tuples = self.attribute_combination_tuples x = [list(t) for t in attribute_tuples] diff --git a/activitysim/core/pipeline.py b/activitysim/core/pipeline.py deleted file mode 100644 index fe3ceca456..0000000000 --- a/activitysim/core/pipeline.py +++ /dev/null @@ -1,978 +0,0 @@ -# ActivitySim -# See full license in LICENSE.txt. -import datetime as dt -import logging -import os -from builtins import map, next, object - -import pandas as pd -from orca import orca - -from . 
import config, inject, mem, random, tracing, util -from .tracing import print_elapsed_time - -logger = logging.getLogger(__name__) - -# name of the checkpoint dict keys -# (which are also columns in the checkpoints dataframe stored in hte pipeline store) -TIMESTAMP = "timestamp" -CHECKPOINT_NAME = "checkpoint_name" -NON_TABLE_COLUMNS = [CHECKPOINT_NAME, TIMESTAMP] - -# name used for storing the checkpoints dataframe to the pipeline store -CHECKPOINT_TABLE_NAME = "checkpoints" - -# name of the first step/checkpoint created when the pipeline is started -INITIAL_CHECKPOINT_NAME = "init" -FINAL_CHECKPOINT_NAME = "final" - -# special value for resume_after meaning last checkpoint -LAST_CHECKPOINT = "_" - -# single character prefix for run_list model name to indicate that no checkpoint should be saved -NO_CHECKPOINT_PREFIX = "_" - - -class Pipeline(object): - def __init__(self): - self.init_state() - - def init_state(self): - - # most recent checkpoint - self.last_checkpoint = {} - - # array of checkpoint dicts - self.checkpoints = [] - - self.replaced_tables = {} - - self._rng = random.Random() - - self.open_files = {} - - self.pipeline_store = None - - self.is_open = False - - tracing.initialize_traceable_tables() - - def rng(self): - - return self._rng - - -_PIPELINE = Pipeline() - - -def is_open(): - return _PIPELINE.is_open - - -def is_readonly(): - if is_open(): - store = get_pipeline_store() - if store and store._mode == "r": - return True - return False - - -def pipeline_table_key(table_name, checkpoint_name): - if checkpoint_name: - key = f"{table_name}/{checkpoint_name}" - else: - key = f"/{table_name}" - return key - - -def close_on_exit(file, name): - assert name not in _PIPELINE.open_files - _PIPELINE.open_files[name] = file - - -def close_open_files(): - for name, file in _PIPELINE.open_files.items(): - print("Closing %s" % name) - file.close() - _PIPELINE.open_files.clear() - - -def open_pipeline_store(overwrite=False, mode="a"): - """ - Open the pipeline checkpoint store - - Parameters - ---------- - overwrite : bool - delete file before opening (unless resuming) - mode : {'a', 'w', 'r', 'r+'}, default 'a' - ``'r'`` - Read-only; no data can be modified. - ``'w'`` - Write; a new file is created (an existing file with the same - name would be deleted). - ``'a'`` - Append; an existing file is opened for reading and writing, - and if the file does not exist it is created. - ``'r+'`` - It is similar to ``'a'``, but the file must already exist. - """ - - if _PIPELINE.pipeline_store is not None: - raise RuntimeError("Pipeline store is already open!") - - pipeline_file_path = config.pipeline_file_path( - inject.get_injectable("pipeline_file_name") - ) - - if overwrite: - try: - if os.path.isfile(pipeline_file_path): - logger.debug("removing pipeline store: %s" % pipeline_file_path) - os.unlink(pipeline_file_path) - except Exception as e: - print(e) - logger.warning("Error removing %s: %s" % (pipeline_file_path, e)) - - _PIPELINE.pipeline_store = pd.HDFStore(pipeline_file_path, mode=mode) - - logger.debug(f"opened pipeline_store {pipeline_file_path}") - - -def get_pipeline_store(): - """ - Return the open pipeline hdf5 checkpoint store or return None if it not been opened - """ - return _PIPELINE.pipeline_store - - -def get_rn_generator(): - """ - Return the singleton random number object - - Returns - ------- - activitysim.random.Random - """ - return _PIPELINE.rng() - - -def read_df(table_name, checkpoint_name=None): - """ - Read a pandas dataframe from the pipeline store. 
- - We store multiple versions of all simulation tables, for every checkpoint in which they change, - so we need to know both the table_name and the checkpoint_name of hte desired table. - - The only exception is the checkpoints dataframe, which just has a table_name - - An error will be raised by HDFStore if the table is not found - - Parameters - ---------- - table_name : str - checkpoint_name : str - - Returns - ------- - df : pandas.DataFrame - the dataframe read from the store - - """ - - store = get_pipeline_store() - df = store[pipeline_table_key(table_name, checkpoint_name)] - - return df - - -def write_df(df, table_name, checkpoint_name=None): - """ - Write a pandas dataframe to the pipeline store. - - We store multiple versions of all simulation tables, for every checkpoint in which they change, - so we need to know both the table_name and the checkpoint_name to label the saved table - - The only exception is the checkpoints dataframe, which just has a table_name - - Parameters - ---------- - df : pandas.DataFrame - dataframe to store - table_name : str - also conventionally the injected table name - checkpoint_name : str - the checkpoint at which the table was created/modified - """ - - # coerce column names to str as unicode names will cause PyTables to pickle them - df.columns = df.columns.astype(str) - - store = get_pipeline_store() - - store[pipeline_table_key(table_name, checkpoint_name)] = df - - store.flush() - - -def rewrap(table_name, df=None): - """ - Add or replace an orca registered table as a unitary DataFrame-backed DataFrameWrapper table - - if df is None, then get the dataframe from orca (table_name should be registered, or - an error will be thrown) which may involve evaluating added columns, etc. - - If the orca table already exists, deregister it along with any associated columns before - re-registering it. - - The net result is that the dataframe is a registered orca DataFrameWrapper table with no - computed or added columns. - - Parameters - ---------- - table_name - df - - Returns - ------- - the underlying df of the rewrapped table - """ - - logger.debug("rewrap table %s inplace=%s" % (table_name, (df is None))) - - if orca.is_table(table_name): - - if df is None: - # logger.debug("rewrap - orca.get_table(%s)" % (table_name,)) - t = orca.get_table(table_name) - df = t.to_frame() - else: - # logger.debug("rewrap - orca.get_raw_table(%s)" % (table_name,)) - # don't trigger function call of TableFuncWrapper - t = orca.get_raw_table(table_name) - - t.clear_cached() - - for column_name in orca.list_columns_for_table(table_name): - # logger.debug("pop %s.%s: %s" % (table_name, column_name, t.column_type(column_name))) - # fixme - orca._COLUMNS.pop((table_name, column_name), None) - - # remove from orca's table list - orca._TABLES.pop(table_name, None) - - assert df is not None - - orca.add_table(table_name, df) - - return df - - -def add_checkpoint(checkpoint_name): - """ - Create a new checkpoint with specified name, write all data required to restore the simulation - to its current state. - - Detect any changed tables , re-wrap them and write the current version to the pipeline store. - Write the current state of the random number generator. 
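As the read_df/write_df docstrings above describe, the legacy store keeps one copy of every table per checkpoint in which it changed, keyed by table name plus checkpoint name, alongside a plain "checkpoints" history table. A rough sketch of that key scheme; the file name and table contents are illustrative, and pandas.HDFStore needs the optional PyTables dependency:

    import pandas as pd


    def pipeline_table_key(table_name, checkpoint_name):
        # same key scheme as the deleted helper above
        return f"{table_name}/{checkpoint_name}" if checkpoint_name else f"/{table_name}"


    with pd.HDFStore("pipeline.h5", mode="w") as store:  # illustrative path
        store[pipeline_table_key("households", "initialize_households")] = pd.DataFrame(
            {"hhsize": [2, 3]}
        )
        # the checkpoint history maps each table to the checkpoint where it was last written
        store[pipeline_table_key("checkpoints", None)] = pd.DataFrame(
            {
                "checkpoint_name": ["initialize_households"],
                "households": ["initialize_households"],
            }
        )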
- - Parameters - ---------- - checkpoint_name : str - """ - timestamp = dt.datetime.now() - - logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, timestamp)) - - for table_name in registered_tables(): - - # if we have not already checkpointed it or it has changed - # FIXME - this won't detect if the orca table was modified - if len(orca.list_columns_for_table(table_name)): - # rewrap the changed orca table as a unitary DataFrame-backed DataFrameWrapper table - df = rewrap(table_name) - elif ( - table_name not in _PIPELINE.last_checkpoint - or table_name in _PIPELINE.replaced_tables - ): - df = orca.get_table(table_name).to_frame() - else: - continue - - logger.debug( - "add_checkpoint '%s' table '%s' %s" - % (checkpoint_name, table_name, util.df_size(df)) - ) - write_df(df, table_name, checkpoint_name) - - # remember which checkpoint it was last written - _PIPELINE.last_checkpoint[table_name] = checkpoint_name - - _PIPELINE.replaced_tables.clear() - - _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name - _PIPELINE.last_checkpoint[TIMESTAMP] = timestamp - - # append to the array of checkpoint history - _PIPELINE.checkpoints.append(_PIPELINE.last_checkpoint.copy()) - - # create a pandas dataframe of the checkpoint history, one row per checkpoint - checkpoints = pd.DataFrame(_PIPELINE.checkpoints) - - # convert empty values to str so PyTables doesn't pickle object types - for c in checkpoints.columns: - checkpoints[c] = checkpoints[c].fillna("") - - # write it to the store, overwriting any previous version (no way to simply extend) - write_df(checkpoints, CHECKPOINT_TABLE_NAME) - - -def registered_tables(): - """ - Return a list of the names of all currently registered dataframe tables - """ - return [name for name in orca.list_tables() if orca.table_type(name) == "dataframe"] - - -def checkpointed_tables(): - """ - Return a list of the names of all checkpointed tables - """ - - return [ - name - for name, checkpoint_name in _PIPELINE.last_checkpoint.items() - if checkpoint_name and name not in NON_TABLE_COLUMNS - ] - - -def load_checkpoint(checkpoint_name): - """ - Load dataframes and restore random number channel state from pipeline hdf5 file. - This restores the pipeline state that existed at the specified checkpoint in a prior simulation. 
- This allows us to resume the simulation after the specified checkpoint - - Parameters - ---------- - checkpoint_name : str - model_name of checkpoint to load (resume_after argument to open_pipeline) - """ - - logger.info("load_checkpoint %s" % (checkpoint_name)) - - checkpoints = read_df(CHECKPOINT_TABLE_NAME) - - if checkpoint_name == LAST_CHECKPOINT: - checkpoint_name = checkpoints[CHECKPOINT_NAME].iloc[-1] - logger.info("loading checkpoint '%s'" % checkpoint_name) - - try: - # truncate rows after target checkpoint - i = checkpoints[checkpoints[CHECKPOINT_NAME] == checkpoint_name].index[0] - checkpoints = checkpoints.loc[:i] - - # if the store is not open in read-only mode, - # write it to the store to ensure so any subsequent checkpoints are forgotten - if not is_readonly(): - write_df(checkpoints, CHECKPOINT_TABLE_NAME) - - except IndexError: - msg = "Couldn't find checkpoint '%s' in checkpoints" % (checkpoint_name,) - print(checkpoints[CHECKPOINT_NAME]) - logger.error(msg) - raise RuntimeError(msg) - - # convert pandas dataframe back to array of checkpoint dicts - checkpoints = checkpoints.to_dict(orient="records") - - # drop tables with empty names - for checkpoint in checkpoints: - for key in list(checkpoint.keys()): - if key not in NON_TABLE_COLUMNS and not checkpoint[key]: - del checkpoint[key] - - # patch _CHECKPOINTS array of dicts - _PIPELINE.checkpoints = checkpoints - - # patch _CHECKPOINTS dict with latest checkpoint info - _PIPELINE.last_checkpoint.clear() - _PIPELINE.last_checkpoint.update(_PIPELINE.checkpoints[-1]) - - logger.info( - "load_checkpoint %s timestamp %s" - % (checkpoint_name, _PIPELINE.last_checkpoint["timestamp"]) - ) - - tables = checkpointed_tables() - - loaded_tables = {} - for table_name in tables: - # read dataframe from pipeline store - df = read_df(table_name, checkpoint_name=_PIPELINE.last_checkpoint[table_name]) - logger.info("load_checkpoint table %s %s" % (table_name, df.shape)) - # register it as an orca table - rewrap(table_name, df) - loaded_tables[table_name] = df - if table_name == "land_use" and "_original_zone_id" in df.columns: - # The presence of _original_zone_id indicates this table index was - # decoded to zero-based, so we need to disable offset - # processing for legacy skim access. 
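The resume logic above boils down to truncating the checkpoint history at the requested name and then reloading each table from the checkpoint recorded in that row; a small standalone illustration with made-up checkpoint and table names:

    import pandas as pd

    checkpoints = pd.DataFrame(
        {
            "checkpoint_name": ["init", "school_location", "workplace_location"],
            "households": ["init", "init", "init"],
            "persons": ["init", "school_location", "workplace_location"],
        }
    )

    resume_after = "school_location"
    i = checkpoints[checkpoints["checkpoint_name"] == resume_after].index[0]
    checkpoints = checkpoints.loc[:i]  # forget anything checkpointed later

    last = checkpoints.iloc[-1]
    tables_to_load = {t: last[t] for t in ["households", "persons"]}
    print(tables_to_load)  # {'households': 'init', 'persons': 'school_location'}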
- # TODO: this "magic" column name should be replaced with a mechanism - # to write and recover particular settings from the pipeline - # store, but we don't have that mechanism yet - config.override_setting("offset_preprocessing", True) - - # register for tracing in order that tracing.register_traceable_table wants us to register them - traceable_tables = inject.get_injectable("traceable_tables", []) - - for table_name in traceable_tables: - if table_name in loaded_tables: - tracing.register_traceable_table(table_name, loaded_tables[table_name]) - - # add tables of known rng channels - rng_channels = inject.get_injectable("rng_channels", []) - if rng_channels: - logger.debug("loading random channels %s" % rng_channels) - for table_name in rng_channels: - if table_name in loaded_tables: - logger.debug("adding channel %s" % (table_name,)) - _PIPELINE.rng().add_channel(table_name, loaded_tables[table_name]) - - -def split_arg(s, sep, default=""): - """ - split str s in two at first sep, returning empty string as second result if no sep - """ - r = s.split(sep, 2) - r = list(map(str.strip, r)) - - arg = r[0] - - if len(r) == 1: - val = default - else: - val = r[1] - val = {"true": True, "false": False}.get(val.lower(), val) - - return arg, val - - -def run_model(model_name): - """ - Run the specified model and add checkpoint for model_name - - Since we use model_name as checkpoint name, the same model may not be run more than once. - - Parameters - ---------- - model_name : str - model_name is assumed to be the name of a registered orca step - """ - - if not is_open(): - raise RuntimeError("Pipeline not initialized! Did you call open_pipeline?") - - # can't run same model more than once - if model_name in [ - checkpoint[CHECKPOINT_NAME] for checkpoint in _PIPELINE.checkpoints - ]: - raise RuntimeError("Cannot run model '%s' more than once" % model_name) - - _PIPELINE.rng().begin_step(model_name) - - # check for args - if "." 
in model_name: - step_name, arg_string = model_name.split(".", 1) - args = dict( - (k, v) - for k, v in ( - split_arg(item, "=", default=True) for item in arg_string.split(";") - ) - ) - else: - step_name = model_name - args = {} - - # check for no_checkpoint prefix - if step_name[0] == NO_CHECKPOINT_PREFIX: - step_name = step_name[1:] - checkpoint = False - else: - checkpoint = intermediate_checkpoint(model_name) - - inject.set_step_args(args) - - mem.trace_memory_info(f"pipeline.run_model {model_name} start") - - t0 = print_elapsed_time() - logger.info(f"#run_model running step {step_name}") - - instrument = config.setting("instrument", None) - if instrument is not None: - try: - from pyinstrument import Profiler - except ImportError: - instrument = False - if isinstance(instrument, (list, set, tuple)): - if step_name not in instrument: - instrument = False - else: - instrument = True - - if instrument: - with Profiler() as profiler: - orca.run([step_name]) - out_file = config.profiling_file_path(f"{step_name}.html") - with open(out_file, "wt") as f: - f.write(profiler.output_html()) - else: - orca.run([step_name]) - - t0 = print_elapsed_time( - "#run_model completed step '%s'" % model_name, t0, debug=True - ) - mem.trace_memory_info(f"pipeline.run_model {model_name} finished") - - inject.set_step_args(None) - - _PIPELINE.rng().end_step(model_name) - if checkpoint: - add_checkpoint(model_name) - else: - logger.info("##### skipping %s checkpoint for %s" % (step_name, model_name)) - - -def open_pipeline(resume_after=None, mode="a"): - """ - Start pipeline, either for a new run or, if resume_after, loading checkpoint from pipeline. - - If resume_after, then we expect the pipeline hdf5 file to exist and contain - checkpoints from a previous run, including a checkpoint with name specified in resume_after - - Parameters - ---------- - resume_after : str or None - name of checkpoint to load from pipeline store - mode : {'a', 'w', 'r', 'r+'}, default 'a' - same as for typical opening of H5Store. Ignored unless resume_after - is not None. This is here to allow read-only pipeline for benchmarking. - """ - - if is_open(): - raise RuntimeError("Pipeline is already open!") - - _PIPELINE.init_state() - _PIPELINE.is_open = True - - get_rn_generator().set_base_seed(inject.get_injectable("rng_base_seed", 0)) - - if resume_after: - # open existing pipeline - logger.debug("open_pipeline - open existing pipeline") - open_pipeline_store(overwrite=False, mode=mode) - try: - load_checkpoint(resume_after) - except KeyError as err: - if "checkpoints" in err.args[0]: - # no checkpoints initialized, fall back to restart - _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME - add_checkpoint(INITIAL_CHECKPOINT_NAME) - else: - raise - else: - # open new, empty pipeline - logger.debug("open_pipeline - new, empty pipeline") - open_pipeline_store(overwrite=True) - # - not sure why I thought we needed this? - # could have exogenous tables or prng instantiation under some circumstance?? - _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME - # empty table, in case they have turned off all checkpointing - add_checkpoint(INITIAL_CHECKPOINT_NAME) - - logger.debug("open_pipeline complete") - - -def last_checkpoint(): - """ - - Returns - ------- - last_checkpoint: str - name of last checkpoint - """ - - assert is_open(), f"Pipeline is not open." 
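The legacy run_model above also accepts per-step arguments encoded in the model name itself, in the form "step.key=value;flag". A standalone demo of that parsing; split_arg is copied from the deleted module and the model name below is made up:

    def split_arg(s, sep, default=""):
        # split s in two at the first sep, returning default if sep is absent
        r = s.split(sep, 2)
        r = list(map(str.strip, r))
        arg = r[0]
        if len(r) == 1:
            val = default
        else:
            val = r[1]
            val = {"true": True, "false": False}.get(val.lower(), val)
        return arg, val


    model_name = "write_tables.sample=0.5;compress"  # hypothetical step arguments
    step_name, arg_string = model_name.split(".", 1)
    args = dict(split_arg(item, "=", default=True) for item in arg_string.split(";"))
    print(step_name, args)  # write_tables {'sample': '0.5', 'compress': True}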
- - return _PIPELINE.last_checkpoint[CHECKPOINT_NAME] - - -def close_pipeline(): - """ - Close any known open files - """ - - assert is_open(), f"Pipeline is not open." - - close_open_files() - - _PIPELINE.pipeline_store.close() - - _PIPELINE.init_state() - - logger.debug("close_pipeline") - - -def intermediate_checkpoint(checkpoint_name=None): - - checkpoints = config.setting("checkpoints", True) - - if checkpoints is True or checkpoints is False: - return checkpoints - - assert isinstance( - checkpoints, list - ), f"setting 'checkpoints'' should be True or False or a list" - - return checkpoint_name in checkpoints - - -def run(models, resume_after=None, memory_sidecar_process=None): - """ - run the specified list of models, optionally loading checkpoint and resuming after specified - checkpoint. - - Since we use model_name as checkpoint name, the same model may not be run more than once. - - If resume_after checkpoint is specified and a model with that name appears in the models list, - then we only run the models after that point in the list. This allows the user always to pass - the same list of models, but specify a resume_after point if desired. - - Parameters - ---------- - models : [str] - list of model_names - resume_after : str or None - model_name of checkpoint to load checkpoint and AFTER WHICH to resume model run - memory_sidecar_process : MemorySidecar, optional - Subprocess that monitors memory usage - - returns: - nothing, but with pipeline open - """ - - t0 = print_elapsed_time() - - open_pipeline(resume_after) - t0 = print_elapsed_time("open_pipeline", t0) - - if resume_after == LAST_CHECKPOINT: - resume_after = _PIPELINE.last_checkpoint[CHECKPOINT_NAME] - - if resume_after: - logger.info("resume_after %s" % resume_after) - if resume_after in models: - models = models[models.index(resume_after) + 1 :] - - mem.trace_memory_info("pipeline.run before preload_injectables") - - # preload any bulky injectables (e.g. skims) not in pipeline - if inject.get_injectable("preload_injectables", None): - if memory_sidecar_process: - memory_sidecar_process.set_event("preload_injectables") - t0 = print_elapsed_time("preload_injectables", t0) - - mem.trace_memory_info("pipeline.run after preload_injectables") - - t0 = print_elapsed_time() - for model in models: - if memory_sidecar_process: - memory_sidecar_process.set_event(model) - t1 = print_elapsed_time() - run_model(model) - mem.trace_memory_info(f"pipeline.run after {model}") - - tracing.log_runtime(model_name=model, start_time=t1) - - if memory_sidecar_process: - memory_sidecar_process.set_event("finalizing") - - # add checkpoint with final tables even if not intermediate checkpointing - if not intermediate_checkpoint(): - add_checkpoint(FINAL_CHECKPOINT_NAME) - - mem.trace_memory_info("pipeline.run after run_models") - - t0 = print_elapsed_time("run_model (%s models)" % len(models), t0) - - # don't close the pipeline, as the user may want to read intermediate results from the store - - -def get_table(table_name, checkpoint_name=None): - """ - Return pandas dataframe corresponding to table_name - - if checkpoint_name is None, return the current (most recent) version of the table. - The table can be a checkpointed table or any registered orca table (e.g. 
function table) - - if checkpoint_name is specified, return table as it was at that checkpoint - (the most recently checkpointed version of the table at or before checkpoint_name) - - Parameters - ---------- - table_name : str - checkpoint_name : str or None - - Returns - ------- - df : pandas.DataFrame - """ - - assert is_open(), f"Pipeline is not open." - - # orca table not in checkpoints (e.g. a merged table) - if table_name not in _PIPELINE.last_checkpoint and orca.is_table(table_name): - if checkpoint_name is not None: - raise RuntimeError( - "get_table: checkpoint_name ('%s') not supported" - "for non-checkpointed table '%s'" % (checkpoint_name, table_name) - ) - - return orca.get_table(table_name).to_frame() - - # if they want current version of table, no need to read from pipeline store - if checkpoint_name is None: - - if table_name not in _PIPELINE.last_checkpoint: - raise RuntimeError("table '%s' never checkpointed." % table_name) - - if not _PIPELINE.last_checkpoint[table_name]: - raise RuntimeError("table '%s' was dropped." % table_name) - - # return orca.get_table(table_name).local - return orca.get_table(table_name).to_frame() - - # find the requested checkpoint - checkpoint = next( - (x for x in _PIPELINE.checkpoints if x["checkpoint_name"] == checkpoint_name), - None, - ) - if checkpoint is None: - raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name) - - # find the checkpoint that table was written to store - last_checkpoint_name = checkpoint.get(table_name, None) - - if not last_checkpoint_name: - raise RuntimeError( - "table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name) - ) - - # if this version of table is same as current - if _PIPELINE.last_checkpoint.get(table_name, None) == last_checkpoint_name: - return orca.get_table(table_name).to_frame() - - return read_df(table_name, last_checkpoint_name) - - -def get_checkpoints(): - """ - Get pandas dataframe of info about all checkpoints stored in pipeline - - pipeline doesn't have to be open - - Returns - ------- - checkpoints_df : pandas.DataFrame - - """ - - store = get_pipeline_store() - - if store is not None: - df = store[CHECKPOINT_TABLE_NAME] - else: - pipeline_file_path = config.pipeline_file_path( - orca.get_injectable("pipeline_file_name") - ) - df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME) - - # non-table columns first (column order in df is random because created from a dict) - table_names = [name for name in df.columns.values if name not in NON_TABLE_COLUMNS] - - df = df[NON_TABLE_COLUMNS + table_names] - - return df - - -def replace_table(table_name, df): - """ - Add or replace a orca table, removing any existing added orca columns - - The use case for this function is a method that calls to_frame on an orca table, modifies - it and then saves the modified. - - orca.to_frame returns a copy, so no changes are saved, and adding multiple column with - add_column adds them in an indeterminate order. - - Simply replacing an existing the table "behind the pipeline's back" by calling orca.add_table - risks pipeline to failing to detect that it has changed, and thus not checkpoint the changes. - - Parameters - ---------- - table_name : str - orca/pipeline table name - df : pandas DataFrame - """ - - assert is_open(), f"Pipeline is not open." 
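As a reading aid for get_table() above, this is a standalone sketch of the checkpoint lookup it performs when a checkpoint_name is given: find the requested checkpoint record, then return the name of the checkpoint at which the table was last written (the key under which the table sits in the pipeline store). The checkpoint records below are illustrative.

checkpoints = [
    {"checkpoint_name": "initialize_households", "households": "initialize_households", "trips": ""},
    {"checkpoint_name": "trip_mode_choice", "households": "initialize_households", "trips": "trip_mode_choice"},
]

def checkpoint_for_table(table_name, checkpoint_name):
    # locate the requested checkpoint record
    checkpoint = next(
        (c for c in checkpoints if c["checkpoint_name"] == checkpoint_name), None
    )
    if checkpoint is None:
        raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name)
    # the record stores, per table, the checkpoint at which that table was last written
    last_written = checkpoint.get(table_name, None)
    if not last_written:
        raise RuntimeError(
            "table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name)
        )
    return last_written

print(checkpoint_for_table("households", "trip_mode_choice"))  # -> 'initialize_households'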
- - if df.columns.duplicated().any(): - logger.error( - "replace_table: dataframe '%s' has duplicate columns: %s" - % (table_name, df.columns[df.columns.duplicated()]) - ) - - raise RuntimeError( - "replace_table: dataframe '%s' has duplicate columns: %s" - % (table_name, df.columns[df.columns.duplicated()]) - ) - - rewrap(table_name, df) - - _PIPELINE.replaced_tables[table_name] = True - - -def extend_table(table_name, df, axis=0): - """ - add new table or extend (add rows) to an existing table - - Parameters - ---------- - table_name : str - orca/inject table name - df : pandas DataFrame - """ - - assert is_open(), f"Pipeline is not open." - - assert axis in [0, 1] - - if orca.is_table(table_name): - - table_df = orca.get_table(table_name).to_frame() - - if axis == 0: - # don't expect indexes to overlap - assert len(table_df.index.intersection(df.index)) == 0 - missing_df_str_columns = [ - c - for c in table_df.columns - if c not in df.columns and table_df[c].dtype == "O" - ] - else: - # expect indexes be same - assert table_df.index.equals(df.index) - new_df_columns = [c for c in df.columns if c not in table_df.columns] - df = df[new_df_columns] - - # preserve existing column order - df = pd.concat([table_df, df], sort=False, axis=axis) - - # backfill missing df columns that were str (object) type in table_df - if axis == 0: - for c in missing_df_str_columns: - df[c] = df[c].fillna("") - - replace_table(table_name, df) - - return df - - -def drop_table(table_name): - - assert is_open(), f"Pipeline is not open." - - if orca.is_table(table_name): - - logger.debug("drop_table dropping orca table '%s'" % table_name) - - # don't trigger function call of TableFuncWrapper - t = orca.get_raw_table(table_name) - t.clear_cached() - - for column_name in orca.list_columns_for_table(table_name): - # logger.debug("pop %s.%s: %s" % (table_name, column_name, t.column_type(column_name))) - orca._COLUMNS.pop((table_name, column_name), None) - - # remove from orca's table list - orca._TABLES.pop(table_name, None) - - if table_name in _PIPELINE.replaced_tables: - - logger.debug("drop_table forgetting replaced_tables '%s'" % table_name) - del _PIPELINE.replaced_tables[table_name] - - if table_name in _PIPELINE.last_checkpoint: - - logger.debug("drop_table removing table %s from last_checkpoint" % table_name) - - _PIPELINE.last_checkpoint[table_name] = "" - - -def is_table(table_name): - return orca.is_table(table_name) - - -def cleanup_pipeline(): - """ - Cleanup pipeline after successful run - - Open main pipeline if not already open (will be closed if multiprocess) - Create a single-checkpoint pipeline file with latest version of all checkpointed tables, - Delete main pipeline and any subprocess pipelines - - Called if cleanup_pipeline_after_run setting is True - - Returns - ------- - nothing, but with changed state: pipeline file that was open on call is closed and deleted - - """ - # we don't expect to be called unless cleanup_pipeline_after_run setting is True - assert config.setting("cleanup_pipeline_after_run", False) - - if not is_open(): - open_pipeline("_") - - assert is_open(), f"Pipeline is not open." 
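A short, self-contained illustration of the row-extension semantics of extend_table() above, assuming non-overlapping indexes on axis=0 and backfilling string (object) columns missing from the new rows with "" rather than NaN. The data frames are illustrative.

import pandas as pd

existing = pd.DataFrame({"income": [50, 60], "segment": ["low", "mid"]}, index=[1, 2])
new_rows = pd.DataFrame({"income": [70]}, index=[3])  # no 'segment' column

# indexes must not overlap when appending rows
assert len(existing.index.intersection(new_rows.index)) == 0

# object (string) columns absent from the new rows get backfilled with ""
missing_str_cols = [
    c for c in existing.columns
    if c not in new_rows.columns and existing[c].dtype == "O"
]
combined = pd.concat([existing, new_rows], sort=False, axis=0)
for c in missing_str_cols:
    combined[c] = combined[c].fillna("")

print(combined)
#    income segment
# 1      50     low
# 2      60     mid
# 3      70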
- - FINAL_PIPELINE_FILE_NAME = ( - f"final_{inject.get_injectable('pipeline_file_name', 'pipeline')}" - ) - FINAL_CHECKPOINT_NAME = "final" - - final_pipeline_file_path = config.build_output_file_path(FINAL_PIPELINE_FILE_NAME) - - # keep only the last row of checkpoints and patch the last checkpoint name - checkpoints_df = get_checkpoints().tail(1).copy() - checkpoints_df["checkpoint_name"] = FINAL_CHECKPOINT_NAME - - with pd.HDFStore(final_pipeline_file_path, mode="w") as final_pipeline_store: - - for table_name in checkpointed_tables(): - # patch last checkpoint name for all tables - checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME - - table_df = get_table(table_name) - logger.debug( - f"cleanup_pipeline - adding table {table_name} {table_df.shape}" - ) - - final_pipeline_store[table_name] = table_df - - final_pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df - - close_pipeline() - - logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") - tracing.delete_output_files("h5", ignore=[final_pipeline_file_path]) diff --git a/activitysim/core/random.py b/activitysim/core/random.py index dc34e27291..9960e98e27 100644 --- a/activitysim/core/random.py +++ b/activitysim/core/random.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import hashlib import logging @@ -66,7 +67,6 @@ class SimpleChannel(object): """ def __init__(self, channel_name, base_seed, domain_df, step_name): - self.base_seed = base_seed # ensure that every channel is different, even for the same df index values and max_steps @@ -100,7 +100,6 @@ def init_row_states_for_step(self, row_states): assert self.step_name if self.step_name and not row_states.empty: - row_states["row_seed"] = ( self.base_seed + self.channel_seed + self.step_seed + row_states.index ) % _MAX_SEED @@ -164,7 +163,6 @@ def begin_step(self, step_name): self.multi_choice_offset = None def end_step(self, step_name): - assert self.step_name == step_name self.step_name = None @@ -197,7 +195,6 @@ def _generators_for_df(self, df): prng = np.random.RandomState() for row in df_row_states.itertuples(): - prng.seed(row.row_seed) if row.offset: @@ -375,7 +372,6 @@ def choice_for_df(self, df, step_name, a, size, replace): class Random(object): def __init__(self): - self.channels = {} # dict mapping df index name to channel name @@ -415,7 +411,8 @@ def begin_step(self, step_name): pipeline step name """ - assert self.step_name is None + if self.step_name is not None: + raise ValueError(f"already in step {self.step_name}") assert step_name is not None self.step_name = step_name @@ -438,7 +435,9 @@ def end_step(self, step_name): step_name : str name of current step (just a consistency check) """ - assert self.step_name is not None + if self.step_name is None: + # maybe a step was aborted, this is fine + return assert self.step_name == step_name for c in self.channels: @@ -470,7 +469,6 @@ def add_channel(self, channel_name, domain_df): """ if channel_name in self.channels: - assert channel_name == self.index_to_channel[domain_df.index.name] logger.debug( "Random: extending channel '%s' %s ids" diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 6c0fb902d9..1763d17bf2 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -1,18 +1,30 @@ # ActivitySim # See full license in LICENSE.txt. 
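The random.py hunks above change the step bookkeeping from bare asserts to a friendlier guard. The toy class below is an illustrative restatement of that behavior, not the library class: re-entering a step raises ValueError, and ending a step that was aborted is tolerated instead of asserting.

class StepGuard:
    """Minimal stand-in for the step tracking in Random.begin_step/end_step."""

    def __init__(self):
        self.step_name = None

    def begin_step(self, step_name):
        if self.step_name is not None:
            raise ValueError(f"already in step {self.step_name}")
        self.step_name = step_name

    def end_step(self, step_name):
        if self.step_name is None:
            # the step may have been aborted; nothing to clean up
            return
        assert self.step_name == step_name
        self.step_name = None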
+from __future__ import annotations import logging import time import warnings -from builtins import range from collections import OrderedDict +from collections.abc import Callable from datetime import timedelta import numpy as np import pandas as pd -from . import assign, chunk, config, logit, pathbuilder, pipeline, tracing, util -from .simulate_consts import ( +from activitysim.core import ( + assign, + chunk, + config, + configuration, + logit, + pathbuilder, + tracing, + util, + workflow, +) +from activitysim.core.estimation import Estimator +from activitysim.core.simulate_consts import ( ALT_LOSER_UTIL, SPEC_DESCRIPTION_NAME, SPEC_EXPRESSION_NAME, @@ -21,12 +33,16 @@ logger = logging.getLogger(__name__) +CustomChooser_T = Callable[ + [workflow.State, pd.DataFrame, pd.DataFrame, pd.DataFrame, str], + tuple[pd.Series, pd.Series], +] -def random_rows(df, n): +def random_rows(state: workflow.State, df, n): # only sample if df has more than n rows if len(df.index) > n: - prng = pipeline.get_rn_generator().get_global_rng() + prng = state.get_rn_generator().get_global_rng() return df.take(prng.choice(len(df), size=n, replace=False)) else: @@ -34,7 +50,6 @@ def random_rows(df, n): def uniquify_spec_index(spec): - # uniquify spec index inplace # ensure uniqueness of spec index by appending comment with dupe count # this allows us to use pandas dot to compute_utilities @@ -49,15 +64,15 @@ def uniquify_spec_index(spec): assert spec.index.is_unique -def read_model_alts(file_name, set_index=None): - file_path = config.config_file_path(file_name) +def read_model_alts(state: workflow.State, file_name, set_index=None): + file_path = state.filesystem.get_config_file_path(file_name) df = pd.read_csv(file_path, comment="#") if set_index: df.set_index(set_index, inplace=True) return df -def read_model_spec(file_name): +def read_model_spec(filesystem: configuration.FileSystem, file_name: str): """ Read a CSV model specification into a Pandas DataFrame or Series. 
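The new CustomChooser_T alias spells out the calling convention now expected of custom choosers: they receive the workflow State plus probs, choosers, spec, and the trace label, and return (choices, rands). The function below is a toy example of a conforming callable (it simply picks the highest-probability alternative); the data and the 0.5 stand-in for consumed randoms are illustrative.

import pandas as pd

def my_custom_chooser(state, probs, choosers, spec, trace_label):
    # toy rule: pick the highest-probability alternative for every chooser
    choices = pd.Series(probs.columns[probs.values.argmax(axis=1)], index=probs.index)
    rands = pd.Series(0.5, index=probs.index)  # stand-in for the consumed random draws
    return choices, rands

probs = pd.DataFrame({"DRIVE": [0.7, 0.2], "WALK": [0.3, 0.8]}, index=[101, 102])
choices, rands = my_custom_chooser(None, probs, None, None, "demo")
print(choices)  # 101 -> DRIVE, 102 -> WALK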
@@ -91,9 +106,9 @@ def read_model_spec(file_name): assert isinstance(file_name, str) if not file_name.lower().endswith(".csv"): - file_name = "%s.csv" % (file_name,) + file_name = f"{file_name}.csv" - file_path = config.config_file_path(file_name) + file_path = filesystem.get_config_file_path(file_name) try: spec = pd.read_csv(file_path, comment="#") @@ -121,10 +136,13 @@ def read_model_spec(file_name): return spec -def read_model_coefficients(model_settings=None, file_name=None): +def read_model_coefficients( + filesystem: configuration.FileSystem, model_settings=None, file_name=None +): """ Read the coefficient file specified by COEFFICIENTS model setting """ + assert isinstance(filesystem, configuration.FileSystem) if model_settings is None: assert file_name is not None @@ -138,7 +156,7 @@ def read_model_coefficients(model_settings=None, file_name=None): file_name = model_settings["COEFFICIENTS"] logger.debug(f"read_model_coefficients file_name {file_name}") - file_path = config.config_file_path(file_name) + file_path = filesystem.get_config_file_path(file_name) try: coefficients = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: @@ -161,7 +179,14 @@ def read_model_coefficients(model_settings=None, file_name=None): return coefficients -def spec_for_segment(model_settings, spec_id, segment_name, estimator): +@workflow.func +def spec_for_segment( + state: workflow.State, + model_settings, + spec_id: str, + segment_name: str, + estimator: Estimator | None, +) -> pd.DataFrame: """ Select spec for specified segment from omnibus spec containing columns for each segment @@ -179,7 +204,7 @@ def spec_for_segment(model_settings, spec_id, segment_name, estimator): """ spec_file_name = model_settings[spec_id] - spec = read_model_spec(file_name=spec_file_name) + spec = read_model_spec(state.filesystem, file_name=spec_file_name) if len(spec.columns) > 1: # if spec is segmented @@ -203,14 +228,16 @@ def spec_for_segment(model_settings, spec_id, segment_name, estimator): return spec - coefficients = read_model_coefficients(model_settings) + coefficients = state.filesystem.read_model_coefficients(model_settings) - spec = eval_coefficients(spec, coefficients, estimator) + spec = eval_coefficients(state, spec, coefficients, estimator) return spec -def read_model_coefficient_template(model_settings): +def read_model_coefficient_template( + filesystem: configuration.FileSystem, model_settings +): """ Read the coefficient template specified by COEFFICIENT_TEMPLATE model setting """ @@ -223,7 +250,7 @@ def read_model_coefficient_template(model_settings): coefficients_file_name = model_settings["COEFFICIENT_TEMPLATE"] - file_path = config.config_file_path(coefficients_file_name) + file_path = filesystem.get_config_file_path(coefficients_file_name) try: template = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: @@ -250,29 +277,31 @@ def read_model_coefficient_template(model_settings): return template -def dump_mapped_coefficients(model_settings): +def dump_mapped_coefficients(state: workflow.State, model_settings): """ dump template_df with coefficient values """ - coefficients_df = read_model_coefficients(model_settings) - template_df = read_model_coefficient_template(model_settings) + coefficients_df = state.filesystem.read_model_coefficients(model_settings) + template_df = read_model_coefficient_template(state.filesystem, model_settings) for c in template_df.columns: template_df[c] = template_df[c].map(coefficients_df.value) 
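To make the coefficient plumbing above concrete, here is a self-contained pandas sketch of the mapping dump_mapped_coefficients() performs: each segment column of the coefficient template holds coefficient names, which are replaced by the numeric values from the coefficients file. Names and values are illustrative.

import pandas as pd

coefficients_df = pd.DataFrame(
    {"value": [-0.5, -1.2]},
    index=pd.Index(["coef_ivt", "coef_cost"], name="coefficient_name"),
)
template_df = pd.DataFrame(
    {"work": ["coef_ivt", "coef_cost"], "school": ["coef_ivt", "coef_cost"]},
    index=pd.Index(["in_vehicle_time", "cost"], name="coefficient_name"),
)

# replace coefficient names with their numeric values, segment by segment
for c in template_df.columns:
    template_df[c] = template_df[c].map(coefficients_df.value)

print(template_df)
#                   work  school
# coefficient_name
# in_vehicle_time   -0.5    -0.5
# cost              -1.2    -1.2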
coefficients_template_file_name = model_settings["COEFFICIENT_TEMPLATE"] - file_path = config.output_file_path(coefficients_template_file_name) + file_path = state.get_output_file_path(coefficients_template_file_name) template_df.to_csv(file_path, index=True) logger.info(f"wrote mapped coefficient template to {file_path}") coefficients_file_name = model_settings["COEFFICIENTS"] - file_path = config.output_file_path(coefficients_file_name) + file_path = state.get_output_file_path(coefficients_file_name) coefficients_df.to_csv(file_path, index=True) logger.info(f"wrote raw coefficients to {file_path}") -def get_segment_coefficients(model_settings, segment_name): +def get_segment_coefficients( + filesystem: configuration.FileSystem, model_settings, segment_name +): """ Return a dict mapping generic coefficient names to segment-specific coefficient values @@ -325,7 +354,9 @@ def get_segment_coefficients(model_settings, segment_name): if legacy: constants = config.get_model_constants(model_settings) - legacy_coeffs_file_path = config.config_file_path(model_settings[legacy]) + legacy_coeffs_file_path = filesystem.get_config_file_path( + model_settings[legacy] + ) omnibus_coefficients = pd.read_csv( legacy_coeffs_file_path, comment="#", index_col="coefficient_name" ) @@ -333,8 +364,8 @@ def get_segment_coefficients(model_settings, segment_name): omnibus_coefficients[segment_name], constants=constants ) else: - coefficients_df = read_model_coefficients(model_settings) - template_df = read_model_coefficient_template(model_settings) + coefficients_df = filesystem.read_model_coefficients(model_settings) + template_df = read_model_coefficient_template(filesystem, model_settings) coefficients_col = ( template_df[segment_name].map(coefficients_df.value).astype(float) ) @@ -355,7 +386,6 @@ def get_segment_coefficients(model_settings, segment_name): def eval_nest_coefficients(nest_spec, coefficients, trace_label): def replace_coefficients(nest): if isinstance(nest, dict): - assert "coefficient" in nest coefficient_name = nest["coefficient"] if isinstance(coefficient_name, str): @@ -380,8 +410,12 @@ def replace_coefficients(nest): return nest_spec -def eval_coefficients(spec, coefficients, estimator): - +def eval_coefficients( + state: workflow.State, + spec: pd.DataFrame, + coefficients: dict | pd.DataFrame, + estimator: Estimator | None, +) -> pd.DataFrame: spec = spec.copy() # don't clobber input spec if isinstance(coefficients, pd.DataFrame): @@ -399,7 +433,7 @@ def eval_coefficients(spec, coefficients, estimator): spec[c].apply(lambda x: eval(str(x), {}, coefficients)).astype(np.float32) ) - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = state.settings.sharrow if sharrow_enabled: # keep all zero rows, reduces the number of unique flows to compile and store. return spec @@ -418,6 +452,7 @@ def eval_coefficients(spec, coefficients, estimator): def eval_utilities( + state, spec, choosers, locals_d=None, @@ -429,6 +464,8 @@ def eval_utilities( log_alt_losers=False, zone_layer=None, spec_sh=None, + *, + chunk_sizer, ): """ Evaluate a utility function as defined in a spec file. 
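A self-contained sketch of what eval_coefficients() does to a segmented spec: each utility cell holds a coefficient name (or a literal), which is eval'd against the coefficient mapping and cast to float32, mirroring the apply/eval line in the hunk above. Spec rows and values are illustrative.

import numpy as np
import pandas as pd

spec = pd.DataFrame(
    {"DRIVE": ["coef_ivt", "coef_cost"], "WALK": ["0", "coef_cost"]},
    index=pd.Index(["@df.in_vehicle_time", "@df.cost"], name="Expression"),
)
coefficients = {"coef_ivt": -0.025, "coef_cost": -0.003}

# resolve coefficient names (or literals) to numeric values for each alternative column
for c in spec.columns:
    spec[c] = (
        spec[c].apply(lambda x: eval(str(x), {}, coefficients)).astype(np.float32)
    )

print(spec)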
@@ -475,7 +512,7 @@ def eval_utilities( """ start_time = time.time() - sharrow_enabled = config.setting("sharrow", False) + sharrow_enabled = state.settings.sharrow expression_values = None @@ -496,10 +533,11 @@ def eval_utilities( from .flow import apply_flow # import inside func to prevent circular imports locals_dict = {} - locals_dict.update(config.get_global_constants()) + locals_dict.update(state.get_global_constants()) if locals_d is not None: locals_dict.update(locals_d) sh_util, sh_flow = apply_flow( + state, spec_sh, choosers, locals_dict, @@ -515,11 +553,10 @@ def eval_utilities( # fixme - restore tracing and _check_for_variability if utilities is None or estimator or sharrow_enabled == "test": - trace_label = tracing.extend_trace_label(trace_label, "eval_utils") # avoid altering caller's passed-in locals_d parameter (they may be looping) - locals_dict = assign.local_utilities() + locals_dict = assign.local_utilities(state) if locals_d is not None: locals_dict.update(locals_d) @@ -535,11 +572,10 @@ def eval_utilities( exprs = spec.index expression_values = np.empty((spec.shape[0], choosers.shape[0])) - chunk.log_df(trace_label, "expression_values", expression_values) + chunk_sizer.log_df(trace_label, "expression_values", expression_values) i = 0 for expr, coefficients in zip(exprs, spec.values): - try: with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. @@ -576,7 +612,7 @@ def eval_utilities( expression_values[i] = expression_value i += 1 - chunk.log_df(trace_label, "expression_values", expression_values) + chunk_sizer.log_df(trace_label, "expression_values", expression_values) if estimator: df = pd.DataFrame( @@ -597,17 +633,16 @@ def eval_utilities( timelogger.mark("simple flow", False) utilities = pd.DataFrame(data=utilities, index=choosers.index, columns=spec.columns) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) timelogger.mark("assemble utilities") # sometimes tvpb will drop rows on the fly and we wind up with an empty # table of choosers. this will just bypass tracing in that case. if (trace_all_rows or have_trace_targets) and (len(choosers) > 0): - if trace_all_rows: trace_targets = pd.Series(True, index=choosers.index) else: - trace_targets = tracing.trace_targets(choosers) + trace_targets = state.tracing.trace_targets(choosers) assert trace_targets.any() # since they claimed to have targets... 
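For orientation only, a simplified standalone sketch of the legacy (non-sharrow) path inside eval_utilities(): evaluate each spec expression against the choosers to fill an (expressions x choosers) array, then combine it with the per-alternative coefficient columns to produce a choosers-by-alternatives utility table. The real function also handles non-'@' expressions, sharrow flows, chunk logging, tracing, and estimation; the data here is illustrative and the plain dot product is a simplification.

import numpy as np
import pandas as pd

choosers = pd.DataFrame(
    {"in_vehicle_time": [10.0, 30.0], "cost": [2.5, 1.0]}, index=[101, 102]
)
# rows are expressions, columns are alternatives with already-resolved coefficients
spec = pd.DataFrame(
    {"DRIVE": [-0.025, -0.003], "WALK": [-0.08, 0.0]},
    index=["@df.in_vehicle_time", "@df.cost"],
)

expression_values = np.empty((len(spec), len(choosers)))
for i, expr in enumerate(spec.index):
    # '@' expressions are evaluated as Python with the chooser table bound to df
    expression_values[i] = eval(expr[1:], {}, {"df": choosers})

utilities = pd.DataFrame(
    expression_values.T @ spec.values, index=choosers.index, columns=spec.columns
)
print(utilities)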
# get int offsets of the trace_targets (offsets of bool=True values) @@ -647,14 +682,14 @@ def eval_utilities( expression_values_df = None if expression_values_sh is not None: - tracing.trace_df( + state.tracing.trace_df( expression_values_sh, tracing.extend_trace_label(trace_label, "expression_values_sh"), slicer=None, transpose=False, ) if expression_values_df is not None: - tracing.trace_df( + state.tracing.trace_df( expression_values_df, tracing.extend_trace_label(trace_label, "expression_values"), slicer=None, @@ -662,11 +697,10 @@ def eval_utilities( ) if len(spec.columns) > 1: - for c in spec.columns: name = f"expression_value_{c}" - tracing.trace_df( + state.tracing.trace_df( expression_values_df.multiply(spec[c].values, axis=0), tracing.extend_trace_label(trace_label, name), slicer=None, @@ -717,10 +751,10 @@ def eval_utilities( timelogger.mark("sharrow test", True, logger, trace_label) del expression_values - chunk.log_df(trace_label, "expression_values", None) + chunk_sizer.log_df(trace_label, "expression_values", None) # no longer our problem - but our caller should re-log this... - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) end_time = time.time() logger.info( @@ -730,7 +764,7 @@ def eval_utilities( return utilities -def eval_variables(exprs, df, locals_d=None): +def eval_variables(state: workflow.State, exprs, df, locals_d=None): """ Evaluate a set of variable expressions from a spec in the context of a given data table. @@ -765,7 +799,7 @@ def eval_variables(exprs, df, locals_d=None): """ # avoid altering caller's passed-in locals_d parameter (they may be looping) - locals_dict = assign.local_utilities() + locals_dict = assign.local_utilities(state) if locals_d is not None: locals_dict.update(locals_d) globals_dict = {} @@ -773,7 +807,6 @@ def eval_variables(exprs, df, locals_d=None): locals_dict["df"] = df def to_array(x): - if x is None or np.isscalar(x): a = np.asanyarray([x] * len(df.index)) elif isinstance(x, pd.Series): @@ -875,46 +908,47 @@ def set_skim_wrapper_targets(df, skims): pass -def _check_for_variability(expression_values, trace_label): - """ - This is an internal method which checks for variability in each - expression - under the assumption that you probably wouldn't be using a - variable (in live simulations) if it had no variability. This is a - warning to the user that they might have constructed the variable - incorrectly. It samples 1000 rows in order to not hurt performance - - it's likely that if 1000 rows have no variability, the whole dataframe - will have no variability. - """ - - if trace_label is None: - trace_label = "_check_for_variability" - - sample = random_rows(expression_values, min(1000, len(expression_values))) - - no_variability = has_missing_vals = 0 - for i in range(len(sample.columns)): - v = sample.iloc[:, i] - if v.min() == v.max(): - col_name = sample.columns[i] - logger.info( - "%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name) - ) - no_variability += 1 - # FIXME - how could this happen? Not sure it is really a problem? 
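Similarly, a simplified standalone sketch of eval_variables() above: each '@' expression is evaluated in a namespace where df is the chooser table (plus numpy), and the per-row results are collected into a values frame. The real function also coerces scalars and handles locals passed by the caller; expressions and data here are illustrative.

import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [30, 8, 45], "income": [50_000, 0, 90_000]})
exprs = ["@df.age >= 18", "@np.log1p(df.income)"]

locals_dict = {"df": df, "np": np}
values = pd.DataFrame(index=df.index)
for expr in exprs:
    # strip the leading '@' and evaluate against the chooser table
    values[expr] = np.asanyarray(eval(expr[1:], {}, locals_dict))

print(values)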
- if np.count_nonzero(v.isnull().values) > 0: - col_name = sample.columns[i] - logger.info("%s: missing values in: %s" % (trace_label, col_name)) - has_missing_vals += 1 - - if no_variability > 0: - logger.warning( - "%s: %s columns have no variability" % (trace_label, no_variability) - ) - - if has_missing_vals > 0: - logger.warning( - "%s: %s columns have missing values" % (trace_label, has_missing_vals) - ) +# +# def _check_for_variability(expression_values, trace_label): +# """ +# This is an internal method which checks for variability in each +# expression - under the assumption that you probably wouldn't be using a +# variable (in live simulations) if it had no variability. This is a +# warning to the user that they might have constructed the variable +# incorrectly. It samples 1000 rows in order to not hurt performance - +# it's likely that if 1000 rows have no variability, the whole dataframe +# will have no variability. +# """ +# +# if trace_label is None: +# trace_label = "_check_for_variability" +# +# sample = random_rows(expression_values, min(1000, len(expression_values))) +# +# no_variability = has_missing_vals = 0 +# for i in range(len(sample.columns)): +# v = sample.iloc[:, i] +# if v.min() == v.max(): +# col_name = sample.columns[i] +# logger.info( +# "%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name) +# ) +# no_variability += 1 +# # FIXME - how could this happen? Not sure it is really a problem? +# if np.count_nonzero(v.isnull().values) > 0: +# col_name = sample.columns[i] +# logger.info("%s: missing values in: %s" % (trace_label, col_name)) +# has_missing_vals += 1 +# +# if no_variability > 0: +# logger.warning( +# "%s: %s columns have no variability" % (trace_label, no_variability) +# ) +# +# if has_missing_vals > 0: +# logger.warning( +# "%s: %s columns have missing values" % (trace_label, has_missing_vals) +# ) def compute_nested_exp_utilities(raw_utilities, nest_spec): @@ -942,7 +976,6 @@ def compute_nested_exp_utilities(raw_utilities, nest_spec): nested_utilities = pd.DataFrame(index=raw_utilities.index) for nest in logit.each_nest(nest_spec, post_order=True): - name = nest.name if nest.is_leaf: @@ -968,7 +1001,9 @@ def compute_nested_exp_utilities(raw_utilities, nest_spec): return nested_utilities -def compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label): +def compute_nested_probabilities( + state: workflow.State, nested_exp_utilities, nest_spec, trace_label +): """ compute nested probabilities for nest leafs and nodes probability for nest alternatives is simply the alternatives's local (to nest) probability @@ -991,8 +1026,8 @@ def compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label): nested_probabilities = pd.DataFrame(index=nested_exp_utilities.index) for nest in logit.each_nest(nest_spec, type="node", post_order=False): - probs = logit.utils_to_probs( + state, nested_exp_utilities[nest.alternatives], trace_label=trace_label, exponentiated=True, @@ -1028,7 +1063,6 @@ def compute_base_probabilities(nested_probabilities, nests, spec): base_probabilities = pd.DataFrame(index=nested_probabilities.index) for nest in logit.each_nest(nests, type="leaf", post_order=False): - # skip root: it has a prob of 1 but we didn't compute a nested probability column for it ancestors = nest.ancestors[1:] @@ -1043,16 +1077,19 @@ def compute_base_probabilities(nested_probabilities, nests, spec): def eval_mnl( + state: workflow.State, choosers, spec, locals_d, - custom_chooser, + custom_chooser: CustomChooser_T, estimator, 
log_alt_losers=False, want_logsums=False, trace_label=None, trace_choice_name=None, trace_column_names=None, + *, + chunk_sizer, ): """ Run a simulation for when the model spec does not involve alternative @@ -1077,7 +1114,7 @@ def eval_mnl( locals_d : Dict or None This is a dictionary of local variables that will be the environment for an evaluation of an expression that begins with @ - custom_chooser : function(probs, choosers, spec, trace_label) returns choices, rands + custom_chooser : function(state, probs, choosers, spec, trace_label) returns choices, rands custom alternative to logit.make_choices estimator : Estimator object called to report intermediate table results (used for estimation) @@ -1100,12 +1137,13 @@ def eval_mnl( assert not want_logsums trace_label = tracing.extend_trace_label(trace_label, "eval_mnl") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = state.tracing.has_trace_targets(choosers) if have_trace_targets: - tracing.trace_df(choosers, "%s.choosers" % trace_label) + state.tracing.trace_df(choosers, "%s.choosers" % trace_label) utilities = eval_utilities( + state, spec, choosers, locals_d, @@ -1114,63 +1152,65 @@ def eval_mnl( have_trace_targets=have_trace_targets, estimator=estimator, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( utilities, "%s.utilities" % trace_label, column_labels=["alternative", "utility"], ) probs = logit.utils_to_probs( - utilities, trace_label=trace_label, trace_choosers=choosers + state, utilities, trace_label=trace_label, trace_choosers=choosers ) - chunk.log_df(trace_label, "probs", probs) + chunk_sizer.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, "utilities", None) + chunk_sizer.log_df(trace_label, "utilities", None) if have_trace_targets: # report these now in case make_choices throws error on bad_choices - tracing.trace_df( + state.tracing.trace_df( probs, "%s.probs" % trace_label, column_labels=["alternative", "probability"], ) if custom_chooser: - choices, rands = custom_chooser( - probs=probs, choosers=choosers, spec=spec, trace_label=trace_label - ) + choices, rands = custom_chooser(state, probs, choosers, spec, trace_label) else: - choices, rands = logit.make_choices(probs, trace_label=trace_label) + choices, rands = logit.make_choices(state, probs, trace_label=trace_label) del probs - chunk.log_df(trace_label, "probs", None) + chunk_sizer.log_df(trace_label, "probs", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, "%s.choices" % trace_label, columns=[None, trace_choice_name] ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + state.tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) return choices def eval_nl( + state: workflow.State, choosers, spec, nest_spec, locals_d, - custom_chooser, + custom_chooser: CustomChooser_T, estimator, log_alt_losers=False, want_logsums=False, trace_label=None, trace_choice_name=None, trace_column_names=None, + *, + chunk_sizer: chunk.ChunkSizer, ): """ Run a nested-logit simulation for when the model spec does not involve alternative @@ -1211,16 +1251,17 @@ def eval_nl( trace_label = tracing.extend_trace_label(trace_label, "eval_nl") assert trace_label - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = 
state.tracing.has_trace_targets(choosers) logit.validate_nest_spec(nest_spec, trace_label) if have_trace_targets: - tracing.trace_df(choosers, "%s.choosers" % trace_label) + state.tracing.trace_df(choosers, "%s.choosers" % trace_label) choosers, spec_sh = _preprocess_tvpb_logsums_on_choosers(choosers, spec, locals_d) raw_utilities = eval_utilities( + state, spec_sh, choosers, locals_d, @@ -1230,11 +1271,12 @@ def eval_nl( estimator=estimator, trace_column_names=trace_column_names, spec_sh=spec_sh, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "raw_utilities", raw_utilities) + chunk_sizer.log_df(trace_label, "raw_utilities", raw_utilities) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( raw_utilities, "%s.raw_utilities" % trace_label, column_labels=["alternative", "utility"], @@ -1242,13 +1284,13 @@ def eval_nl( # exponentiated utilities of leaves and nests nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec) - chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) del raw_utilities - chunk.log_df(trace_label, "raw_utilities", None) + chunk_sizer.log_df(trace_label, "raw_utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( nested_exp_utilities, "%s.nested_exp_utilities" % trace_label, column_labels=["alternative", "utility"], @@ -1256,20 +1298,20 @@ def eval_nl( # probabilities of alternatives relative to siblings sharing the same nest nested_probabilities = compute_nested_probabilities( - nested_exp_utilities, nest_spec, trace_label=trace_label + state, nested_exp_utilities, nest_spec, trace_label=trace_label ) - chunk.log_df(trace_label, "nested_probabilities", nested_probabilities) + chunk_sizer.log_df(trace_label, "nested_probabilities", nested_probabilities) if want_logsums: # logsum of nest root logsums = pd.Series(np.log(nested_exp_utilities.root), index=choosers.index) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) del nested_exp_utilities - chunk.log_df(trace_label, "nested_exp_utilities", None) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( nested_probabilities, "%s.nested_probabilities" % trace_label, column_labels=["alternative", "probability"], @@ -1279,13 +1321,13 @@ def eval_nl( base_probabilities = compute_base_probabilities( nested_probabilities, nest_spec, spec ) - chunk.log_df(trace_label, "base_probabilities", base_probabilities) + chunk_sizer.log_df(trace_label, "base_probabilities", base_probabilities) del nested_probabilities - chunk.log_df(trace_label, "nested_probabilities", None) + chunk_sizer.log_df(trace_label, "nested_probabilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( base_probabilities, "%s.base_probabilities" % trace_label, column_labels=["alternative", "probability"], @@ -1297,8 +1339,8 @@ def eval_nl( no_choices = (base_probabilities.sum(axis=1) - 1).abs() > BAD_PROB_THRESHOLD if no_choices.any(): - logit.report_bad_choices( + state, no_choices, base_probabilities, trace_label=tracing.extend_trace_label(trace_label, "bad_probs"), @@ -1308,25 +1350,28 @@ def eval_nl( if custom_chooser: choices, rands = custom_chooser( - probs=base_probabilities, - choosers=choosers, - spec=spec, - trace_label=trace_label, + state, + base_probabilities, + choosers, + spec, + trace_label, ) else: - choices, rands = 
logit.make_choices(base_probabilities, trace_label=trace_label) + choices, rands = logit.make_choices( + state, base_probabilities, trace_label=trace_label + ) del base_probabilities - chunk.log_df(trace_label, "base_probabilities", None) + chunk_sizer.log_df(trace_label, "base_probabilities", None) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( choices, "%s.choices" % trace_label, columns=[None, trace_choice_name] ) - tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) + state.tracing.trace_df(rands, f"{trace_label}.rands", columns=[None, "rand"]) if want_logsums: - tracing.trace_df( - logsums, "%s.logsums" % trace_label, columns=[None, "logsum"] + state.tracing.trace_df( + logsums, f"{trace_label}.logsums", columns=[None, "logsum"] ) if want_logsums: @@ -1336,19 +1381,23 @@ def eval_nl( return choices +@workflow.func def _simple_simulate( + state: workflow.State, choosers, spec, nest_spec, skims=None, locals_d=None, - custom_chooser=None, + custom_chooser: CustomChooser_T = None, log_alt_losers=False, want_logsums=False, estimator=None, trace_label=None, trace_choice_name=None, trace_column_names=None, + *, + chunk_sizer, ): """ Run an MNL or NL simulation for when the model spec does not involve alternative @@ -1376,7 +1425,7 @@ def _simple_simulate( locals_d : Dict This is a dictionary of local variables that will be the environment for an evaluation of an expression that begins with @ - custom_chooser : Estimator object + custom_chooser : CustomChooser_T estimator : function(df, label, table_name) called to report intermediate table results (used for estimation) @@ -1400,6 +1449,7 @@ def _simple_simulate( if nest_spec is None: choices = eval_mnl( + state, choosers, spec, locals_d, @@ -1410,9 +1460,11 @@ def _simple_simulate( trace_label=trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) else: choices = eval_nl( + state, choosers, spec, nest_spec, @@ -1424,6 +1476,7 @@ def _simple_simulate( trace_label=trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) return choices @@ -1449,12 +1502,12 @@ def list_of_skims(skims): def simple_simulate( + state: workflow.State, choosers, spec, nest_spec, skims=None, locals_d=None, - chunk_size=0, custom_chooser=None, log_alt_losers=False, want_logsums=False, @@ -1475,11 +1528,14 @@ def simple_simulate( result_list = [] # segment by person type and pick the right spec for each person type - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label - ): - + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers(state, choosers, trace_label): choices = _simple_simulate( + state, chooser_chunk, spec, nest_spec, @@ -1492,11 +1548,12 @@ def simple_simulate( trace_label=chunk_trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -1507,12 +1564,12 @@ def simple_simulate( def simple_simulate_by_chunk_id( + state: workflow.State, choosers, spec, nest_spec, skims=None, locals_d=None, - chunk_size=0, custom_chooser=None, log_alt_losers=False, want_logsums=False, @@ -1523,15 +1580,16 @@ def simple_simulate_by_chunk_id( """ 
chunk_by_chunk_id wrapper for simple_simulate """ - + choices = None result_list = [] for ( i, chooser_chunk, chunk_trace_label, - ) in chunk.adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label): - + chunk_sizer, + ) in chunk.adaptive_chunked_choosers_by_chunk_id(state, choosers, trace_label): choices = _simple_simulate( + state, chooser_chunk, spec, nest_spec, @@ -1543,11 +1601,12 @@ def simple_simulate_by_chunk_id( estimator=estimator, trace_label=chunk_trace_label, trace_choice_name=trace_choice_name, + chunk_sizer=chunk_sizer, ) result_list.append(choices) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -1555,7 +1614,9 @@ def simple_simulate_by_chunk_id( return choices -def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): +def eval_mnl_logsums( + state: workflow.State, choosers, spec, locals_d, trace_label=None, *, chunk_sizer +): """ like eval_nl except return logsums instead of making choices @@ -1568,21 +1629,27 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): # FIXME - untested and not currently used by any models... trace_label = tracing.extend_trace_label(trace_label, "eval_mnl_logsums") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = state.tracing.has_trace_targets(choosers) logger.debug("running eval_mnl_logsums") # trace choosers if have_trace_targets: - tracing.trace_df(choosers, "%s.choosers" % trace_label) + state.tracing.trace_df(choosers, "%s.choosers" % trace_label) utilities = eval_utilities( - spec, choosers, locals_d, trace_label, have_trace_targets + state, + spec, + choosers, + locals_d, + trace_label, + have_trace_targets, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "utilities", utilities) + chunk_sizer.log_df(trace_label, "utilities", utilities) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( utilities, "%s.raw_utilities" % trace_label, column_labels=["alternative", "utility"], @@ -1592,11 +1659,11 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): # logsum is log of exponentiated utilities summed across columns of each chooser row logsums = np.log(np.exp(utilities.values).sum(axis=1)) logsums = pd.Series(logsums, index=choosers.index) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) # trace utilities if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( logsums, "%s.logsums" % trace_label, column_labels=["alternative", "logsum"] ) @@ -1682,7 +1749,16 @@ def _replace_in_level(multiindex, level_name, *args, **kwargs): return choosers, spec_sh -def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): +def eval_nl_logsums( + state: workflow.State, + choosers, + spec, + nest_spec, + locals_d, + trace_label=None, + *, + chunk_sizer: chunk.ChunkSizer, +): """ like eval_nl except return logsums instead of making choices @@ -1693,7 +1769,7 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): """ trace_label = tracing.extend_trace_label(trace_label, "eval_nl_logsums") - have_trace_targets = tracing.has_trace_targets(choosers) + have_trace_targets = state.tracing.has_trace_targets(choosers) logit.validate_nest_spec(nest_spec, trace_label) @@ -1701,20 +1777,22 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): # trace choosers if have_trace_targets: - 
tracing.trace_df(choosers, "%s.choosers" % trace_label) + state.tracing.trace_df(choosers, "%s.choosers" % trace_label) raw_utilities = eval_utilities( + state, spec_sh, choosers, locals_d, trace_label=trace_label, have_trace_targets=have_trace_targets, spec_sh=spec_sh, + chunk_sizer=chunk_sizer, ) - chunk.log_df(trace_label, "raw_utilities", raw_utilities) + chunk_sizer.log_df(trace_label, "raw_utilities", raw_utilities) if have_trace_targets: - tracing.trace_df( + state.tracing.trace_df( raw_utilities, "%s.raw_utilities" % trace_label, column_labels=["alternative", "utility"], @@ -1722,36 +1800,44 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): # - exponentiated utilities of leaves and nests nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec) - chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) del raw_utilities # done with raw_utilities - chunk.log_df(trace_label, "raw_utilities", None) + chunk_sizer.log_df(trace_label, "raw_utilities", None) # - logsums logsums = np.log(nested_exp_utilities.root) logsums = pd.Series(logsums, index=choosers.index) - chunk.log_df(trace_label, "logsums", logsums) + chunk_sizer.log_df(trace_label, "logsums", logsums) if have_trace_targets: # add logsum to nested_exp_utilities for tracing nested_exp_utilities["logsum"] = logsums - tracing.trace_df( + state.tracing.trace_df( nested_exp_utilities, "%s.nested_exp_utilities" % trace_label, column_labels=["alternative", "utility"], ) - tracing.trace_df( + state.tracing.trace_df( logsums, "%s.logsums" % trace_label, column_labels=["alternative", "logsum"] ) del nested_exp_utilities # done with nested_exp_utilities - chunk.log_df(trace_label, "nested_exp_utilities", None) + chunk_sizer.log_df(trace_label, "nested_exp_utilities", None) return logsums def _simple_simulate_logsums( - choosers, spec, nest_spec, skims=None, locals_d=None, trace_label=None + state: workflow.State, + choosers, + spec, + nest_spec, + skims=None, + locals_d=None, + trace_label=None, + *, + chunk_sizer, ): """ like simple_simulate except return logsums instead of making choices @@ -1766,16 +1852,31 @@ def _simple_simulate_logsums( set_skim_wrapper_targets(choosers, skims) if nest_spec is None: - logsums = eval_mnl_logsums(choosers, spec, locals_d, trace_label=trace_label) + logsums = eval_mnl_logsums( + state, + choosers, + spec, + locals_d, + trace_label=trace_label, + chunk_sizer=chunk_sizer, + ) else: logsums = eval_nl_logsums( - choosers, spec, nest_spec, locals_d, trace_label=trace_label + state, + choosers, + spec, + nest_spec, + locals_d, + trace_label=trace_label, + chunk_sizer=chunk_sizer, ) return logsums +@workflow.func def simple_simulate_logsums( + state: workflow.State, choosers, spec, nest_spec, @@ -1799,17 +1900,28 @@ def simple_simulate_logsums( result_list = [] # segment by person type and pick the right spec for each person type - for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( - choosers, chunk_size, trace_label, chunk_tag + for ( + i, + chooser_chunk, + chunk_trace_label, + chunk_sizer, + ) in chunk.adaptive_chunked_choosers( + state, choosers, trace_label, chunk_tag, chunk_size=chunk_size ): - logsums = _simple_simulate_logsums( - chooser_chunk, spec, nest_spec, skims, locals_d, chunk_trace_label + state, + chooser_chunk, + spec, + nest_spec, + skims, + locals_d, + chunk_trace_label, + chunk_sizer=chunk_sizer, ) 
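The logsum path above reduces to a one-liner worth seeing in isolation: eval_mnl_logsums() takes the log of the summed exponentiated utilities across alternatives (one value per chooser), while eval_nl_logsums() instead takes the log of the root nest's exponentiated utility, as in the hunks above. A standalone numeric sketch with illustrative utilities:

import numpy as np
import pandas as pd

utilities = pd.DataFrame(
    {"DRIVE": [-1.2, -0.4], "WALK": [-2.0, -3.1]}, index=[101, 102]
)

# MNL logsum: log of exponentiated utilities summed across the alternative columns
logsums = pd.Series(
    np.log(np.exp(utilities.values).sum(axis=1)), index=utilities.index
)
print(logsums)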
result_list.append(logsums) - chunk.log_df(trace_label, "result_list", result_list) + chunk_sizer.log_df(trace_label, "result_list", result_list) if len(result_list) > 1: logsums = pd.concat(result_list) diff --git a/activitysim/core/skim_dataset.py b/activitysim/core/skim_dataset.py index 9c28082fce..759ecdead8 100644 --- a/activitysim/core/skim_dataset.py +++ b/activitysim/core/skim_dataset.py @@ -1,15 +1,22 @@ +from __future__ import annotations + import glob import logging import os +import time +from functools import partial +from pathlib import Path import numpy as np import openmatrix import pandas as pd import sharrow as sh +import xarray as xr -from . import config -from . import flow as __flow # noqa, keep this here for side effects? -from . import inject +from activitysim.core import config +from activitysim.core import flow as __flow # noqa: 401 +from activitysim.core import workflow +from activitysim.core.input import read_input_file logger = logging.getLogger(__name__) @@ -110,6 +117,8 @@ def lookup(self, orig, dest, key): orig = np.asanyarray(orig).astype(int) dest = np.asanyarray(dest).astype(int) + some_missing = (orig.min() < 0) or (dest.min() < 0) + # TODO offset mapper if required positions = {self.odim: orig, self.ddim: dest} @@ -126,6 +135,8 @@ def lookup(self, orig, dest, key): **positions, _name=key ) # Dataset.iat as implemented by sharrow strips data encoding + if some_missing: + result[(orig < 0) | (dest < 0)] = np.nan result = result.to_series() if use_index is not None: @@ -229,7 +240,8 @@ def set_df(self, df): } if self.time_key: if ( - np.issubdtype(df[self.time_key].dtype, np.integer) + not df[self.time_key].dtype == "category" + and np.issubdtype(df[self.time_key].dtype, np.integer) and df[self.time_key].max() < self.dataset.dims["time_period"] ): logger.info(f"natural use for time_period={self.time_key}") @@ -237,14 +249,19 @@ def set_df(self, df): else: logger.info(f"vectorize lookup for time_period={self.time_key}") positions["time_period"] = pd.Series( - np.vectorize(self.time_map.get)(df[self.time_key]), + np.vectorize(self.time_map.get, "I")(df[self.time_key], 0), index=df.index, ) if POSITIONS_AS_DICT: self.positions = {} for k, v in positions.items(): - self.positions[k] = v.astype(int) + try: + self.positions[k] = v.astype(int) + except TypeError: + # possibly some missing values that are not relevant, + # fill with zeros to continue. + self.positions[k] = v.fillna(0).astype(int) else: self.positions = pd.DataFrame(positions).astype(int) @@ -295,8 +312,10 @@ def lookup(self, key, reverse=False): main_key, time_key = key if time_key in self.time_map: if isinstance(x, dict): - x["time_period"] = np.full_like( - x[self.odim], fill_value=self.time_map[time_key] + # np.broadcast_to saves memory over np.full_like, since we + # don't ever write to this array. 
+ x["time_period"] = np.broadcast_to( + self.time_map[time_key], x[self.odim].shape ) else: x = x.assign(time_period=self.time_map[time_key]) @@ -432,7 +451,7 @@ def _use_existing_backing_if_valid(backing, omx_file_paths, skim_tag): def _dedupe_time_periods(network_los_preload): - raw_time_periods = network_los_preload.los_settings["skim_time_periods"]["labels"] + raw_time_periods = network_los_preload.los_settings.skim_time_periods["labels"] # deduplicate time period names time_periods = [] for t in raw_time_periods: @@ -490,22 +509,27 @@ def _apply_digital_encoding(dataset, digital_encodings): return dataset -def _scan_for_unused_names(tokens): +def _scan_for_unused_names(state, tokens): """ Scan all spec files to find unused skim variable names. Parameters ---------- + state : State tokens : Collection[str] Returns ------- Set[str] """ - configs_dir_list = inject.get_injectable("configs_dir") + configs_dir_list = state.filesystem.get_configs_dir() configs_dir_list = ( - [configs_dir_list] if isinstance(configs_dir_list, str) else configs_dir_list + [configs_dir_list] + if isinstance(configs_dir_list, (str, Path)) + else configs_dir_list ) + if isinstance(configs_dir_list, tuple): + configs_dir_list = list(configs_dir_list) assert isinstance(configs_dir_list, list) for directory in configs_dir_list: @@ -524,10 +548,10 @@ def _scan_for_unused_names(tokens): return tokens -def _drop_unused_names(dataset): +def _drop_unused_names(state, dataset): logger.info("scanning for unused skims") tokens = set(dataset.variables.keys()) - set(dataset.coords.keys()) - unused_tokens = _scan_for_unused_names(tokens) + unused_tokens = _scan_for_unused_names(state, tokens) if unused_tokens: baggage = dataset.digital_encoding.baggage(None) unused_tokens -= baggage @@ -571,7 +595,7 @@ def load_sparse_maz_skims( maz2taz_file_name : str maz_to_maz_tables : Collection[] max_blend_distance : optional - data_file_resolver : function + data_file_resolver : function or Callable Returns ------- @@ -580,12 +604,12 @@ def load_sparse_maz_skims( from ..core.los import THREE_ZONE, TWO_ZONE if data_file_resolver is None: - data_file_resolver = config.data_file_path + raise ValueError("missing file resolver") if zone_system in [TWO_ZONE, THREE_ZONE]: - # maz - maz_taz = pd.read_csv(data_file_resolver(maz2taz_file_name, mandatory=True)) + maz_filename = data_file_resolver(maz2taz_file_name, mandatory=True) + maz_taz = read_input_file(maz_filename) maz_taz = maz_taz[["MAZ", "TAZ"]].set_index("MAZ").sort_index() # MAZ alignment is ensured here, so no re-alignment check is @@ -623,8 +647,7 @@ def load_sparse_maz_skims( max_blend_distance = {"DEFAULT": max_blend_distance} for file_name in maz_to_maz_tables: - - df = pd.read_csv(data_file_resolver(file_name, mandatory=True)) + df = read_input_file(data_file_resolver(file_name, mandatory=True)) if remapper is not None: df.OMAZ = df.OMAZ.map(remapper.get) df.DMAZ = df.DMAZ.map(remapper.get) @@ -647,38 +670,39 @@ def load_sparse_maz_skims( return dataset -def load_skim_dataset_to_shared_memory(skim_tag="taz"): +def load_skim_dataset_to_shared_memory(state, skim_tag="taz") -> xr.Dataset: """ Load skims from disk into shared memory. 
Parameters ---------- + state : State skim_tag : str, default "taz" Returns ------- xarray.Dataset """ - from ..core.los import ONE_ZONE + from activitysim.core.los import ONE_ZONE # TODO:SHARROW: taz and maz are the same - network_los_preload = inject.get_injectable("network_los_preload", None) + network_los_preload = state.get_injectable("network_los_preload") if network_los_preload is None: raise ValueError("missing network_los_preload") # find which OMX files are to be used. - omx_file_paths = config.expand_input_file_list( + omx_file_paths = state.filesystem.expand_input_file_list( network_los_preload.omx_file_names(skim_tag), ) zarr_file = network_los_preload.zarr_file_name(skim_tag) - if config.setting("disable_zarr", False): + if state.settings.disable_zarr: # we can disable the zarr optimizations by setting the `disable_zarr` # flag in the master config file to True zarr_file = None if zarr_file is not None: - zarr_file = os.path.join(config.get_cache_dir(), zarr_file) + zarr_file = os.path.join(state.filesystem.get_cache_dir(), zarr_file) max_float_precision = network_los_preload.skim_max_float_precision(skim_tag) @@ -694,30 +718,35 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): ) backing = f"memmap:{mmap_file}" - land_use = inject.get_table("land_use") + land_use = state.get_dataframe("land_use") - if f"_original_{land_use.index.name}" in land_use.to_frame(): - land_use_zone_ids = land_use.to_frame()[f"_original_{land_use.index.name}"] + if f"_original_{land_use.index.name}" in land_use: + land_use_zone_ids = land_use[f"_original_{land_use.index.name}"] remapper = dict(zip(land_use_zone_ids, land_use_zone_ids.index)) else: remapper = None d = _use_existing_backing_if_valid(backing, omx_file_paths, skim_tag) + do_not_save_zarr = False if d is None: time_periods = _dedupe_time_periods(network_los_preload) if zarr_file: logger.info(f"looking for zarr skims at {zarr_file}") if zarr_file and os.path.exists(zarr_file): - # TODO: check if the OMX skims or sparse MAZ are modified more - # recently than the cached ZARR versions; if so do not use - # the ZARR + from .util import latest_file_modification_time + logger.info("found zarr skims, loading them") - d = sh.dataset.from_zarr_with_attr(zarr_file).max_float_precision( - max_float_precision - ) - else: - if zarr_file: + d = sh.dataset.from_zarr_with_attr(zarr_file) + zarr_write_time = d.attrs.get("ZARR_WRITE_TIME", 0) + if zarr_write_time < latest_file_modification_time(omx_file_paths): + logger.warning("zarr skims older than omx, not using them") + do_not_save_zarr = True + d = None + else: + d = d.max_float_precision(max_float_precision) + if d is None: + if zarr_file and not do_not_save_zarr: logger.info("did not find zarr skims, loading omx") d = sh.dataset.from_omx_3d( [openmatrix.open_file(f, mode="r") for f in omx_file_paths], @@ -744,7 +773,9 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): if zarr_digital_encoding: d = _apply_digital_encoding(d, zarr_digital_encoding) logger.info(f"writing zarr skims to {zarr_file}") - d.to_zarr_with_attr(zarr_file) + d.attrs["ZARR_WRITE_TIME"] = time.time() + if not do_not_save_zarr: + d.to_zarr_with_attr(zarr_file) if skim_tag in ("taz", "maz"): # load sparse MAZ skims, if any @@ -764,9 +795,13 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): max_blend_distance=network_los_preload.setting( "maz_to_maz.max_blend_distance", default={} ), + data_file_resolver=partial( + state.filesystem.get_data_file_path, + alternative_suffixes=(".csv.gz", ".parquet"), + ), ) - 
d = _drop_unused_names(d) + d = _drop_unused_names(state, d) # apply non-zarr dependent digital encoding d = _apply_digital_encoding(d, skim_digital_encoding) @@ -817,11 +852,11 @@ def load_skim_dataset_to_shared_memory(skim_tag="taz"): return d.shm.to_shared_memory(backing, mode="r") -@inject.injectable(cache=True) -def skim_dataset(): - return load_skim_dataset_to_shared_memory() +@workflow.cached_object +def skim_dataset(state: workflow.State) -> xr.Dataset: + return load_skim_dataset_to_shared_memory(state) -@inject.injectable(cache=True) -def tap_dataset(): - return load_skim_dataset_to_shared_memory("tap") +@workflow.cached_object +def tap_dataset(state: workflow.State) -> xr.Dataset: + return load_skim_dataset_to_shared_memory(state, "tap") diff --git a/activitysim/core/skim_dict_factory.py b/activitysim/core/skim_dict_factory.py index 450b98d25c..b78401a887 100644 --- a/activitysim/core/skim_dict_factory.py +++ b/activitysim/core/skim_dict_factory.py @@ -1,6 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. # from builtins import int +from __future__ import annotations import logging import multiprocessing @@ -11,7 +12,7 @@ import numpy as np import openmatrix as omx -from activitysim.core import config, inject, skim_dictionary, util +from activitysim.core import skim_dictionary, util logger = logging.getLogger(__name__) @@ -52,7 +53,7 @@ def shape(self): class SkimInfo(object): - def __init__(self, skim_tag, network_los): + def __init__(self, state, skim_tag, network_los): """ skim_tag: str (e.g. 'TAZ') @@ -89,9 +90,9 @@ def __init__(self, skim_tag, network_los): self.block_offsets = None if skim_tag: - self.load_skim_info(skim_tag) + self.load_skim_info(state, skim_tag) - def load_skim_info(self, skim_tag): + def load_skim_info(self, state, skim_tag): """ Read omx files for skim (e.g. 
'TAZ') and build skim_info dict @@ -103,7 +104,7 @@ def load_skim_info(self, skim_tag): omx_file_names = self.network_los.omx_file_names(skim_tag) - self.omx_file_paths = config.expand_input_file_list(omx_file_names) + self.omx_file_paths = state.filesystem.expand_input_file_list(omx_file_names) # ignore any 3D skims not in skim_time_periods # specifically, load all skims except those with key2 not in dim3_tags_to_load @@ -131,7 +132,7 @@ def load_skim_info(self, skim_tag): for skim_name in omx_file.listMatrices(): if skim_name in self.omx_manifest: warnings.warn( - f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file}" + f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file.filename}" ) self.omx_manifest[skim_name] = omx_file_path @@ -263,10 +264,12 @@ def _skim_data_from_buffer(self, skim_info, skim_buffer): assert False, "Not supported" def _memmap_skim_data_path(self, skim_tag): - return os.path.join(config.get_cache_dir(), f"cached_{skim_tag}.mmap") + return os.path.join( + self.network_los.state.filesystem.get_cache_dir(), f"cached_{skim_tag}.mmap" + ) - def load_skim_info(self, skim_tag): - return SkimInfo(skim_tag, self.network_los) + def load_skim_info(self, state, skim_tag): + return SkimInfo(state, skim_tag, self.network_los) def _read_skims_from_omx(self, skim_info, skim_data): """ @@ -496,14 +499,14 @@ def get_skim_data(self, skim_tag, skim_info): Parameters ---------- skim_tag: str - skim_info: string + skim_info: dict Returns ------- SkimData """ - data_buffers = inject.get_injectable("data_buffers", None) + data_buffers = self.network_los.state.get_injectable("data_buffers", None) if data_buffers: # we assume any existing skim buffers will already have skim data loaded into them logger.info( @@ -594,7 +597,9 @@ def get_skim_data(self, skim_tag, skim_info): """ # don't expect legacy shared memory buffers - assert not inject.get_injectable("data_buffers", {}).get(skim_tag) + assert not self.network_los.state.get_injectable("data_buffers", {}).get( + skim_tag + ) skim_cache_path = self._memmap_skim_data_path(skim_tag) if not os.path.isfile(skim_cache_path): diff --git a/activitysim/core/skim_dictionary.py b/activitysim/core/skim_dictionary.py index a1897b5368..e2d621461d 100644 --- a/activitysim/core/skim_dictionary.py +++ b/activitysim/core/skim_dictionary.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import logging from builtins import object, range @@ -7,6 +8,8 @@ import numpy as np import pandas as pd +from activitysim.core import workflow + logger = logging.getLogger(__name__) NOT_IN_SKIM_ZONE_ID = -1 @@ -146,7 +149,7 @@ def map(self, zone_ids): return offsets -class SkimDict(object): +class SkimDict: """ A SkimDict object is a wrapper around a dict of multiple skim objects, where each object is identified by a key. @@ -154,7 +157,7 @@ class SkimDict(object): Note that keys are either strings or tuples of two strings (to support stacking of skims.) 
""" - def __init__(self, skim_tag, skim_info, skim_data): + def __init__(self, state, skim_tag, skim_info, skim_data): logger.info(f"SkimDict init {skim_tag}") @@ -162,8 +165,8 @@ def __init__(self, skim_tag, skim_info, skim_data): self.skim_info = skim_info self.usage = set() # track keys of skims looked up - self.offset_mapper = ( - self._offset_mapper() + self.offset_mapper = self._offset_mapper( + state ) # (in function so subclass can override) self.omx_shape = skim_info.omx_shape @@ -184,7 +187,7 @@ def __init__(self, skim_tag, skim_info, skim_data): f"SkimDict.build_3d_skim_block_offset_table registered {len(self.skim_dim3)} 3d keys" ) - def _offset_mapper(self): + def _offset_mapper(self, state): """ Return an OffsetMapper to set self.offset_mapper for use with skims This allows subclasses (e.g. MazSkimDict) to 'tweak' the parent offset mapper. @@ -613,7 +616,7 @@ class MazSkimDict(SkimDict): to return values of for more distant pairs (or for skims that are not attributes in the maz-maz table.) """ - def __init__(self, skim_tag, network_los, taz_skim_dict): + def __init__(self, state: workflow.State, skim_tag, network_los, taz_skim_dict): """ we need network_los because we have dependencies on network_los.load_data (e.g. maz_to_maz_df, maz_taz_df, and the fallback taz skim_dict) @@ -638,10 +641,10 @@ def __init__(self, skim_tag, network_los, taz_skim_dict): should_recode_based_on_table, ) - if should_recode_based_on_table("land_use_taz"): + if should_recode_based_on_table(state, "land_use_taz"): from .skim_dict_factory import SkimInfo - skim_info = SkimInfo(None, network_los) + skim_info = SkimInfo(state, None, network_los) skim_info.skim_tag = taz_skim_dict.skim_info.skim_tag skim_info.dtype_name = network_los.skim_dtype_name skim_info.omx_manifest = taz_skim_dict.skim_info.omx_manifest @@ -654,12 +657,12 @@ def __init__(self, skim_tag, network_los, taz_skim_dict): skim_info.block_offsets = taz_skim_dict.skim_info.block_offsets skim_info.offset_map = recode_based_on_table( - taz_skim_dict.skim_info.offset_map, "land_use_taz" + state, taz_skim_dict.skim_info.offset_map, "land_use_taz" ) else: skim_info = taz_skim_dict.skim_info - super().__init__(skim_tag, skim_info, taz_skim_dict.skim_data) + super().__init__(state, skim_tag, skim_info, taz_skim_dict.skim_data) assert ( self.offset_mapper is not None ) # should have been set with _init_offset_mapper @@ -671,7 +674,7 @@ def __init__(self, skim_tag, network_los, taz_skim_dict): ) self.sparse_key_usage = set() - def _offset_mapper(self): + def _offset_mapper(self, state): """ return an OffsetMapper to map maz zone_ids to taz skim indexes Specifically, an offset_series with MAZ zone_id index and TAZ skim array offset values @@ -684,13 +687,13 @@ def _offset_mapper(self): """ # use taz offset_mapper to create series mapping directly from MAZ to TAZ skim index - taz_offset_mapper = super()._offset_mapper() - maz_taz = self.network_los.get_maz_to_taz_series + taz_offset_mapper = super()._offset_mapper(state) + maz_taz = self.network_los.get_maz_to_taz_series(state) maz_to_skim_offset = taz_offset_mapper.map(maz_taz) if isinstance(maz_to_skim_offset, np.ndarray): maz_to_skim_offset = pd.Series( - maz_to_skim_offset, self.network_los.get_maz_to_taz_series.index + maz_to_skim_offset, self.network_los.get_maz_to_taz_series(state).index ) # bug # MAZ @@ -731,7 +734,10 @@ def sparse_lookup(self, orig, dest, key): self.sparse_key_usage.add(key) - max_blend_distance = self.network_los.max_blend_distance.get(key, 0) + if 
self.network_los.max_blend_distance is None: + max_blend_distance = 0 + else: + max_blend_distance = self.network_los.max_blend_distance.get(key, 0) if max_blend_distance == 0: blend_distance_skim_name = None diff --git a/activitysim/core/steps/output.py b/activitysim/core/steps/output.py index 8b8a8e3be0..325fd2bbb0 100644 --- a/activitysim/core/steps/output.py +++ b/activitysim/core/steps/output.py @@ -1,18 +1,23 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import sys import numpy as np import pandas as pd +import pyarrow as pa +import pyarrow.csv as csv -from activitysim.core import config, inject, pipeline -from activitysim.core.config import setting +from activitysim.core import configuration, workflow +from activitysim.core.workflow.checkpoint import CHECKPOINT_NAME logger = logging.getLogger(__name__) -def track_skim_usage(output_dir): +@workflow.step +def track_skim_usage(state: workflow.State) -> None: """ write statistics on skim usage (diagnostic to detect loading of un-needed skims) @@ -28,11 +33,12 @@ def track_skim_usage(output_dir): pd.options.display.max_columns = 500 pd.options.display.max_rows = 100 - skim_dict = inject.get_injectable("skim_dict") + skim_dict = state.get_injectable("skim_dict") mode = "wb" if sys.version_info < (3,) else "w" - with open(config.output_file_path("skim_usage.txt"), mode) as output_file: - + with open( + state.filesystem.get_output_file_path("skim_usage.txt"), mode + ) as output_file: print("\n### skim_dict usage", file=output_file) for key in skim_dict.get_skim_usage(): print(key, file=output_file) @@ -52,7 +58,7 @@ def track_skim_usage(output_dir): print(key, file=output_file) -def previous_write_data_dictionary(output_dir): +def previous_write_data_dictionary(state: workflow.State, output_dir): """ Write table_name, number of rows, columns, and bytes for each checkpointed table @@ -62,31 +68,31 @@ def previous_write_data_dictionary(output_dir): """ - model_settings = config.read_model_settings("write_data_dictionary") + model_settings = state.filesystem.read_model_settings("write_data_dictionary") txt_format = model_settings.get("txt_format", "data_dict.txt") csv_format = model_settings.get("csv_format", "data_dict.csv") if txt_format: - - output_file_path = config.output_file_path(txt_format) + output_file_path = state.get_output_file_path(txt_format) pd.options.display.max_columns = 500 pd.options.display.max_rows = 100 - output_tables = pipeline.checkpointed_tables() + output_tables = state.checkpoint.list_tables() # write data dictionary for all checkpointed_tables with open(output_file_path, "w") as output_file: for table_name in output_tables: - df = inject.get_table(table_name, None).to_frame() + df = state.get_dataframe(table_name) print("\n### %s %s" % (table_name, df.shape), file=output_file) print("index:", df.index.name, df.index.dtype, file=output_file) print(df.dtypes, file=output_file) -def write_data_dictionary(output_dir): +@workflow.step +def write_data_dictionary(state: workflow.State) -> None: """ Write table schema for all tables @@ -107,7 +113,7 @@ def write_data_dictionary(output_dir): """ - model_settings = config.read_model_settings("write_data_dictionary") + model_settings = state.filesystem.read_model_settings("write_data_dictionary") txt_format = model_settings.get("txt_format", "data_dict.txt") csv_format = model_settings.get("csv_format", "data_dict.csv") @@ -117,7 +123,7 @@ def write_data_dictionary(output_dir): ) return - table_names = 
pipeline.registered_tables() + table_names = state.registered_tables() # use table_names list from model_settings, if provided schema_tables = model_settings.get("tables", None) @@ -129,7 +135,7 @@ def write_data_dictionary(output_dir): final_shapes = dict() for table_name in table_names: try: - df = pipeline.get_table(table_name) + df = state.get_dataframe(table_name) except RuntimeError as run_err: if run_err.args and "dropped" in run_err.args[0]: # if a checkpointed table was dropped, that's not ideal, so we should @@ -138,6 +144,8 @@ def write_data_dictionary(output_dir): # note actually emitting a warnings.warn instead of a logger message will # unfortunately cause some of our excessively strict tests to fail continue + else: + raise final_shapes[table_name] = df.shape @@ -155,42 +163,42 @@ def write_data_dictionary(output_dir): schema[table_name] = info # annotate schema.info with name of checkpoint columns were first seen - for _, row in pipeline.get_checkpoints().iterrows(): + if state.checkpoint.store: + for _, row in state.checkpoint.get_inventory().iterrows(): + checkpoint_name = row[CHECKPOINT_NAME] - checkpoint_name = row[pipeline.CHECKPOINT_NAME] - - for table_name in table_names: - - # no change to table in this checkpoint - if row.get(table_name, None) != checkpoint_name: - continue + for table_name in table_names: + # no change to table in this checkpoint + if row.get(table_name, None) != checkpoint_name: + continue - # get the checkpointed version of the table - df = pipeline.get_table(table_name, checkpoint_name) + # get the checkpointed version of the table + df = state.checkpoint.load_dataframe(table_name, checkpoint_name) - if df.index.name and df.index.name not in df.columns: - df = df.reset_index() + if df.index.name and df.index.name not in df.columns: + df = df.reset_index() - info = schema.get(table_name, None) + info = schema.get(table_name, None) - if info is not None: - # tag any new columns with checkpoint name - prev_columns = info[info.checkpoint != ""].column_name.values - new_cols = [c for c in df.columns.values if c not in prev_columns] - is_new_column_this_checkpoont = info.column_name.isin(new_cols) - info.checkpoint = np.where( - is_new_column_this_checkpoont, checkpoint_name, info.checkpoint - ) - schema[table_name] = info + if info is not None: + # tag any new columns with checkpoint name + prev_columns = info[info.checkpoint != ""].column_name.values + new_cols = [c for c in df.columns.values if c not in prev_columns] + is_new_column_this_checkpoont = info.column_name.isin(new_cols) + info.checkpoint = np.where( + is_new_column_this_checkpoont, checkpoint_name, info.checkpoint + ) + schema[table_name] = info schema_df = pd.concat(schema.values()) if csv_format: - schema_df.to_csv(config.output_file_path(csv_format), header=True, index=False) + schema_df.to_csv( + state.get_output_file_path(csv_format), header=True, index=False + ) if txt_format: - with open(config.output_file_path(txt_format), "w") as output_file: - + with open(state.get_output_file_path(txt_format), "w") as output_file: # get max schema column widths from omnibus table col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df} @@ -215,7 +223,8 @@ def write_data_dictionary(output_dir): print(f"{info}\n", file=output_file) -def write_tables(output_dir): +@workflow.step +def write_tables(state: workflow.State) -> None: """ Write pipeline tables as csv files (in output directory) as specified by output_tables list in settings file. 
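Editor's note before the next hunk: write_tables is now driven by a typed `state.settings.output_tables` object rather than a raw settings dict. The sketch below is illustrative only, assuming the pydantic Settings model accepts a block of this shape (which the `isinstance(..., configuration.OutputTable)` branch in the next hunk implies); table names and decode mappings are made up, and the parse pattern mirrors the `Settings.parse_obj` usage in the test changes later in this diff.

```python
import yaml

from activitysim.core import configuration

# hypothetical output_tables settings block (names are illustrative)
settings_yaml = """
output_tables:
  action: include
  prefix: final_
  sort: True
  tables:
    - checkpoints
    - households
    - tablename: trips
      decode_columns:
        origin: land_use.zone_id
        destination: land_use.zone_id
"""

settings = configuration.Settings.parse_obj(yaml.safe_load(settings_yaml))
out = settings.output_tables
print(out.action, out.prefix, out.sort)  # attribute access replaces dict .get()
for t in out.tables:
    if isinstance(t, configuration.OutputTable):
        print(t.tablename, t.decode_columns)  # structured entry
    else:
        print(t)  # plain table name
```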
@@ -258,76 +267,80 @@ def write_tables(output_dir): """ - output_tables_settings_name = "output_tables" - - output_tables_settings = setting(output_tables_settings_name) + output_tables_settings = state.settings.output_tables if output_tables_settings is None: logger.info("No output_tables specified in settings file. Nothing to write.") return - action = output_tables_settings.get("action") - tables = output_tables_settings.get("tables") - prefix = output_tables_settings.get("prefix", "final_") - h5_store = output_tables_settings.get("h5_store", False) - sort = output_tables_settings.get("sort", False) + action = output_tables_settings.action + tables = output_tables_settings.tables + prefix = output_tables_settings.prefix + h5_store = output_tables_settings.h5_store + sort = output_tables_settings.sort - registered_tables = pipeline.registered_tables() + registered_tables = state.registered_tables() if action == "include": # interpret empty or missing tables setting to mean include all registered tables output_tables_list = tables if tables is not None else registered_tables elif action == "skip": output_tables_list = [t for t in registered_tables if t not in tables] else: - raise "expected %s action '%s' to be either 'include' or 'skip'" % ( - output_tables_settings_name, - action, - ) + raise ValueError(f"expected action '{action}' to be either 'include' or 'skip'") for table_name in output_tables_list: - - if not isinstance(table_name, str): + if isinstance(table_name, configuration.OutputTable): + table_decode_cols = table_name.decode_columns or {} + table_name = table_name.tablename + elif not isinstance(table_name, str): table_decode_cols = table_name.get("decode_columns", {}) table_name = table_name["tablename"] else: table_decode_cols = {} if table_name == "checkpoints": - df = pipeline.get_checkpoints() + dt = pa.Table.from_pandas( + state.checkpoint.get_inventory(), preserve_index=True + ) else: if table_name not in registered_tables: logger.warning("Skipping '%s': Table not found." % table_name) continue - df = pipeline.get_table(table_name) + + # the write tables method now uses pyarrow to avoid making edits to + # the internal pipeline dataframes, which need to remain un-decoded + # for any subsequent summarize step[s]. 
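The comment above is the key design point of this hunk: output is produced from an Arrow copy so the in-memory pipeline dataframes are never mutated. A minimal self-contained sketch of that pattern follows; the data and file name are made up for illustration.

```python
import pandas as pd
import pyarrow as pa
import pyarrow.csv as csv

# stand-in for a checkpointed pipeline table (hypothetical data)
df = pd.DataFrame(
    {"zone_id": [3, 1, 2]},
    index=pd.Index([12, 10, 11], name="household_id"),
)

# work on an arrow copy so the pipeline's pandas table is left untouched
dt = pa.Table.from_pandas(df, preserve_index=True)
dt = dt.sort_by("household_id")  # instead of df.sort_index()
csv.write_csv(dt, "final_households.csv")

assert list(df.index) == [12, 10, 11]  # original frame is unchanged
```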
+ dt = state.get_pyarrow(table_name) + dt_index_name = state.get_dataframe_index_name(table_name) if sort: - traceable_table_indexes = inject.get_injectable( - "traceable_table_indexes", {} - ) + traceable_table_indexes = state.tracing.traceable_table_indexes - if df.index.name in traceable_table_indexes: - df = df.sort_index() + if dt_index_name in traceable_table_indexes: + dt = dt.sort_by(dt_index_name) logger.debug( - f"write_tables sorting {table_name} on index {df.index.name}" + f"write_tables sorting {table_name} on index {dt_index_name}" ) else: # find all registered columns we can use to sort this table # (they are ordered appropriately in traceable_table_indexes) sort_columns = [ - c for c in traceable_table_indexes if c in df.columns + (c, "ascending") + for c in traceable_table_indexes + if c in dt.columns ] if len(sort_columns) > 0: - df = df.sort_values(by=sort_columns) + dt = dt.sort_by(sort_columns) logger.debug( f"write_tables sorting {table_name} on columns {sort_columns}" ) else: logger.debug( - f"write_tables sorting {table_name} on unrecognized index {df.index.name}" + f"write_tables sorting {table_name} on unrecognized index {dt_index_name}" ) - df = df.sort_index() + dt = dt.sort_by(dt_index_name) - if config.setting("recode_pipeline_columns", True): + if state.settings.recode_pipeline_columns: for colname, decode_instruction in table_decode_cols.items(): if "|" in decode_instruction: decode_filter, decode_instruction = decode_instruction.split("|") @@ -338,14 +351,14 @@ def write_tables(output_dir): if "." not in decode_instruction: lookup_col = decode_instruction source_table = table_name - parent_table = df + parent_table = dt else: source_table, lookup_col = decode_instruction.split(".") - parent_table = inject.get_table(source_table) + parent_table = state.get_pyarrow(source_table) try: - map_col = parent_table[f"_original_{lookup_col}"] + map_col = parent_table.column(f"_original_{lookup_col}") except KeyError: - map_col = parent_table[lookup_col] + map_col = parent_table.column(lookup_col) map_col = np.asarray(map_col) map_func = map_col.__getitem__ if decode_filter: @@ -356,24 +369,28 @@ def map_func(x): else: raise ValueError(f"unknown decode_filter {decode_filter}") - if colname in df.columns: - df[colname] = df[colname].astype(int).map(map_func) - elif colname == df.index.name: - df.index = df.index.astype(int).map(map_func) + if colname in dt.column_names: + revised_col = ( + pd.Series(dt.column(colname)).astype(int).map(map_func) + ) + dt = dt.drop([colname]).append_column( + colname, pa.array(revised_col) + ) # drop _original_x from table if it is duplicative - if source_table == table_name and f"_original_{lookup_col}" in df: - df = df.drop(columns=[f"_original_{lookup_col}"]) + if ( + source_table == table_name + and f"_original_{lookup_col}" in dt.column_names + ): + dt = dt.drop([f"_original_{lookup_col}"]) if h5_store: - file_path = config.output_file_path("%soutput_tables.h5" % prefix) - df.to_hdf(file_path, key=table_name, mode="a", format="fixed") + file_path = state.get_output_file_path("%soutput_tables.h5" % prefix) + dt.to_pandas().to_hdf( + str(file_path), key=table_name, mode="a", format="fixed" + ) else: - file_name = "%s%s.csv" % (prefix, table_name) - file_path = config.output_file_path(file_name) + file_name = f"{prefix}{table_name}.csv" + file_path = state.get_output_file_path(file_name) # include the index if it has a name or is a MultiIndex - write_index = df.index.name is not None or isinstance( - df.index, pd.MultiIndex - ) - - 
df.to_csv(file_path, index=write_index) + csv.write_csv(dt, file_path) diff --git a/activitysim/core/test/__init__.py b/activitysim/core/test/__init__.py index 61e7caf526..9b942d5d21 100644 --- a/activitysim/core/test/__init__.py +++ b/activitysim/core/test/__init__.py @@ -1,2 +1,17 @@ # ActivitySim # See full license in LICENSE.txt. + +from __future__ import annotations + +try: + import pytest +except ImportError: + pass +else: + pytest.register_assert_rewrite("activitysim.core.test._tools") + +from activitysim.core.test._tools import ( # isort: skip + assert_equal, + assert_frame_substantively_equal, + run_if_exists, +) diff --git a/activitysim/core/test/_tools.py b/activitysim/core/test/_tools.py new file mode 100644 index 0000000000..930f25d404 --- /dev/null +++ b/activitysim/core/test/_tools.py @@ -0,0 +1,141 @@ +# ActivitySim +# See full license in LICENSE.txt. +from __future__ import annotations + +import traceback +from pathlib import Path + +import pandas as pd + + +def run_if_exists(filename): + import pytest + + stack = traceback.extract_stack() + base_dir = Path(stack[-2].filename).parent + target_file = base_dir.joinpath(filename) + + return pytest.mark.skipif( + not target_file.exists(), reason=f"required file {filename} is missing" + ) + + +def assert_frame_substantively_equal( + left, + right, + *args, + ignore_column_order=True, + ignore_extra_columns_left=False, + check_column_type_loosely=False, + **kwargs, +): + """ + Check that left and right DataFrame are substantively equal. + + This method generalizes the usual pandas DataFrame test, by allowing + the ordering of columns to be different, and allowing the left dataframe to + have extra columns (e.g. as might happen if more reporting or debugging + data is output into a dataframe, but we want to make sure that the "core" + expected things are all there and correct. + + Parameters + ---------- + left, right : pd.DataFrame + *args + Forwarded to pandas.testing.assert_frame_equal + ignore_column_order : bool, default True + Keyword only argument. + ignore_extra_columns_left : bool, default False + This cannot be True unless `ignore_column_order` is also True + check_column_type_loosely : bool, default False + Check that the dtype kind matches, not the dtype itself, for example + if one column is int32 and the other is int64 that is ok. + check_dtype : bool, default True + Whether to check the DataFrame dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool or {'equiv'}, default 'equiv' + Whether to check the columns class, dtype and inferred_type + are identical. Is passed as the ``exact`` argument of + :func:`assert_index_equal`. + check_frame_type : bool, default True + Whether to check the DataFrame class is identical. + check_names : bool, default True + Whether to check that the `names` attribute for both the `index` + and `column` attributes of the DataFrame is identical. + by_blocks : bool, default False + Specify how to compare internal data. If False, compare by columns. + If True, compare by blocks. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_like : bool, default False + If True, ignore the order of index & columns. 
+ Note: index labels must match their respective rows + (same as in columns) - same labels must be with the same data. + check_freq : bool, default True + Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + obj : str, default 'DataFrame' + Specify object name being compared, internally used to show appropriate + assertion message. + + **kwargs + Forwarded to pandas.testing.assert_frame_equal + """ + __tracebackhide__ = True # don't show this code in pytest outputs + + if ignore_extra_columns_left: + assert ignore_column_order + assert set(right.columns).issubset(left.columns) + left = left[right.columns] + + elif ignore_column_order: + # column order may not match, so fix it before checking + assert sorted(left.columns) == sorted(right.columns) + # if there are duplicate column names, we disavow this option + if not left.columns.has_duplicates: + left = left[right.columns] + + if check_column_type_loosely: + left_kinds = {k: i.kind for k, i in left.dtypes.items()} + right_kinds = {k: i.kind for k, i in right.dtypes.items()} + assert left_kinds == right_kinds + kwargs["check_column_type"] = False + + try: + pd.testing.assert_frame_equal(left, right, *args, **kwargs) + except Exception as err: + print(err) + raise + + +def assert_equal(x, y): + __tracebackhide__ = True # don't show this code in pytest outputs + try: + import pytest + except ImportError: + assert x == y + else: + if isinstance(x, list) and isinstance(y, list) and len(x) == len(y): + for n_, (x_, y_) in enumerate(zip(x, y)): + assert x_ == pytest.approx(y_), f"error at index {n_}" + elif isinstance(x, dict) and isinstance(y, dict) and x.keys() == y.keys(): + for n_ in x.keys(): + assert x[n_] == pytest.approx(y[n_]), f"error at key {n_}" + else: + try: + assert x == pytest.approx(y) + except (TypeError, AssertionError): + # pytest.approx() does not support nested data structures + for x_, y_ in zip(x, y): + assert x_ == pytest.approx(y_) diff --git a/activitysim/core/test/configs/custom_logging.yaml b/activitysim/core/test/configs/custom_logging.yaml index 71f1e6d4f1..e3fb9852d7 100644 --- a/activitysim/core/test/configs/custom_logging.yaml +++ b/activitysim/core/test/configs/custom_logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['xasim.log'] + filename: + get_log_file_path: 'xasim.log' mode: w formatter: simpleFormatter level: NOTSET @@ -51,4 +52,3 @@ logging: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/core/test/configs/logging.yaml b/activitysim/core/test/configs/logging.yaml index 35067d008c..6d02e2ed34 100644 --- a/activitysim/core/test/configs/logging.yaml +++ b/activitysim/core/test/configs/logging.yaml @@ -28,7 +28,7 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: activitysim.log mode: w formatter: simpleFormatter level: NOTSET @@ -51,4 +51,3 @@ logging: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/core/test/extensions/steps.py 
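Aside on the new test helper defined above: it passes in cases where the strict pandas check would fail on column order or extra reporting columns. A quick illustration with made-up values:

```python
import pandas as pd

from activitysim.core.test import assert_frame_substantively_equal

# column order differs and `left` carries an extra debug column; the strict
# pandas assert would reject this, but the substantive check tolerates it
left = pd.DataFrame({"b": [4, 5], "a": [1, 2], "debug_util": [0.1, 0.2]})
right = pd.DataFrame({"a": [1, 2], "b": [4, 5]})

assert_frame_substantively_equal(left, right, ignore_extra_columns_left=True)
```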
b/activitysim/core/test/extensions/steps.py index baa894c692..6772853168 100644 --- a/activitysim/core/test/extensions/steps.py +++ b/activitysim/core/test/extensions/steps.py @@ -1,64 +1,66 @@ +from __future__ import annotations + import pandas as pd -from activitysim.core import inject, pipeline, tracing +from activitysim.core import workflow -@inject.step() -def step1(): +@workflow.step +def step1(state: workflow.State) -> None: table1 = pd.DataFrame({"c": [1, 2, 3]}) - inject.add_table("table1", table1) + state.add_table("table1", table1) -@inject.step() -def step2(): +@workflow.step +def step2(state: workflow.State) -> None: table1 = pd.DataFrame({"c": [2, 4, 6]}) - inject.add_table("table2", table1) + state.add_table("table2", table1) -@inject.step() -def step3(): +@workflow.step +def step3(state: workflow.State) -> None: table1 = pd.DataFrame({"c": [3, 6, 9]}) - inject.add_table("table3", table1) + state.add_table("table3", table1) -@inject.step() -def step_add_col(): +@workflow.step +def step_add_col(state: workflow.State) -> None: - table_name = inject.get_step_arg("table_name") + table_name = state.get_step_arg("table_name") assert table_name is not None - col_name = inject.get_step_arg("column_name") + col_name = state.get_step_arg("column_name") assert col_name is not None - table = pipeline.get_table(table_name) + table = state.get_dataframe(table_name) assert col_name not in table.columns table[col_name] = table.index + (1000 * len(table.columns)) - pipeline.replace_table(table_name, table) + state.add_table(table_name, table) -@inject.step() -def step_forget_tab(): +@workflow.step +def step_forget_tab(state: workflow.State) -> None: - table_name = inject.get_step_arg("table_name") + table_name = state.get_step_arg("table_name") assert table_name is not None - table = pipeline.get_table(table_name) + table = state.get_dataframe(table_name) - pipeline.drop_table(table_name) + state.drop_table(table_name) -@inject.step() -def create_households(trace_hh_id): +@workflow.step +def create_households(state: workflow.State) -> None: df = pd.DataFrame({"household_id": [1, 2, 3], "home_zone_id": {100, 100, 101}}) - inject.add_table("households", df) + state.add_table("households", df) - pipeline.get_rn_generator().add_channel("households", df) + state.get_rn_generator().add_channel("households", df) - tracing.register_traceable_table("households", df) + state.tracing.register_traceable_table("households", df) diff --git a/activitysim/core/test/test_assign.py b/activitysim/core/test/test_assign.py index 3818711067..e018ec1d50 100644 --- a/activitysim/core/test/test_assign.py +++ b/activitysim/core/test/test_assign.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import logging.config import os.path @@ -8,12 +10,7 @@ import pandas as pd import pytest -from .. 
import assign, config, inject, tracing - - -def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) +from activitysim.core import assign, workflow def close_handlers(): @@ -26,9 +23,11 @@ def close_handlers(): logger.setLevel(logging.NOTSET) -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +@pytest.fixture +def state() -> workflow.State: + state = workflow.State() + state.initialize_filesystem(working_dir=os.path.dirname(__file__)) + return state @pytest.fixture(scope="module") @@ -51,22 +50,27 @@ def data(data_name): return pd.read_csv(data_name) -def test_read_model_spec(): - spec = assign.read_assignment_spec(config.config_file_path("assignment_spec.csv")) +def test_read_model_spec(state: workflow.State): + spec = assign.read_assignment_spec( + state.filesystem.get_config_file_path("assignment_spec.csv") + ) assert len(spec) == 8 assert list(spec.columns) == ["description", "target", "expression"] -def test_assign_variables(capsys, data): +def test_assign_variables(state: workflow.State, capsys, data): + state.default_settings() - spec = assign.read_assignment_spec(config.config_file_path("assignment_spec.csv")) + spec = assign.read_assignment_spec( + state.filesystem.get_config_file_path("assignment_spec.csv") + ) locals_d = {"CONSTANT": 7, "_shadow": 99} results, trace_results, trace_assigned_locals = assign.assign_variables( - spec, data, locals_d, trace_rows=None + state, spec, data, locals_d, trace_rows=None ) print(results) @@ -81,7 +85,7 @@ def test_assign_variables(capsys, data): trace_rows = [False, True, False] results, trace_results, trace_assigned_locals = assign.assign_variables( - spec, data, locals_d, trace_rows=trace_rows + state, spec, data, locals_d, trace_rows=trace_rows ) # should get same results as before @@ -108,10 +112,11 @@ def test_assign_variables(capsys, data): out, err = capsys.readouterr() -def test_assign_variables_aliased(capsys, data): +def test_assign_variables_aliased(state: workflow.State, capsys, data): + state.default_settings() spec = assign.read_assignment_spec( - config.config_file_path("assignment_spec_alias_df.csv") + state.filesystem.get_config_file_path("assignment_spec_alias_df.csv") ) locals_d = {"CONSTANT": 7, "_shadow": 99} @@ -119,7 +124,7 @@ def test_assign_variables_aliased(capsys, data): trace_rows = [False, True, False] results, trace_results, trace_assigned_locals = assign.assign_variables( - spec, data, locals_d, df_alias="aliased_df", trace_rows=trace_rows + state, spec, data, locals_d, df_alias="aliased_df", trace_rows=trace_rows ) print(results) @@ -146,17 +151,18 @@ def test_assign_variables_aliased(capsys, data): out, err = capsys.readouterr() -def test_assign_variables_failing(capsys, data): +def test_assign_variables_failing(state: workflow.State, capsys, data): + state.default_settings() close_handlers() output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) + state.filesystem.output_dir = output_dir - tracing.config_logger(basic=True) + state.logging.config_logger(basic=True) spec = assign.read_assignment_spec( - config.config_file_path("assignment_spec_failing.csv") + state.filesystem.get_config_file_path("assignment_spec_failing.csv") ) locals_d = { @@ -166,8 +172,8 @@ def test_assign_variables_failing(capsys, data): } with pytest.raises(NameError) as excinfo: - results, trace_results = assign.assign_variables( - spec, data, locals_d, 
trace_rows=None + results, trace_results, trace_assigned_locals = assign.assign_variables( + state, spec, data, locals_d, trace_rows=None ) out, err = capsys.readouterr() diff --git a/activitysim/core/test/test_inject_defaults.py b/activitysim/core/test/test_inject_defaults.py index 82e106984d..399756ae00 100644 --- a/activitysim/core/test/test_inject_defaults.py +++ b/activitysim/core/test/test_inject_defaults.py @@ -1,40 +1,40 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import os +from pathlib import Path import pytest +from pydantic import ValidationError # Note that the following import statement has the side-effect of registering injectables: -from .. import config, inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import workflow +from activitysim.core.configuration import Settings +from activitysim.core.exceptions import StateAccessError def test_defaults(): - inject.clear_cache() - - with pytest.raises(RuntimeError) as excinfo: - inject.get_injectable("configs_dir") - assert "directory does not exist" in str(excinfo.value) + state = workflow.State() + with pytest.raises(ValidationError): + state.initialize_filesystem(working_dir=Path(__file__).parents[1]) - with pytest.raises(RuntimeError) as excinfo: - inject.get_injectable("data_dir") - assert "directory does not exist" in str(excinfo.value) + work_dir = Path(__file__).parents[0] + state.initialize_filesystem(working_dir=work_dir) - with pytest.raises(RuntimeError) as excinfo: - output_dir = inject.get_injectable("output_dir") - print("output_dir", output_dir) - assert "directory does not exist" in str(excinfo.value) + assert state.filesystem.get_configs_dir() == (work_dir.joinpath("configs"),) + assert state.filesystem.get_data_dir() == (work_dir.joinpath("data"),) + assert state.filesystem.get_output_dir() == work_dir.joinpath("output") configs_dir = os.path.join(os.path.dirname(__file__), "configs_test_defaults") - inject.add_injectable("configs_dir", configs_dir) + with pytest.raises(ValidationError): + # can't write one path to configs_dir, must be a tuple + state.filesystem.configs_dir = Path(configs_dir) + state.filesystem.configs_dir = (Path(configs_dir),) - settings = inject.get_injectable("settings") - assert isinstance(settings, dict) + with pytest.raises(StateAccessError): + settings = state.settings - data_dir = os.path.join(os.path.dirname(__file__), "data") - inject.add_injectable("data_dir", data_dir) + state.load_settings() + assert isinstance(state.settings, Settings) diff --git a/activitysim/core/test/test_input.py b/activitysim/core/test/test_input.py index f695020b63..d0cfc24e24 100644 --- a/activitysim/core/test/test_input.py +++ b/activitysim/core/test/test_input.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. 
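The same migration pattern recurs throughout these test hunks: build an explicit workflow.State, point it at directories, then load settings, instead of registering global injectables. A condensed sketch of that pattern is below; the directory layout is hypothetical, and the calls shown are the ones used in the surrounding hunks.

```python
import os

from activitysim.core import workflow

base = os.path.dirname(__file__)

state = workflow.State()
state.initialize_filesystem(
    configs_dir=(os.path.join(base, "configs"),),  # tuples: multiple dirs allowed
    data_dir=(os.path.join(base, "data"),),
    output_dir=os.path.join(base, "output"),
)
state.load_settings()          # reads settings.yaml into state.settings
state.logging.config_logger()  # replaces tracing.config_logger()
```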
+from __future__ import annotations + import os import pandas as pd @@ -7,7 +9,7 @@ import yaml # Note that the following import statement has the side-effect of registering injectables: -from activitysim.core import config, inject, input +from activitysim.core import configuration, input, workflow @pytest.fixture(scope="module") @@ -21,21 +23,23 @@ def seed_households(): @pytest.fixture(scope="module") -def data_dir(): +def state(): configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) data_dir = os.path.join(os.path.dirname(__file__), "temp_data") if not os.path.exists(data_dir): os.mkdir(data_dir) - inject.add_injectable("data_dir", data_dir) + state = workflow.State().initialize_filesystem( + configs_dir=(configs_dir,), + output_dir=output_dir, + data_dir=(data_dir,), + ) - yield data_dir + yield state for file in os.listdir(data_dir): os.remove(os.path.join(data_dir, file)) @@ -43,17 +47,17 @@ def data_dir(): os.rmdir(data_dir) -def test_missing_table_list(data_dir): +def test_missing_table_list(state): - settings = inject.get_injectable("settings") - assert isinstance(settings, dict) + state.load_settings() + assert isinstance(state.settings, configuration.Settings) with pytest.raises(AssertionError) as excinfo: - input.read_input_table("households") + input.read_input_table(state, "households") assert "no input_table_list found" in str(excinfo.value) -def test_csv_reader(seed_households, data_dir): +def test_csv_reader(seed_households, state): settings_yaml = """ input_table_list: @@ -65,19 +69,20 @@ def test_csv_reader(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable("settings", settings) + settings = configuration.Settings.parse_obj(settings) + state.settings = settings - hh_file = os.path.join(data_dir, "households.csv") + hh_file = state.filesystem.get_data_dir()[0].joinpath("households.csv") seed_households.to_csv(hh_file, index=False) assert os.path.isfile(hh_file) - df = input.read_input_table("households") + df = input.read_input_table(state, "households") assert df.index.name == "household_id" -def test_hdf_reader1(seed_households, data_dir): +def test_hdf_reader1(seed_households, state): settings_yaml = """ input_table_list: @@ -89,19 +94,20 @@ def test_hdf_reader1(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable("settings", settings) + settings = configuration.Settings.parse_obj(settings) + state.settings = settings - hh_file = os.path.join(data_dir, "households.h5") + hh_file = state.filesystem.get_data_dir()[0].joinpath("households.h5") seed_households.to_hdf(hh_file, key="households", mode="w") assert os.path.isfile(hh_file) - df = input.read_input_table("households") + df = input.read_input_table(state, "households") assert df.index.name == "household_id" -def test_hdf_reader2(seed_households, data_dir): +def test_hdf_reader2(seed_households, state): settings_yaml = """ input_table_list: @@ -114,19 +120,20 @@ def test_hdf_reader2(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable("settings", settings) + settings = configuration.Settings.parse_obj(settings) + state.settings = settings - hh_file = os.path.join(data_dir, "households.h5") + hh_file = 
state.filesystem.get_data_dir()[0].joinpath("households.h5") seed_households.to_hdf(hh_file, key="seed_households", mode="w") assert os.path.isfile(hh_file) - df = input.read_input_table("households") + df = input.read_input_table(state, "households") assert df.index.name == "household_id" -def test_hdf_reader3(seed_households, data_dir): +def test_hdf_reader3(seed_households, state): settings_yaml = """ input_store: input_data.h5 @@ -138,19 +145,20 @@ def test_hdf_reader3(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable("settings", settings) + settings = configuration.Settings.parse_obj(settings) + state.settings = settings - hh_file = os.path.join(data_dir, "input_data.h5") + hh_file = state.filesystem.get_data_dir()[0].joinpath("input_data.h5") seed_households.to_hdf(hh_file, key="households", mode="w") assert os.path.isfile(hh_file) - df = input.read_input_table("households") + df = input.read_input_table(state, "households") assert df.index.name == "household_id" -def test_missing_filename(seed_households, data_dir): +def test_missing_filename(seed_households, state): settings_yaml = """ input_table_list: @@ -161,14 +169,15 @@ def test_missing_filename(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable("settings", settings) + settings = configuration.Settings.parse_obj(settings) + state.settings = settings with pytest.raises(AssertionError) as excinfo: - input.read_input_table("households") + input.read_input_table(state, "households") assert "no input file provided" in str(excinfo.value) -def test_create_input_store(seed_households, data_dir): +def test_create_input_store(seed_households, state): settings_yaml = """ create_input_store: True @@ -182,19 +191,23 @@ def test_create_input_store(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable("settings", settings) + settings = configuration.Settings.parse_obj(settings) + state.settings = settings - hh_file = os.path.join(data_dir, "households.csv") + hh_file = state.filesystem.get_data_dir()[0].joinpath("households.csv") seed_households.to_csv(hh_file, index=False) assert os.path.isfile(hh_file) - df = input.read_input_table("households") - - assert df.index.name == "household_id" - - output_store = os.path.join(inject.get_injectable("output_dir"), "input_data.h5") - assert os.path.exists(output_store) - - store_df = pd.read_hdf(output_store, "seed_households") - assert store_df.equals(seed_households) + with pytest.raises(NotImplementedError): + df = input.read_input_table(state, "households") + + # TODO if create_input_store is ever implemented + # + # assert df.index.name == "household_id" + # + # output_store = os.path.join(inject.get_injectable("output_dir"), "input_data.h5") + # assert os.path.exists(output_store) + # + # store_df = pd.read_hdf(output_store, "seed_households") + # assert store_df.equals(seed_households) diff --git a/activitysim/core/test/test_logging.py b/activitysim/core/test/test_logging.py new file mode 100644 index 0000000000..b99e808963 --- /dev/null +++ b/activitysim/core/test/test_logging.py @@ -0,0 +1,157 @@ +# ActivitySim +# See full license in LICENSE.txt. 
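The logging-config changes earlier in this diff, and the "unsecure" case in the new test_logging.py below, hinge on the fact that the old `!!python/object/apply:` filename form requires an unsafe YAML loader. A two-line check of that behavior, assuming PyYAML's SafeLoader is in use (the callable tag here is an arbitrary example):

```python
import yaml

doc = "filename: !!python/object/apply:os.getcwd []"
try:
    yaml.safe_load(doc)  # SafeLoader refuses python/object tags outright
except yaml.constructor.ConstructorError as err:
    print("rejected by SafeLoader:", err)
```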
+from __future__ import annotations + +import logging +import textwrap + +import pytest +import yaml + +from activitysim.core import workflow + + +def close_handlers(): + loggers = logging.Logger.manager.loggerDict + for name in loggers: + logger = logging.getLogger(name) + logger.handlers = [] + logger.propagate = True + logger.setLevel(logging.NOTSET) + + +logging_config_content = { + "simple": """ + --- + logging: + version: 1 + disable_existing_loggers: true + loggers: + activitysim: + level: DEBUG + handlers: [logfile, console] + propagate: false + handlers: + logfile: + class: logging.FileHandler + filename: activitysim.log + mode: w + formatter: simpleFormatter + level: NOTSET + console: + class: logging.StreamHandler + stream: ext://sys.stdout + formatter: simpleFormatter + level: WARNING + formatters: + simpleFormatter: + class: logging.Formatter + format: '%(levelname)s - %(name)s - %(message)s' + datefmt: '%d/%m/%Y %H:%M:%S' + ... + """, + "functional": """ + --- + logging: + version: 1 + disable_existing_loggers: true + loggers: + activitysim: + level: DEBUG + handlers: [logfile, console] + propagate: false + handlers: + logfile: + class: logging.FileHandler + filename: + get_log_file_path: 'activitysim_from_func.log' + mode: w + formatter: simpleFormatter + level: NOTSET + console: + class: logging.StreamHandler + stream: ext://sys.stdout + formatter: simpleFormatter + level: WARNING + formatters: + simpleFormatter: + class: logging.Formatter + format: '%(levelname)s - %(name)s - %(message)s' + datefmt: '%d/%m/%Y %H:%M:%S' + ... + """, + "unsecure": """ + --- + logging: + version: 1 + disable_existing_loggers: true + loggers: + activitysim: + level: DEBUG + handlers: [logfile, console] + propagate: false + handlers: + logfile: + class: logging.FileHandler + filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim_unsecure.log'] + mode: w + formatter: simpleFormatter + level: NOTSET + console: + class: logging.StreamHandler + stream: ext://sys.stdout + formatter: simpleFormatter + level: WARNING + formatters: + simpleFormatter: + class: logging.Formatter + format: '%(levelname)s - %(name)s - %(message)s' + datefmt: '%d/%m/%Y %H:%M:%S' + ... 
+ """, +} + + +@pytest.mark.parametrize("logging_yaml", logging_config_content.keys()) +def test_config_logger(capsys, logging_yaml): + + print(logging_config_content[logging_yaml]) + + state = workflow.State.make_temp() + state.filesystem.get_configs_dir()[0].joinpath("logging.yaml").write_text( + textwrap.dedent(logging_config_content[logging_yaml]) + ) + + if logging_yaml == "unsecure": + with pytest.raises(yaml.constructor.ConstructorError): + state.logging.config_logger() + return + + state.logging.config_logger() + + logger = logging.getLogger("activitysim") + + file_handlers = [h for h in logger.handlers if type(h) is logging.FileHandler] + assert len(file_handlers) == 1 + asim_logger_baseFilename = file_handlers[0].baseFilename + + logger.info("test_config_logger") + logger.info("log_info") + logger.warning("log_warn1") + + out, err = capsys.readouterr() + + assert "could not find conf file" not in out + assert "log_warn1" in out + assert "log_info" not in out + + close_handlers() + + logger = logging.getLogger(__name__) + logger.warning("log_warn2") + + with open(asim_logger_baseFilename) as content_file: + content = content_file.read() + print(content) + assert "log_warn1" in content + assert "log_warn2" not in content diff --git a/activitysim/core/test/test_logit.py b/activitysim/core/test/test_logit.py index 8253149e1f..c07c746507 100644 --- a/activitysim/core/test/test_logit.py +++ b/activitysim/core/test/test_logit.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import os.path @@ -8,18 +9,8 @@ import pandas.testing as pdt import pytest -from .. import inject, logit -from ..simulate import eval_variables - - -def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import logit, workflow +from activitysim.core.simulate import eval_variables @pytest.fixture(scope="module") @@ -27,15 +18,6 @@ def data_dir(): return os.path.join(os.path.dirname(__file__), "data") -def add_canonical_dirs(): - - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) - - # this is lifted straight from urbansim's test_mnl.py @pytest.fixture( scope="module", @@ -79,7 +61,8 @@ def spec(test_data): @pytest.fixture def utilities(choosers, spec, test_data): - vars = eval_variables(spec.index, choosers) + state = workflow.State().default_settings() + vars = eval_variables(state, spec.index, choosers) utils = vars.dot(spec).astype("float") return pd.DataFrame( utils.values.reshape(test_data["probabilities"].shape), @@ -88,33 +71,33 @@ def utilities(choosers, spec, test_data): def test_utils_to_probs(utilities, test_data): - probs = logit.utils_to_probs(utilities, trace_label=None) + state = workflow.State().default_settings() + probs = logit.utils_to_probs(state, utilities, trace_label=None) pdt.assert_frame_equal(probs, test_data["probabilities"]) def test_utils_to_probs_raises(): - - add_canonical_dirs() - + state = workflow.State().default_settings() idx = pd.Index(name="household_id", data=[1]) with pytest.raises(RuntimeError) as excinfo: logit.utils_to_probs( - pd.DataFrame([[1, 2, np.inf, 3]], index=idx), trace_label=None + state, pd.DataFrame([[1, 2, np.inf, 3]], 
index=idx), trace_label=None ) assert "infinite exponentiated utilities" in str(excinfo.value) with pytest.raises(RuntimeError) as excinfo: logit.utils_to_probs( - pd.DataFrame([[-999, -999, -999, -999]], index=idx), trace_label=None + state, pd.DataFrame([[-999, -999, -999, -999]], index=idx), trace_label=None ) assert "all probabilities are zero" in str(excinfo.value) def test_make_choices_only_one(): + state = workflow.State().default_settings() probs = pd.DataFrame( [[1, 0, 0], [0, 1, 0]], columns=["a", "b", "c"], index=["x", "y"] ) - choices, rands = logit.make_choices(probs) + choices, rands = logit.make_choices(state, probs) pdt.assert_series_equal( choices, pd.Series([0, 1], index=["x", "y"]), check_dtype=False @@ -122,8 +105,9 @@ def test_make_choices_only_one(): def test_make_choices_real_probs(utilities): - probs = logit.utils_to_probs(utilities, trace_label=None) - choices, rands = logit.make_choices(probs) + state = workflow.State().default_settings() + probs = logit.utils_to_probs(state, utilities, trace_label=None) + choices, rands = logit.make_choices(state, probs) pdt.assert_series_equal( choices, @@ -151,7 +135,9 @@ def test_interaction_dataset_no_sample(interaction_choosers, interaction_alts): index=[1, 2, 3, 4] * 4, ) - interacted = logit.interaction_dataset(interaction_choosers, interaction_alts) + interacted = logit.interaction_dataset( + workflow.State().default_settings(), interaction_choosers, interaction_alts + ) interacted, expected = interacted.align(expected, axis=1) @@ -170,7 +156,10 @@ def test_interaction_dataset_sampled(interaction_choosers, interaction_alts): ) interacted = logit.interaction_dataset( - interaction_choosers, interaction_alts, sample_size=2 + workflow.State().default_settings(), + interaction_choosers, + interaction_alts, + sample_size=2, ) interacted, expected = interacted.align(expected, axis=1) diff --git a/activitysim/core/test/test_los.py b/activitysim/core/test/test_los.py index 9a99e9b6b2..00372a53ef 100644 --- a/activitysim/core/test/test_los.py +++ b/activitysim/core/test/test_los.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import os @@ -9,43 +10,42 @@ import pandas.testing as pdt import pytest -from .. 
import inject, los - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +import activitysim.abm.tables # noqa -- load table defs +from activitysim.core import exceptions, los, workflow def add_canonical_dirs(configs_dir_name): + state = workflow.State() configs_dir = os.path.join(os.path.dirname(__file__), f"los/{configs_dir_name}") - inject.add_injectable("configs_dir", configs_dir) - data_dir = os.path.join(os.path.dirname(__file__), f"los/data") - inject.add_injectable("data_dir", data_dir) - - output_dir = os.path.join(os.path.dirname(__file__), f"output") - inject.add_injectable("output_dir", output_dir) + output_dir = os.path.join(os.path.dirname(__file__), "output") + state.initialize_filesystem( + working_dir=os.path.dirname(__file__), + configs_dir=(configs_dir,), + output_dir=output_dir, + data_dir=(data_dir,), + ) + return state def test_legacy_configs(): - add_canonical_dirs("configs_legacy_settings") - - with pytest.warns(FutureWarning): - network_los = los.Network_LOS() + state = add_canonical_dirs("configs_legacy_settings").load_settings() - assert network_los.setting("zone_system") == los.ONE_ZONE + with pytest.raises(exceptions.SettingsFileNotFoundError): + network_los = los.Network_LOS(state) - assert "z1_taz_skims.omx" in network_los.omx_file_names("taz") + # if backwards compatability is ever fixed... + # assert network_los.setting("zone_system") == los.ONE_ZONE + # assert "z1_taz_skims.omx" in network_los.omx_file_names("taz") def test_one_zone(): - add_canonical_dirs("configs_1z") + state = add_canonical_dirs("configs_1z").load_settings() - network_los = los.Network_LOS() + network_los = los.Network_LOS(state) assert network_los.setting("zone_system") == los.ONE_ZONE @@ -89,9 +89,9 @@ def test_one_zone(): def test_two_zone(): - add_canonical_dirs("configs_2z") + state = add_canonical_dirs("configs_2z").load_settings() - network_los = los.Network_LOS() + network_los = los.Network_LOS(state) assert network_los.setting("zone_system") == los.TWO_ZONE @@ -139,9 +139,9 @@ def test_two_zone(): def test_three_zone(): - add_canonical_dirs("configs_3z") + state = add_canonical_dirs("configs_3z").load_settings() - network_los = los.Network_LOS() + network_los = los.Network_LOS(state) assert network_los.setting("zone_system") == los.THREE_ZONE @@ -164,8 +164,8 @@ def test_three_zone(): def test_30_minute_windows(): - add_canonical_dirs("configs_test_misc") - network_los = los.Network_LOS(los_settings_file_name="settings_30_min.yaml") + state = add_canonical_dirs("configs_test_misc").default_settings() + network_los = los.Network_LOS(state, los_settings_file_name="settings_30_min.yaml") assert network_los.skim_time_period_label(1) == "EA" assert network_los.skim_time_period_label(16) == "AM" @@ -181,8 +181,8 @@ def test_30_minute_windows(): def test_60_minute_windows(): - add_canonical_dirs("configs_test_misc") - network_los = los.Network_LOS(los_settings_file_name="settings_60_min.yaml") + state = add_canonical_dirs("configs_test_misc").default_settings() + network_los = los.Network_LOS(state, los_settings_file_name="settings_60_min.yaml") assert network_los.skim_time_period_label(1) == "EA" assert network_los.skim_time_period_label(8) == "AM" @@ -198,8 +198,8 @@ def test_60_minute_windows(): def test_1_week_time_window(): - add_canonical_dirs("configs_test_misc") - network_los = los.Network_LOS(los_settings_file_name="settings_1_week.yaml") + state = add_canonical_dirs("configs_test_misc").default_settings() + network_los = 
los.Network_LOS(state, los_settings_file_name="settings_1_week.yaml") assert network_los.skim_time_period_label(1) == "Sunday" assert network_los.skim_time_period_label(2) == "Monday" @@ -229,9 +229,9 @@ def test_1_week_time_window(): def test_skim_time_periods_future_warning(): - add_canonical_dirs("configs_test_misc") + state = add_canonical_dirs("configs_test_misc").default_settings() with pytest.warns(FutureWarning) as warning_test: network_los = los.Network_LOS( - los_settings_file_name="settings_legacy_hours_key.yaml" + state, los_settings_file_name="settings_legacy_hours_key.yaml" ) diff --git a/activitysim/core/test/test_pipeline.py b/activitysim/core/test/test_pipeline.py index 724aa18822..e061fdbb92 100644 --- a/activitysim/core/test/test_pipeline.py +++ b/activitysim/core/test/test_pipeline.py @@ -1,44 +1,40 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import os import pytest import tables -from activitysim.core import inject, pipeline, tracing - -from .extensions import steps +from activitysim.core import workflow +from activitysim.core.test.extensions import steps # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 100 HH_ID = 961042 -def setup_function(): - - inject.reinject_decorated_tables() - - inject.remove_injectable("skim_dict") - inject.remove_injectable("skim_stack") +@pytest.fixture +def state(): configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) - data_dir = os.path.join(os.path.dirname(__file__), "data") - inject.add_injectable("data_dir", data_dir) - - inject.clear_cache() - tracing.config_logger() + state = ( + workflow.State() + .initialize_filesystem( + configs_dir=(configs_dir,), + output_dir=output_dir, + data_dir=(data_dir,), + ) + .load_settings() + ) - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() + state.logging.config_logger() + return state def close_handlers(): @@ -52,13 +48,12 @@ def close_handlers(): # @pytest.mark.filterwarnings('ignore::tables.NaturalNameWarning') -def test_pipeline_run(): +def test_pipeline_run(state): - inject.add_step("step1", steps.step1) - inject.add_step("step2", steps.step2) - inject.add_step("step3", steps.step3) - inject.add_step("step_add_col", steps.step_add_col) - inject.dump_state() + # workflow.step(steps.step1, step_name="step1") + # workflow.step(steps.step2, step_name="step2") + # workflow.step(steps.step3, step_name="step3") + # workflow.step(steps.step_add_col, step_name="step_add_col") _MODELS = [ "step1", @@ -67,43 +62,43 @@ def test_pipeline_run(): "step_add_col.table_name=table2;column_name=c2", ] - pipeline.run(models=_MODELS, resume_after=None) + state.run(models=_MODELS, resume_after=None) - checkpoints = pipeline.get_checkpoints() + checkpoints = state.checkpoint.get_inventory() print("checkpoints\n", checkpoints) - c2 = pipeline.get_table("table2").c2 + c2 = state.checkpoint.load_dataframe("table2").c2 # get table from - pipeline.get_table("table1", checkpoint_name="step3") + state.checkpoint.load_dataframe("table1", checkpoint_name="step3") # try to get a table from a step before it was checkpointed with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("table2", checkpoint_name="step1") + state.checkpoint.load_dataframe("table2", checkpoint_name="step1") assert 
"not in checkpoint 'step1'" in str(excinfo.value) # try to get a non-existant table with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("bogus") + state.checkpoint.load_dataframe("bogus") assert "never checkpointed" in str(excinfo.value) # try to get an existing table from a non-existant checkpoint with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("table1", checkpoint_name="bogus") + state.checkpoint.load_dataframe("table1", checkpoint_name="bogus") assert "not in checkpoints" in str(excinfo.value) - pipeline.close_pipeline() + state.checkpoint.close_store() close_handlers() -def test_pipeline_checkpoint_drop(): +def test_pipeline_checkpoint_drop(state): - inject.add_step("step1", steps.step1) - inject.add_step("step2", steps.step2) - inject.add_step("step3", steps.step3) - inject.add_step("step_add_col", steps.step_add_col) - inject.add_step("step_forget_tab", steps.step_forget_tab) + # workflow.step(steps.step1, step_name="step1") + # workflow.step(steps.step2, step_name="step2") + # workflow.step(steps.step3, step_name="step3") + # workflow.step(steps.step_add_col, step_name="step_add_col") + # workflow.step(steps.step_forget_tab, step_name="step_forget_tab") _MODELS = [ "step1", @@ -113,26 +108,26 @@ def test_pipeline_checkpoint_drop(): "step3", "step_forget_tab.table_name=table3", ] - pipeline.run(models=_MODELS, resume_after=None) + state.run(models=_MODELS, resume_after=None) - checkpoints = pipeline.get_checkpoints() + checkpoints = state.checkpoint.get_inventory() print("checkpoints\n", checkpoints) - pipeline.get_table("table1") + state.checkpoint.load_dataframe("table1") with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("table2") - assert "never checkpointed" in str(excinfo.value) + state.checkpoint.load_dataframe("table2") + # assert "never checkpointed" in str(excinfo.value) # can't get a dropped table from current checkpoint with pytest.raises(RuntimeError) as excinfo: - pipeline.get_table("table3") - assert "was dropped" in str(excinfo.value) + state.checkpoint.load_dataframe("table3") + # assert "was dropped" in str(excinfo.value) # ensure that we can still get table3 from a checkpoint at which it existed - pipeline.get_table("table3", checkpoint_name="step3") + state.checkpoint.load_dataframe("table3", checkpoint_name="step3") - pipeline.close_pipeline() + state.checkpoint.close_store() close_handlers() diff --git a/activitysim/core/test/test_random.py b/activitysim/core/test/test_random.py index 50b4f0dc5a..4a461811fa 100644 --- a/activitysim/core/test/test_random.py +++ b/activitysim/core/test/test_random.py @@ -1,5 +1,7 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import numpy as np import numpy.testing as npt import pandas as pd diff --git a/activitysim/core/test/test_simulate.py b/activitysim/core/test/test_simulate.py index ab100f6a96..9647b49454 100644 --- a/activitysim/core/test/test_simulate.py +++ b/activitysim/core/test/test_simulate.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import os.path @@ -9,40 +10,41 @@ import pandas.testing as pdt import pytest -from .. 
import inject, simulate +from activitysim.core import simulate, workflow -@pytest.fixture(scope="module") +@pytest.fixture def data_dir(): return os.path.join(os.path.dirname(__file__), "data") -@pytest.fixture(scope="module") +@pytest.fixture def spec_name(data_dir): return "sample_spec.csv" -@pytest.fixture(scope="module") -def spec(data_dir, spec_name): - return simulate.read_model_spec(file_name=spec_name) +@pytest.fixture +def state(data_dir) -> workflow.State: + state = workflow.State() + state.initialize_filesystem( + working_dir=os.path.dirname(__file__), data_dir=(data_dir,) + ).default_settings() + return state -@pytest.fixture(scope="module") -def data(data_dir): - return pd.read_csv(os.path.join(data_dir, "data.csv")) - +@pytest.fixture +def spec(state, spec_name): + return state.filesystem.read_model_spec(file_name=spec_name) -def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), f"output") - inject.add_injectable("output_dir", output_dir) +@pytest.fixture +def data(data_dir): + return pd.read_csv(os.path.join(data_dir, "data.csv")) -def test_read_model_spec(spec_name): +def test_read_model_spec(state, spec_name): - spec = simulate.read_model_spec(file_name=spec_name) + spec = state.filesystem.read_model_spec(file_name=spec_name) assert len(spec) == 4 assert spec.index.name == "Expression" @@ -50,9 +52,9 @@ def test_read_model_spec(spec_name): npt.assert_array_equal(spec.values, [[1.1, 11], [2.2, 22], [3.3, 33], [4.4, 44]]) -def test_eval_variables(spec, data): +def test_eval_variables(state, spec, data): - result = simulate.eval_variables(spec.index, data) + result = simulate.eval_variables(state, spec.index, data) expected = pd.DataFrame( [[1, 0, 4, 1], [0, 1, 4, 1], [0, 1, 5, 1]], index=data.index, columns=spec.index @@ -69,21 +71,24 @@ def test_eval_variables(spec, data): pdt.assert_frame_equal(result, expected, check_names=False) -def test_simple_simulate(data, spec): +def test_simple_simulate(state, data, spec): - inject.add_injectable("settings", {"check_for_variability": False}) + state.settings.check_for_variability = False - choices = simulate.simple_simulate(choosers=data, spec=spec, nest_spec=None) + choices = simulate.simple_simulate(state, choosers=data, spec=spec, nest_spec=None) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected, check_dtype=False) -def test_simple_simulate_chunked(data, spec): - - inject.add_injectable("settings", {"check_for_variability": False}) +def test_simple_simulate_chunked(state, data, spec): + state.settings.check_for_variability = False + state.settings.chunk_size = 2 choices = simulate.simple_simulate( - choosers=data, spec=spec, nest_spec=None, chunk_size=2 + state, + choosers=data, + spec=spec, + nest_spec=None, ) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected, check_dtype=False) diff --git a/activitysim/core/test/test_skim.py b/activitysim/core/test/test_skim.py index a1e47779ae..ff6d0a9d19 100644 --- a/activitysim/core/test/test_skim.py +++ b/activitysim/core/test/test_skim.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import numpy as np import numpy.testing as npt @@ -7,7 +8,7 @@ import pandas.testing as pdt import pytest -from .. 
import skim_dictionary +from activitysim.core import skim_dictionary, workflow @pytest.fixture @@ -35,7 +36,9 @@ def test_skims(data): skim_info.omx_shape = omx_shape skim_info.dtype_name = "int" - skim_dict = skim_dictionary.SkimDict("taz", skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict( + workflow.State().default_settings(), "taz", skim_info, skim_data + ) skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims = skim_dict.wrap("taz_l", "taz_r") @@ -73,7 +76,9 @@ def test_3dskims(data): skim_info.dtype_name = "int" skim_info.key1_block_offsets = {"SOV": 0} - skim_dict = skim_dictionary.SkimDict("taz", skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict( + workflow.State().default_settings(), "taz", skim_info, skim_data + ) skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims3d = skim_dict.wrap_3d(orig_key="taz_l", dest_key="taz_r", dim3_key="period") diff --git a/activitysim/core/test/test_timetable.py b/activitysim/core/test/test_timetable.py index e8ac8e5555..576217ced7 100644 --- a/activitysim/core/test/test_timetable.py +++ b/activitysim/core/test/test_timetable.py @@ -1,7 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. - -from builtins import range +from __future__ import annotations import numpy as np import pandas as pd @@ -9,8 +8,9 @@ import pytest from numpy.testing import assert_array_equal -from .. import chunk -from .. import timetable as tt +from activitysim.core import chunk +from activitysim.core import timetable as tt +from activitysim.core import workflow @pytest.fixture @@ -56,7 +56,9 @@ def tdd_alts(): def test_basic(persons, tdd_alts): - with chunk.chunk_log("test_basic", base=True): + state = workflow.State().default_settings() + + with chunk.chunk_log(state, "test_basic", base=True): person_windows = tt.create_timetable_windows(persons, tdd_alts) diff --git a/activitysim/core/test/test_tracing.py b/activitysim/core/test/test_tracing.py index df88e0d36c..8ae27add1c 100644 --- a/activitysim/core/test/test_tracing.py +++ b/activitysim/core/test/test_tracing.py @@ -1,12 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import os.path import pandas as pd import pytest -from .. 
import inject, tracing +from activitysim.abm.tables import table_dict +from activitysim.core import tracing, workflow def close_handlers(): @@ -19,27 +22,30 @@ def close_handlers(): logger.setLevel(logging.NOTSET) -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() - - def add_canonical_dirs(): - inject.clear_cache() + state = workflow.State() configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) + state.add_injectable("configs_dir", configs_dir) output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) + state.add_injectable("output_dir", output_dir) + + state.initialize_filesystem( + working_dir=os.path.dirname(__file__), + configs_dir=(configs_dir,), + output_dir=output_dir, + ) + + return state def test_config_logger(capsys): - add_canonical_dirs() + state = add_canonical_dirs() - tracing.config_logger() + state.logging.config_logger() logger = logging.getLogger("activitysim") @@ -76,9 +82,9 @@ def test_config_logger(capsys): def test_print_summary(capsys): - add_canonical_dirs() + state = add_canonical_dirs() - tracing.config_logger() + state.logging.config_logger() tracing.print_summary( "label", df=pd.DataFrame(), describe=False, value_counts=False @@ -96,23 +102,24 @@ def test_print_summary(capsys): def test_register_households(capsys): - add_canonical_dirs() + state = add_canonical_dirs() + state.load_settings() - tracing.config_logger() + state.logging.config_logger() df = pd.DataFrame({"zort": ["a", "b", "c"]}, index=[1, 2, 3]) - inject.add_injectable("traceable_tables", ["households"]) - inject.add_injectable("trace_hh_id", 5) + state.tracing.traceable_tables = ["households"] + state.settings.trace_hh_id = 5 - tracing.register_traceable_table("households", df) + state.tracing.register_traceable_table("households", df) out, err = capsys.readouterr() # print out # don't consume output assert "Can't register table 'households' without index name" in out df.index.name = "household_id" - tracing.register_traceable_table("households", df) + state.tracing.register_traceable_table("households", df) out, err = capsys.readouterr() # print out # don't consume output @@ -124,22 +131,20 @@ def test_register_households(capsys): def test_register_tours(capsys): - add_canonical_dirs() + state = add_canonical_dirs().load_settings() - tracing.config_logger() + state.logging.config_logger() - inject.add_injectable("traceable_tables", ["households", "tours"]) + state.tracing.traceable_tables = ["households", "tours"] # in case another test injected this - inject.add_injectable("trace_tours", []) - inject.add_injectable( - "trace_hh_id", 3 - ) # need this or register_traceable_table is a nop + state.add_injectable("trace_tours", []) + state.settings.trace_hh_id = 3 tours_df = pd.DataFrame({"zort": ["a", "b", "c"]}, index=[10, 11, 12]) tours_df.index.name = "tour_id" - tracing.register_traceable_table("tours", tours_df) + state.tracing.register_traceable_table("tours", tours_df) out, err = capsys.readouterr() assert ( @@ -147,12 +152,12 @@ def test_register_tours(capsys): in out ) - inject.add_injectable("trace_hh_id", 3) + state.add_injectable("trace_hh_id", 3) households_df = pd.DataFrame({"dzing": ["a", "b", "c"]}, index=[1, 2, 3]) households_df.index.name = "household_id" - tracing.register_traceable_table("households", households_df) + state.tracing.register_traceable_table("households", households_df) - 
tracing.register_traceable_table("tours", tours_df) + state.tracing.register_traceable_table("tours", tours_df) out, err = capsys.readouterr() # print out # don't consume output @@ -160,13 +165,13 @@ def test_register_tours(capsys): tours_df["household_id"] = [1, 5, 3] - tracing.register_traceable_table("tours", tours_df) + state.tracing.register_traceable_table("tours", tours_df) out, err = capsys.readouterr() print(out) # don't consume output # should be tracing tour with tour_id 3 - traceable_table_ids = inject.get_injectable("traceable_table_ids") + traceable_table_ids = state.tracing.traceable_table_ids assert traceable_table_ids["tours"] == [12] close_handlers() @@ -174,12 +179,12 @@ def test_register_tours(capsys): def test_write_csv(capsys): - add_canonical_dirs() + state = add_canonical_dirs() - tracing.config_logger() + state.logging.config_logger() # should complain if df not a DataFrame or Series - tracing.write_csv(df="not a df or series", file_name="baddie") + state.tracing.write_csv(df="not a df or series", file_name="baddie") out, err = capsys.readouterr() @@ -212,16 +217,12 @@ def test_basic(capsys): close_handlers() - configs_dir = os.path.join(os.path.dirname(__file__), "configs") - inject.add_injectable("configs_dir", configs_dir) - - output_dir = os.path.join(os.path.dirname(__file__), "output") - inject.add_injectable("output_dir", output_dir) + state = add_canonical_dirs() # remove existing handlers or basicConfig is a NOP logging.getLogger().handlers = [] - tracing.config_logger(basic=True) + state.logging.config_logger(basic=True) logger = logging.getLogger() file_handlers = [h for h in logger.handlers if type(h) is logging.FileHandler] diff --git a/activitysim/core/test/test_util.py b/activitysim/core/test/test_util.py index 086e8a0326..ae9b4fa83b 100644 --- a/activitysim/core/test/test_util.py +++ b/activitysim/core/test/test_util.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import numpy as np import pandas as pd diff --git a/activitysim/core/test/utils_testing.py b/activitysim/core/test/utils_testing.py index a8a74fd3b4..223ae4cf25 100644 --- a/activitysim/core/test/utils_testing.py +++ b/activitysim/core/test/utils_testing.py @@ -1,11 +1,7 @@ # Orca # Copyright (C) 2016 UrbanSim Inc. # See full license in LICENSE. - -""" -Utilities used in testing of Orca. - -""" +from __future__ import annotations import numpy as np import numpy.testing as npt diff --git a/activitysim/core/testing.py b/activitysim/core/testing.py new file mode 100644 index 0000000000..c2f1cddf84 --- /dev/null +++ b/activitysim/core/testing.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from activitysim.core.test import * diff --git a/activitysim/core/timetable.py b/activitysim/core/timetable.py index a2106bbe25..85ecc69544 100644 --- a/activitysim/core/timetable.py +++ b/activitysim/core/timetable.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations import logging from builtins import object, range @@ -8,7 +9,7 @@ import numpy as np import pandas as pd -from activitysim.core import chunk, pipeline +from activitysim.core import chunk, configuration, workflow logger = logging.getLogger(__name__) @@ -352,7 +353,7 @@ def __init__(self, windows_df, tdd_alts_df, table_name=None): self.checkpoint_df = None # series to map window row index value to window row's ordinal index - from ..core.fast_mapping import FastMapping + from activitysim.core.fast_mapping import FastMapping self.window_row_ix = FastMapping( pd.Series(list(range(len(windows_df.index))), index=windows_df.index) @@ -379,6 +380,9 @@ def __init__(self, windows_df, tdd_alts_df, table_name=None): assert (tdd_alts_df.index == list(range(tdd_alts_df.shape[0]))).all() self.tdd_footprints = np.asanyarray([list(r) for r in w_strings]).astype(int) + # by default, do not attach state to this object. + self.state = None + def begin_transaction(self, transaction_loggers): """ begin a transaction for an estimator or list of estimators @@ -411,6 +415,10 @@ def export_for_numba(self): tt_windows=self.windows, ) + def attach_state(self, state: workflow.State): + self.state = state + return self + def slice_windows_by_row_id(self, window_row_ids): """ return windows array slice containing rows for specified window_row_ids @@ -442,7 +450,7 @@ def get_windows_df(self): # assert (self.windows_df.values == self.windows).all() return self.windows_df - def replace_table(self): + def replace_table(self, state: workflow.State): """ Save or replace windows_df DataFrame to pipeline with saved table name (specified when object instantiated.) @@ -464,7 +472,7 @@ def replace_table(self): # get windows_df from bottleneck function in case updates to self.person_window # do not write through to pandas dataframe - pipeline.replace_table(self.windows_table_name, self.get_windows_df()) + state.add_table(self.windows_table_name, self.get_windows_df()) def tour_available(self, window_row_ids, tdds): """ @@ -632,7 +640,7 @@ def adjacent_window_run_length(self, window_row_ids, periods, before): assert len(window_row_ids) == len(periods) trace_label = "tt.adjacent_window_run_length" - with chunk.chunk_log(trace_label): + with chunk.chunk_log(self.state, trace_label) as chunk_sizer: available_run_length = _available_run_length_2( self.windows, self.window_row_ix._mapper, @@ -642,7 +650,9 @@ def adjacent_window_run_length(self, window_row_ids, periods, before): periods.to_numpy(), ) - chunk.log_df(trace_label, "available_run_length", available_run_length) + chunk_sizer.log_df( + trace_label, "available_run_length", available_run_length + ) return pd.Series(available_run_length, index=window_row_ids.index) diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index 44707c0aea..88bf0fc167 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -1,29 +1,21 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations import logging import logging.config -import multiprocessing # for process name import os -import sys import time -from builtins import next, range -from collections import OrderedDict +from builtins import range import numpy as np import pandas as pd -import yaml - -from activitysim.core import inject - -from . 
import config # Configurations ASIM_LOGGER = "activitysim" CSV_FILE_TYPE = "csv" LOGGING_CONF_FILE_NAME = "logging.yaml" - logger = logging.getLogger(__name__) timing_notes = set() @@ -43,7 +35,7 @@ def format(self, record): return super(ElapsedTimeFormatter, self).format(record) -def extend_trace_label(trace_label, extension): +def extend_trace_label(trace_label: str = None, extension: str = None) -> str | None: if trace_label: trace_label = "%s.%s" % (trace_label, extension) return trace_label @@ -66,61 +58,32 @@ def print_elapsed_time(msg=None, t0=None, debug=False): return t1 -def log_runtime(model_name, start_time=None, timing=None, force=False): - global timing_notes - - assert (start_time or timing) and not (start_time and timing) - - timing = timing if timing else time.time() - start_time - seconds = round(timing, 1) - minutes = round(timing / 60, 1) - - process_name = multiprocessing.current_process().name - - if config.setting("multiprocess", False) and not force: - # when benchmarking, log timing for each processes in its own log - if config.setting("benchmarking", False): - header = "component_name,duration" - with config.open_log_file( - f"timing_log.{process_name}.csv", "a", header - ) as log_file: - print(f"{model_name},{timing}", file=log_file) - # only continue to log runtime in global timing log for locutor - if not inject.get_injectable("locutor", False): - return - - header = "process_name,model_name,seconds,minutes,notes" - note = " ".join(timing_notes) - with config.open_log_file("timing_log.csv", "a", header) as log_file: - print(f"{process_name},{model_name},{seconds},{minutes},{note}", file=log_file) - - timing_notes.clear() - - -def delete_output_files(file_type, ignore=None, subdir=None): +def delete_output_files(state, file_type, ignore=None, subdir=None): """ - Delete files in output directory of specified type + Delete files in output directory of specified type. Parameters ---------- - output_dir: str - Directory of trace output CSVs - - Returns - ------- - Nothing + state : Pipeline + The output directory is read from the Pipeline. + file_type : str + File extension to delete. + ignore : list[Path-like] + Specific files to leave alone. + subdir : list[Path-like], optional + Subdirectories to scrub. If not given, the top level output directory + plus the 'log' and 'trace' directories will be scrubbed. 
""" - output_dir = inject.get_injectable("output_dir") + output_dir = state.filesystem.get_output_dir() subdir = [subdir] if subdir else None directories = subdir or ["", "log", "trace"] for subdir in directories: + dir = output_dir.joinpath(output_dir, subdir) if subdir else output_dir - dir = os.path.join(output_dir, subdir) if subdir else output_dir - - if not os.path.exists(dir): + if not dir.exists(): continue if ignore: @@ -143,16 +106,12 @@ def delete_output_files(file_type, ignore=None, subdir=None): print(e) -def delete_trace_files(): +def delete_trace_files(state): """ Delete CSV files in output_dir - - Returns - ------- - Nothing """ - delete_output_files(CSV_FILE_TYPE, subdir="trace") - delete_output_files(CSV_FILE_TYPE, subdir="log") + delete_output_files(state, CSV_FILE_TYPE, subdir="trace") + delete_output_files(state, CSV_FILE_TYPE, subdir="log") active_log_files = [ h.baseFilename @@ -160,54 +119,7 @@ def delete_trace_files(): if isinstance(h, logging.FileHandler) ] - delete_output_files("log", ignore=active_log_files) - - -def config_logger(basic=False): - """ - Configure logger - - look for conf file in configs_dir, if not found use basicConfig - - Returns - ------- - Nothing - """ - - # look for conf file in configs_dir - if basic: - log_config_file = None - else: - log_config_file = config.config_file_path( - LOGGING_CONF_FILE_NAME, mandatory=False - ) - - if log_config_file: - try: - with open(log_config_file) as f: - config_dict = yaml.load(f, Loader=yaml.UnsafeLoader) - except Exception as e: - print(f"Unable to read logging config file {log_config_file}") - raise e - - try: - config_dict = config_dict["logging"] - config_dict.setdefault("version", 1) - logging.config.dictConfig(config_dict) - except Exception as e: - print(f"Unable to config logging as specified in {log_config_file}") - raise e - - else: - logging.basicConfig(level=logging.INFO, stream=sys.stdout) - - logger = logging.getLogger(ASIM_LOGGER) - - if log_config_file: - logger.info("Read logging configuration from: %s" % log_config_file) - else: - print("Configured logging using basicConfig") - logger.info("Configured logging using basicConfig") + delete_output_files(state, "log", ignore=active_log_files) def print_summary(label, df, describe=False, value_counts=False): @@ -243,132 +155,9 @@ def print_summary(label, df, describe=False, value_counts=False): logger.info("%s summary:\n%s" % (label, df.describe())) -def initialize_traceable_tables(): - - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - if len(traceable_table_ids) > 0: - logger.debug( - f"initialize_traceable_tables resetting table_ids for {list(traceable_table_ids.keys())}" - ) - inject.add_injectable("traceable_table_ids", {}) - - -def register_traceable_table(table_name, df): - """ - Register traceable table - - Parameters - ---------- - df: pandas.DataFrame - traced dataframe - - Returns - ------- - Nothing - """ - - # add index name to traceable_table_indexes - - logger.debug(f"register_traceable_table {table_name}") - - traceable_tables = inject.get_injectable("traceable_tables", []) - if table_name not in traceable_tables: - logger.error("table '%s' not in traceable_tables" % table_name) - return - - idx_name = df.index.name - if idx_name is None: - logger.error("Can't register table '%s' without index name" % table_name) - return - - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) - - if ( - idx_name in 
traceable_table_indexes - and traceable_table_indexes[idx_name] != table_name - ): - logger.error( - "table '%s' index name '%s' already registered for table '%s'" - % (table_name, idx_name, traceable_table_indexes[idx_name]) - ) - return - - # update traceable_table_indexes with this traceable_table's idx_name - if idx_name not in traceable_table_indexes: - traceable_table_indexes[idx_name] = table_name - logger.debug( - "adding table %s.%s to traceable_table_indexes" % (table_name, idx_name) - ) - inject.add_injectable("traceable_table_indexes", traceable_table_indexes) - - # add any new indexes associated with trace_hh_id to traceable_table_ids - - trace_hh_id = inject.get_injectable("trace_hh_id", None) - if trace_hh_id is None: - return - - new_traced_ids = [] - # if table_name == "households": - if table_name in ["households", "proto_households"]: - if trace_hh_id not in df.index: - logger.warning("trace_hh_id %s not in dataframe" % trace_hh_id) - new_traced_ids = [] - else: - logger.info( - "tracing household id %s in %s households" - % (trace_hh_id, len(df.index)) - ) - new_traced_ids = [trace_hh_id] - else: - - # find first already registered ref_col we can use to slice this table - ref_col = next((c for c in traceable_table_indexes if c in df.columns), None) - - if ref_col is None: - logger.error( - "can't find a registered table to slice table '%s' index name '%s'" - " in traceable_table_indexes: %s" - % (table_name, idx_name, traceable_table_indexes) - ) - return - - # get traceable_ids for ref_col table - ref_col_table_name = traceable_table_indexes[ref_col] - ref_col_traced_ids = traceable_table_ids.get(ref_col_table_name, []) - - # inject list of ids in table we are tracing - # this allows us to slice by id without requiring presence of a household id column - traced_df = df[df[ref_col].isin(ref_col_traced_ids)] - new_traced_ids = traced_df.index.tolist() - if len(new_traced_ids) == 0: - logger.warning( - "register %s: no rows with %s in %s." 
- % (table_name, ref_col, ref_col_traced_ids) - ) - - # update the list of trace_ids for this table - prior_traced_ids = traceable_table_ids.get(table_name, []) - - if new_traced_ids: - assert not set(prior_traced_ids) & set(new_traced_ids) - traceable_table_ids[table_name] = prior_traced_ids + new_traced_ids - inject.add_injectable("traceable_table_ids", traceable_table_ids) - - logger.debug( - "register %s: added %s new ids to %s existing trace ids" - % (table_name, len(new_traced_ids), len(prior_traced_ids)) - ) - logger.debug( - "register %s: tracing new ids %s in %s" - % (table_name, new_traced_ids, table_name) - ) - - def write_df_csv( df, file_path, index_label=None, columns=None, column_labels=None, transpose=True ): - need_header = not os.path.isfile(file_path) if columns: @@ -385,7 +174,6 @@ def write_df_csv( df_t.index.name = index_label if need_header: - if column_labels is None: column_labels = [None, None] if column_labels[0] is None: @@ -416,7 +204,6 @@ def write_df_csv( def write_series_csv( series, file_path, index_label=None, columns=None, column_labels=None ): - if isinstance(columns, str): series = series.rename(columns) elif isinstance(columns, list): @@ -430,64 +217,6 @@ def write_series_csv( series.to_csv(file_path, mode="a", index=True, header=need_header) -def write_csv( - df, file_name, index_label=None, columns=None, column_labels=None, transpose=True -): - """ - Print write_csv - - Parameters - ---------- - df: pandas.DataFrame or pandas.Series - traced dataframe - file_name: str - output file name - index_label: str - index name - columns: list - columns to write - transpose: bool - whether to transpose dataframe (ignored for series) - Returns - ------- - Nothing - """ - - assert len(file_name) > 0 - - if not file_name.endswith(".%s" % CSV_FILE_TYPE): - file_name = "%s.%s" % (file_name, CSV_FILE_TYPE) - - file_path = config.trace_file_path(file_name) - - if os.name == "nt": - abs_path = os.path.abspath(file_path) - if len(abs_path) > 255: - msg = f"path length ({len(abs_path)}) may exceed Windows maximum length unless LongPathsEnabled: {abs_path}" - logger.warning(msg) - - if os.path.isfile(file_path): - logger.debug("write_csv file exists %s %s" % (type(df).__name__, file_name)) - - if isinstance(df, pd.DataFrame): - # logger.debug("dumping %s dataframe to %s" % (df.shape, file_name)) - write_df_csv( - df, file_path, index_label, columns, column_labels, transpose=transpose - ) - elif isinstance(df, pd.Series): - # logger.debug("dumping %s element series to %s" % (df.shape[0], file_name)) - write_series_csv(df, file_path, index_label, columns, column_labels) - elif isinstance(df, dict): - df = pd.Series(data=df) - # logger.debug("dumping %s element dict to %s" % (df.shape[0], file_name)) - write_series_csv(df, file_path, index_label, columns, column_labels) - else: - logger.error( - "write_csv object for file_name '%s' of unexpected type: %s" - % (file_name, type(df)) - ) - - def slice_ids(df, ids, column=None): """ slice a dataframe to select only records with the specified ids @@ -523,96 +252,6 @@ def slice_ids(df, ids, column=None): return df -def get_trace_target(df, slicer, column=None): - """ - get target ids and column or index to identify target trace rows in df - - Parameters - ---------- - df: pandas.DataFrame - dataframe to slice - slicer: str - name of column or index to use for slicing - - Returns - ------- - (target, column) tuple - - target : int or list of ints - id or ids that identify tracer target rows - column : str - name of column to 
search for targets or None to search index - """ - - target_ids = None # id or ids to slice by (e.g. hh_id or person_ids or tour_ids) - - # special do-not-slice code for dumping entire df - if slicer == "NONE": - return target_ids, column - - if slicer is None: - slicer = df.index.name - - if isinstance(df, pd.DataFrame): - # always slice by household id if we can - if "household_id" in df.columns: - slicer = "household_id" - if slicer in df.columns: - column = slicer - - if column is None and df.index.name != slicer: - raise RuntimeError( - "bad slicer '%s' for df with index '%s'" % (slicer, df.index.name) - ) - - traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - - if df.empty: - target_ids = None - elif slicer in traceable_table_indexes: - # maps 'person_id' to 'persons', etc - table_name = traceable_table_indexes[slicer] - target_ids = traceable_table_ids.get(table_name, []) - elif slicer == "zone_id": - target_ids = inject.get_injectable("trace_od", []) - - return target_ids, column - - -def trace_targets(df, slicer=None, column=None): - - target_ids, column = get_trace_target(df, slicer, column) - - if target_ids is None: - targets = None - else: - - if column is None: - targets = df.index.isin(target_ids) - else: - # convert to numpy array for consistency since that is what index.isin returns - targets = df[column].isin(target_ids).to_numpy() - - return targets - - -def has_trace_targets(df, slicer=None, column=None): - - target_ids, column = get_trace_target(df, slicer, column) - - if target_ids is None: - found = False - else: - - if column is None: - found = df.index.isin(target_ids).any() - else: - found = df[column].isin(target_ids).any() - - return found - - def hh_id_for_chooser(id, choosers): """ @@ -665,272 +304,9 @@ def trace_id_for_chooser(id, choosers): return hh_id, column_name -def dump_df(dump_switch, df, trace_label, fname): - if dump_switch: - trace_label = extend_trace_label(trace_label, "DUMP.%s" % fname) - trace_df( - df, trace_label, index_label=df.index.name, slicer="NONE", transpose=False - ) - - -def trace_df( - df, - label, - slicer=None, - columns=None, - index_label=None, - column_labels=None, - transpose=True, - warn_if_empty=False, -): - """ - Slice dataframe by traced household or person id dataframe and write to CSV - - Parameters - ---------- - df: pandas.DataFrame - traced dataframe - label: str - tracer name - slicer: Object - slicer for subsetting - columns: list - columns to write - index_label: str - index name - column_labels: [str, str] - labels for columns in csv - transpose: boolean - whether to transpose file for legibility - warn_if_empty: boolean - write warning if sliced df is empty - - Returns - ------- - Nothing - """ - - target_ids, column = get_trace_target(df, slicer) - - if target_ids is not None: - df = slice_ids(df, target_ids, column) - - if warn_if_empty and df.shape[0] == 0 and target_ids != []: - column_name = column or slicer - logger.warning( - "slice_canonically: no rows in %s with %s == %s" - % (label, column_name, target_ids) - ) - - if df.shape[0] > 0: - write_csv( - df, - file_name=label, - index_label=(index_label or slicer), - columns=columns, - column_labels=column_labels, - transpose=transpose, - ) - - -def interaction_trace_rows(interaction_df, choosers, sample_size=None): - """ - Trace model design for interaction_simulate - - Parameters - ---------- - interaction_df: pandas.DataFrame - traced model_design 
dataframe - choosers: pandas.DataFrame - interaction_simulate choosers - (needed to filter the model_design dataframe by traced hh or person id) - sample_size int or None - int for constant sample size, or None if choosers have different numbers of alternatives - Returns - ------- - trace_rows : numpy.ndarray - array of booleans to flag which rows in interaction_df to trace - - trace_ids : tuple (str, numpy.ndarray) - column name and array of trace_ids mapping trace_rows to their target_id - for use by trace_interaction_eval_results which needs to know target_id - so it can create separate tables for each distinct target for readability - """ - - # slicer column name and id targets to use for chooser id added to model_design dataframe - # currently we only ever slice by person_id, but that could change, so we check here... - - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - - # Determine whether actual tables or proto_ tables for disaggregate accessibilities - persons_table_name = set(traceable_table_ids).intersection( - ["persons", "proto_persons"] - ) - households_table_name = set(traceable_table_ids).intersection( - ["households", "proto_households"] - ) - - assert len(persons_table_name) == 1 and len(persons_table_name) == 1 - persons_table_name, households_table_name = ( - persons_table_name.pop(), - households_table_name.pop(), - ) - - if choosers.index.name == "person_id" and persons_table_name in traceable_table_ids: - slicer_column_name = choosers.index.name - targets = traceable_table_ids["persons"] - elif choosers.index.name == "household_id" and "households" in traceable_table_ids: - slicer_column_name = choosers.index.name - targets = traceable_table_ids["households"] - elif "household_id" in choosers.columns and "households" in traceable_table_ids: - slicer_column_name = "household_id" - targets = traceable_table_ids[households_table_name] - elif "person_id" in choosers.columns and persons_table_name in traceable_table_ids: - slicer_column_name = "person_id" - targets = traceable_table_ids[persons_table_name] - else: - print(choosers.columns) - raise RuntimeError( - "interaction_trace_rows don't know how to slice index '%s'" - % choosers.index.name - ) - - if sample_size is None: - # if sample size not constant, we count on either - # slicer column being in itneraction_df - # or index of interaction_df being same as choosers - if slicer_column_name in interaction_df.columns: - trace_rows = np.in1d(interaction_df[slicer_column_name], targets) - trace_ids = interaction_df.loc[trace_rows, slicer_column_name].values - else: - assert interaction_df.index.name == choosers.index.name - trace_rows = np.in1d(interaction_df.index, targets) - trace_ids = interaction_df[trace_rows].index.values - - else: - - if slicer_column_name == choosers.index.name: - trace_rows = np.in1d(choosers.index, targets) - trace_ids = np.asanyarray(choosers[trace_rows].index) - elif slicer_column_name == "person_id": - trace_rows = np.in1d(choosers["person_id"], targets) - trace_ids = np.asanyarray(choosers[trace_rows].person_id) - elif slicer_column_name == "household_id": - trace_rows = np.in1d(choosers["household_id"], targets) - trace_ids = np.asanyarray(choosers[trace_rows].household_id) - else: - assert False - - # simply repeat if sample size is constant across choosers - assert sample_size == len(interaction_df.index) / len(choosers.index) - trace_rows = np.repeat(trace_rows, sample_size) - trace_ids = np.repeat(trace_ids, sample_size) - - assert type(trace_rows) == 
np.ndarray - assert type(trace_ids) == np.ndarray - - trace_ids = (slicer_column_name, trace_ids) - - return trace_rows, trace_ids - - -def trace_interaction_eval_results(trace_results, trace_ids, label): - """ - Trace model design eval results for interaction_simulate - - Parameters - ---------- - trace_results: pandas.DataFrame - traced model_design dataframe - trace_ids : tuple (str, numpy.ndarray) - column name and array of trace_ids from interaction_trace_rows() - used to filter the trace_results dataframe by traced hh or person id - label: str - tracer name - - Returns - ------- - Nothing - """ - - assert type(trace_ids[1]) == np.ndarray - - slicer_column_name = trace_ids[0] - - try: - trace_results[slicer_column_name] = trace_ids[1] - except ValueError: - trace_results[slicer_column_name] = int(trace_ids[1]) - - targets = np.unique(trace_ids[1]) - - if len(trace_results.index) == 0: - return - - # write out the raw dataframe - file_path = config.trace_file_path("%s.raw.csv" % label) - trace_results.to_csv(file_path, mode="a", index=True, header=True) - - # if there are multiple targets, we want them in separate tables for readability - for target in targets: - - df_target = trace_results[trace_results[slicer_column_name] == target] - - # we want the transposed columns in predictable order - df_target.sort_index(inplace=True) - - # # remove the slicer (person_id or hh_id) column? - # del df_target[slicer_column_name] - - target_label = "%s.%s.%s" % (label, slicer_column_name, target) - - trace_df( - df_target, - label=target_label, - slicer="NONE", - transpose=True, - column_labels=["expression", None], - warn_if_empty=False, - ) - - def no_results(trace_label): """ standard no-op to write tracing when a model produces no results """ logger.info("Skipping %s: no_results" % trace_label) - - -def deregister_traceable_table(table_name): - """ - un-register traceable table - - Parameters - ---------- - df: pandas.DataFrame - traced dataframe - - Returns - ------- - Nothing - """ - traceable_tables = inject.get_injectable("traceable_tables", []) - traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) - - if table_name not in traceable_tables: - logger.error("table '%s' not in traceable_tables" % table_name) - - else: - traceable_table_ids = { - k: v for k, v in traceable_table_ids.items() if k != table_name - } - traceable_table_indexes = OrderedDict( - {k: v for k, v in traceable_table_indexes.items() if v != table_name} - ) - - inject.add_injectable("traceable_table_ids", traceable_table_ids) - inject.add_injectable("traceable_table_indexes", traceable_table_indexes) - - return diff --git a/activitysim/core/util.py b/activitysim/core/util.py index 217cdb8377..e56460f439 100644 --- a/activitysim/core/util.py +++ b/activitysim/core/util.py @@ -1,5 +1,6 @@ # ActivitySim # See full license in LICENSE.txt. 
+from __future__ import annotations import argparse import collections @@ -7,12 +8,17 @@ import logging import os from builtins import zip +from collections.abc import Iterable from operator import itemgetter +from pathlib import Path import cytoolz as tz import cytoolz.curried import numpy as np import pandas as pd +import pyarrow as pa +import pyarrow.csv as csv +import pyarrow.parquet as pq import yaml logger = logging.getLogger(__name__) @@ -468,3 +474,59 @@ def nearest_node_index(node, nodes): deltas = nodes - node dist_2 = np.einsum("ij,ij->i", deltas, deltas) return np.argmin(dist_2) + + +def read_csv(filename): + """Simple read of a CSV file, much faster than pandas.read_csv""" + return csv.read_csv(filename).to_pandas() + + +def to_csv(df, filename, index=False): + """Simple write of a CSV file, much faster than pandas.DataFrame.to_csv""" + filename = Path(filename) + if filename.suffix == ".gz": + with pa.CompressedOutputStream(filename, "gzip") as out: + csv.write_csv(pa.Table.from_pandas(df, preserve_index=index), out) + else: + csv.write_csv(pa.Table.from_pandas(df, preserve_index=index), filename) + + +def read_parquet(filename): + """Simple read of a parquet file""" + return pq.read_table(filename).to_pandas() + + +def to_parquet(df, filename, index=False): + filename = Path(filename) + pq.write_table(pa.Table.from_pandas(df, preserve_index=index), filename) + + +def latest_file_modification_time(filenames: Iterable[Path]): + """Find the most recent file modification time.""" + return max(os.path.getmtime(filename) for filename in filenames) + + +def oldest_file_modification_time(filenames: Iterable[Path]): + """Find the least recent file modification time.""" + return min(os.path.getmtime(filename) for filename in filenames) + + +def zarr_file_modification_time(zarr_dir: Path): + """Find the most recent file modification time inside a zarr dir.""" + t = 0 + for dirpath, dirnames, filenames in os.walk(zarr_dir): + if os.path.basename(dirpath).startswith(".git"): + continue + for n in range(len(dirnames) - 1, -1, -1): + if dirnames[n].startswith(".git"): + dirnames.pop(n) + for f in filenames: + if f.startswith(".git") or f == ".DS_Store": + continue + finame = Path(os.path.join(dirpath, f)) + file_time = os.path.getmtime(finame) + if file_time > t: + t = file_time + if t == 0: + raise FileNotFoundError(zarr_dir) + return t diff --git a/activitysim/core/workflow/__init__.py b/activitysim/core/workflow/__init__.py new file mode 100644 index 0000000000..56a69c576b --- /dev/null +++ b/activitysim/core/workflow/__init__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from .examples import create_example +from .state import State +from .steps import ModelSettingsFromYaml as from_yaml +from .steps import cached_object, func, step +from .steps import table as table +from .steps import temp_table as temp_table diff --git a/activitysim/core/workflow/accessor.py b/activitysim/core/workflow/accessor.py new file mode 100644 index 0000000000..a2317376d5 --- /dev/null +++ b/activitysim/core/workflow/accessor.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +import inspect +import textwrap +import warnings + +from activitysim.core import workflow +from activitysim.core.exceptions import StateAccessError + +NO_DEFAULT = "< no default >" + + +class StateAccessor: + """ + Boilerplate code for accessors. 
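# A minimal usage sketch for the pyarrow-backed helpers added to
# activitysim.core.util above (read_csv, to_csv, read_parquet, to_parquet).
# The file names here are illustrative.
import pandas as pd

from activitysim.core.util import read_csv, read_parquet, to_csv, to_parquet

df = pd.DataFrame({"zone_id": [1, 2, 3], "dist": [0.5, 1.25, 2.0]})

to_csv(df, "zones.csv.gz")  # gzip output goes through pa.CompressedOutputStream
zones_from_csv = read_csv("zones.csv.gz")

to_parquet(df, "zones.parquet")
assert read_parquet("zones.parquet").equals(df)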
+ + Accessors consolidate groups of related functions in a common interface, + without requiring the main State class to become bloated by including all + relevant functionality. They also allow setting and storing attributes + without worrying about conflicting with similarly named attributes of + other accessors. + """ + + def __set_name__(self, owner, name): + self._name = name + + def __init__(self, state: "workflow.State" = None): + self._obj = state + + def __get__(self, instance, objtype=None): + if instance is None: + return self + cached_accessor = getattr(instance, f"_cached_accessor_{self._name}", None) + if cached_accessor is not None: + return cached_accessor + from .state import State + + assert isinstance(instance, State) + accessor_obj = self.__class__(instance) + object.__setattr__(instance, self._name, accessor_obj) + return accessor_obj + + def __set__(self, instance, value): + if isinstance(value, self.__class__): + setattr(instance, f"_cached_accessor_{self._name}", value) + else: + raise ValueError(f"cannot directly set accessor {self._name}") + + def __delete__(self, instance): + raise ValueError(f"cannot delete accessor {self._name}") + + +class FromState: + def __init__( + self, + member_type=None, + default_init=False, + default_value=NO_DEFAULT, + doc: str | None = None, + ): + """ + Creates a property to access an element from the current context. + + Parameters + ---------- + member_type : type + The data type for this attribute. Basic type validation may be + applied when setting this value, but validation could be disabled + when optimizing models for production and this type checking should + not be relied on as a runtime feature. If not given, member_type is + read from the accessor's type annotation (if applicable). + default_init : bool or callable + When set to true, if this context value is accessed and it has not + already been set, it is automatically initialized with the default + value (i.e. via a no-argument constructor) for the given type. + default_value : Any, optional + When set to some value, if this context value is accessed and it has + not already been set, it is automatically initialized with this + default value. + doc : str, optional + Attribute documentation. 
+ """ + self.member_type = member_type + self._default_init = default_init + self._default_value = default_value + if self._default_init and self._default_value != NO_DEFAULT: + raise ValueError("cannot use both default_init and default_value") + if doc: + self.__doc__ = textwrap.dedent(doc).strip() + + def __set_name__(self, owner, name): + self.name = f"{owner.__name__.lower()}_{name}" + # set member type based on annotation + if self.member_type is None: + annot = inspect.get_annotations(owner, eval_str=True) + if name in annot: + self.member_type = annot[name] + + def __get__(self, instance: StateAccessor, objtype=None): + try: + return instance._obj._context[self.name] + except (KeyError, AttributeError): + if instance is None or instance._obj is None: + # a freestanding accessor not bound to a parent State is not + # typical but does happen when Sphinx generates documentation + return self + if self._default_init: + if callable(self._default_init): + instance._obj._context[self.name] = self._default_init() + else: + instance._obj._context[self.name] = self.member_type() + return instance._obj._context[self.name] + elif self._default_value != NO_DEFAULT: + instance._obj._context[self.name] = self._default_value + return instance._obj._context[self.name] + raise StateAccessError( + f"{self.name} not initialized for this state" + ) from None + + def __set__(self, instance: StateAccessor, value): + if not self.__validate_type(value): + raise TypeError(f"{self.name} must be {self.member_type} not {type(value)}") + instance._obj._context[self.name] = value + + def __delete__(self, instance): + self.__set__(instance, None) + + def __validate_type(self, value): + # type validation is only done at the top level for now. + try: + type_ok = isinstance(value, self.member_type) + except (TypeError, AttributeError): + from typing import get_args, get_origin + + type_ok = isinstance(value, get_origin(self.member_type)) + return type_ok diff --git a/activitysim/core/workflow/checkpoint.py b/activitysim/core/workflow/checkpoint.py new file mode 100644 index 0000000000..3aa4c3250c --- /dev/null +++ b/activitysim/core/workflow/checkpoint.py @@ -0,0 +1,1201 @@ +from __future__ import annotations + +import abc +import datetime as dt +import logging +import os +import warnings +from pathlib import Path +from typing import Optional, TypeVar + +import pandas as pd +import pyarrow as pa + +from activitysim.core.exceptions import ( + CheckpointFileNotFoundError, + CheckpointNameNotFoundError, + StateAccessError, + TableNameNotFound, +) +from activitysim.core.workflow.accessor import FromState, StateAccessor + +logger = logging.getLogger(__name__) + +# name of the checkpoint dict keys +# (which are also columns in the checkpoints dataframe stored in hte pipeline store) +TIMESTAMP = "timestamp" +CHECKPOINT_NAME = "checkpoint_name" +NON_TABLE_COLUMNS = [CHECKPOINT_NAME, TIMESTAMP] + +# name used for storing the checkpoints dataframe to the pipeline store +CHECKPOINT_TABLE_NAME = "checkpoints" + +LAST_CHECKPOINT = "_" + +# name of the first step/checkpoint created when the pipeline is started +INITIAL_CHECKPOINT_NAME = "init" +FINAL_CHECKPOINT_NAME = "final" + + +CheckpointStore = TypeVar("CheckpointStore", bound="GenericCheckpointStore") + + +class GenericCheckpointStore: + """Abstract base class defining interface for table storage.""" + + @abc.abstractmethod + def put( + self, + table_name: str, + df: pd.DataFrame, + complib: str = None, + checkpoint_name: str = None, + ) -> None: + """ + Store a table. 
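# A sketch of the accessor pattern defined in workflow/accessor.py above:
# a StateAccessor subclass groups related functionality, and FromState
# descriptors keep their values in the parent State's context under a key
# prefixed with the accessor class name.  "ExampleAccessor" and "counter"
# are illustrative names, assuming workflow.State exposes its context
# mapping as ``_context`` in the way the descriptors above expect.
from activitysim.core import workflow
from activitysim.core.workflow.accessor import FromState, StateAccessor


class ExampleAccessor(StateAccessor):
    """Hypothetical accessor, used only to illustrate the pattern."""

    counter: int = FromState(
        default_value=0,
        doc="Stored in the parent State's context as 'exampleaccessor_counter'.",
    )

    def bump(self) -> int:
        # self._obj is the parent workflow.State instance
        self.counter = self.counter + 1
        return self.counter


# accessors are normally attached as class attributes of State itself
# (e.g. ``checkpoint = Checkpoints()``); binding one by hand behaves the same
state = workflow.State()
example = ExampleAccessor(state)
assert example.bump() == 1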
+ + Parameters + ---------- + table_name : str + df : pd.DataFrame + complib : str + Name of compression library to use. + checkpoint_name : str, optional + The checkpoint version name to use for this table. + """ + + @abc.abstractmethod + def get_dataframe( + self, table_name: str, checkpoint_name: str = None + ) -> pd.DataFrame: + """ + Load table from store as a pandas DataFrame. + + Parameters + ---------- + table_name : str + checkpoint_name : str, optional + The checkpoint version name to use for this table. + + Returns + ------- + pd.DataFrame + """ + + @property + @abc.abstractmethod + def is_readonly(self) -> bool: + """This store is read-only.""" + + @property + @abc.abstractmethod + def is_open(self) -> bool: + """This store is open.""" + + @abc.abstractmethod + def close(self) -> None: + """Close this store.""" + + @property + @abc.abstractmethod + def filename(self) -> Path: + """Location of this store.""" + + def list_checkpoint_names(self) -> list[str]: + """Get a list of all checkpoint names in this store.""" + try: + df = self.get_dataframe(CHECKPOINT_TABLE_NAME) + except Exception: + return [] + else: + return list(df.checkpoint_name) + + @classmethod + def from_hdf( + cls: CheckpointStore, + source_filename: Path, + dest_filename: Path, + mode: str = "a", + ) -> CheckpointStore: + """ + Create a new checkpoint store from an existing HdfStore. + + Parameters + ---------- + source_filename : path-like + The filename of the source HDF5 checkpoint file. This file should + be the output of an ActivitySim run (or constructed alike). + dest_filename : path-like + The filename or directory where a new checkpoint storage will be + created. + mode : str + The file mode used to open the destination. Must not be a read-only + mode or this operation will fail. + + Returns + ------- + CheckpointStore + """ + hdf_store = HdfStore(source_filename, "r") + output_store = cls(dest_filename, mode) + checkpoint_df = hdf_store.get_dataframe(CHECKPOINT_TABLE_NAME) + output_store.put(CHECKPOINT_TABLE_NAME, checkpoint_df) + for table_name in checkpoint_df.columns: + if table_name in NON_TABLE_COLUMNS: + continue + checkpoints_written = set() + for checkpoint_name in checkpoint_df[table_name]: + if checkpoint_name: + df = hdf_store.get_dataframe(table_name, checkpoint_name) + if checkpoint_name and checkpoint_name not in checkpoints_written: + output_store.put( + table_name, df, checkpoint_name=checkpoint_name + ) + checkpoints_written.add(checkpoint_name) + return output_store + + def _get_store_checkpoint_from_named_checkpoint( + self, table_name: str, checkpoint_name: str = LAST_CHECKPOINT + ): + """ + Get the name of the checkpoint where a table is actually written. + + Checkpoint tables are not re-written if the content has not changed, so + retrieving a particular table at a given checkpoint can involve back-tracking + to find where the file was last actually written. + + Parameters + ---------- + table_name : str + checkpoint_name : str, optional + The name of the checkpoint to load. If not given this function + will load the last stored checkpoint value. + + Returns + ------- + str + The checkpoint to actually load. 
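# A sketch of converting a legacy HDF5 pipeline into the new parquet-format
# checkpoint store via the from_hdf constructor defined above.  The file
# paths are illustrative.
from activitysim.core.workflow.checkpoint import ParquetStore

store = ParquetStore.from_hdf(
    source_filename="output/pipeline.h5",  # written by a prior ActivitySim run
    dest_filename="output/pipeline.parquetpipeline",
)
print(store.list_checkpoint_names())  # checkpoint names copied from the HDF store
store.close()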
+ """ + cp_df = self.get_dataframe(CHECKPOINT_TABLE_NAME).set_index(CHECKPOINT_NAME) + if checkpoint_name == LAST_CHECKPOINT: + checkpoint_name = cp_df.index[-1] + try: + return cp_df.loc[checkpoint_name, table_name] + except KeyError: + if checkpoint_name not in cp_df.index: + raise CheckpointNameNotFoundError(checkpoint_name) + elif table_name not in cp_df.columns: + raise TableNameNotFound(table_name) + else: + raise + + +class HdfStore(GenericCheckpointStore): + """Storage interface for HDF5-based table storage.""" + + def __init__(self, filename: Path, mode="a"): + self._hdf5 = pd.HDFStore(str(filename), mode=mode) + + @property + def filename(self) -> Path: + return Path(self._hdf5.filename) + + def _store_table_key(self, table_name, checkpoint_name): + if checkpoint_name: + key = f"{table_name}/{checkpoint_name}" + else: + key = f"/{table_name}" + return key + + def put( + self, + table_name: str, + df: pd.DataFrame, + complib: str = None, + checkpoint_name: str = None, + ) -> None: + key = self._store_table_key(table_name, checkpoint_name) + if complib is None or len(df.columns) == 0: + # tables with no columns can't be compressed successfully, so to + # avoid them getting just lost and dropped they are instead written + # in fixed format with no compression, which should be just fine + # since they have no data anyhow. + self._hdf5.put(key, df) + else: + self._hdf5.put(key, df, "table", complib=complib) + self._hdf5.flush() + + def get_dataframe( + self, table_name: str, checkpoint_name: str = None + ) -> pd.DataFrame: + key = self._store_table_key(table_name, checkpoint_name) + return self._hdf5[key] + + @property + def is_readonly(self) -> bool: + return self._hdf5._mode == "r" + + @property + def is_open(self) -> bool: + return self._hdf5.is_open + + def close(self) -> None: + """Close this store.""" + self._hdf5.close() + + +class ParquetStore(GenericCheckpointStore): + """Storage interface for parquet-based table storage. + + This store will store each saved table in a parquet-format archive, + resulting in a hierarchy of separate files in a defined structure, as + opposed to a single monolithic repository files containing all the data. + + This interface will fall back to storing tables in a gzipped pickle if + the parquet format fails (as might happen if datatypes for some columns + are not homogenous and values are stored as "object"). + """ + + extension = ".parquetpipeline" + + @staticmethod + def _to_parquet(df: pd.DataFrame, filename, *args, **kwargs): + try: + df.to_parquet(filename, *args, **kwargs) + except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as err: + logger.error( + f"Problem writing to {filename}\n" f"{err}\n" f"falling back to pickle" + ) + # fallback to pickle, compatible with more dtypes + df.to_pickle(Path(filename).with_suffix(".pickle.gz")) + + def __init__(self, directory: Path, mode: str = "a", gitignore: bool = True): + """Initialize a storage interface for parquet-based table storage. + + Parameters + ---------- + directory : Path + The file directory for this ParquetStore. If this location does not + include a ".parquetpipeline" or ".zip" suffix, one is added. + mode : {"a", "r"}, default "a" + Mode to open this store, "a"ppend or "r"ead-only. Zipped stores + can only be opened in read-only mode. + gitignore : bool, default True + If not opened in read-only mode, should a ".gitignore" file be added + with a global wildcard (**)? Doing so will help prevent this store + from being accidentally committed to git. 
+ """ + directory = Path(directory) + if directory.suffix == ".zip": + if mode != "r": + raise ValueError("can only open a Zip parquet store as read-only.") + elif directory.suffix != self.extension: + directory = directory.with_suffix(self.extension) + self._directory = directory + self._mode = mode + if self._mode != "r": + self._directory.mkdir(parents=True, exist_ok=True) + if gitignore and not self._directory.joinpath(".gitignore").exists(): + self._directory.joinpath(".gitignore").write_text("**\n") + + @property + def filename(self) -> Path: + """The directory location of this ParquetStore.""" + return self._directory + + def _store_table_path(self, table_name, checkpoint_name): + if checkpoint_name: + return self._directory.joinpath(table_name, f"{checkpoint_name}.parquet") + else: + return self._directory.joinpath(f"{table_name}.parquet") + + def put( + self, + table_name: str, + df: pd.DataFrame, + complib: str = "NOTSET", + checkpoint_name: str = None, + ) -> None: + if self.is_readonly: + raise ValueError("store is read-only") + filepath = self._store_table_path(table_name, checkpoint_name) + filepath.parent.mkdir(parents=True, exist_ok=True) + if complib == "NOTSET": + self._to_parquet(pd.DataFrame(df), filepath) + else: + self._to_parquet(pd.DataFrame(df), filepath, compression=complib) + + def get_dataframe( + self, table_name: str, checkpoint_name: str = None + ) -> pd.DataFrame: + if table_name != CHECKPOINT_TABLE_NAME and checkpoint_name is None: + checkpoint_name = LAST_CHECKPOINT + if self._directory.suffix == ".zip": + import io + import zipfile + + zip_internal_filename = self._store_table_path( + table_name, checkpoint_name + ).relative_to(self._directory) + with zipfile.ZipFile(self._directory, mode="r") as zipf: + namelist = set(zipf.namelist()) + if zip_internal_filename.as_posix() in namelist: + with zipf.open(zip_internal_filename.as_posix()) as zipo: + return pd.read_parquet(zipo) + elif ( + zip_internal_filename.with_suffix(".pickle.gz").as_posix() + in namelist + ): + with zipf.open( + zip_internal_filename.with_suffix(".pickle.gz").as_posix() + ) as zipo: + return pd.read_pickle(zipo, compression="gzip") + checkpoint_name_ = self._get_store_checkpoint_from_named_checkpoint( + table_name, checkpoint_name + ) + if checkpoint_name_ != checkpoint_name: + return self.get_dataframe(table_name, checkpoint_name_) + raise FileNotFoundError(str(zip_internal_filename)) + target_path = self._store_table_path(table_name, checkpoint_name) + if target_path.exists(): + return pd.read_parquet(target_path) + elif target_path.with_suffix(".pickle.gz").exists(): + return pd.read_pickle(target_path.with_suffix(".pickle.gz")) + else: + # the direct-read failed, check for backtracking checkpoint + if checkpoint_name is not None: + checkpoint_name_ = self._get_store_checkpoint_from_named_checkpoint( + table_name, checkpoint_name + ) + if checkpoint_name_ != checkpoint_name: + return self.get_dataframe(table_name, checkpoint_name_) + raise FileNotFoundError(target_path) + + @property + def is_readonly(self) -> bool: + return self._mode == "r" + + @property + def is_open(self) -> bool: + return self._directory is not None and self._directory.is_dir() + + def close(self) -> None: + """Close this store.""" + pass + + def make_zip_archive(self, output_filename) -> Path: + """ + Compress this pipeline into a zip archive. + + Parameters + ---------- + output_filename + + Returns + ------- + Path + Filename of the resulting zipped store. 
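# A sketch of the ParquetStore interface described above: tables are written
# one parquet file per (table, checkpoint), and the whole store can be zipped
# for archiving.  Paths, the table, and the checkpoint name are illustrative.
import pandas as pd

from activitysim.core.workflow.checkpoint import ParquetStore

store = ParquetStore("output/pipeline.parquetpipeline", mode="a")
households = pd.DataFrame(
    {"income": [52000, 61000]}, index=pd.Index([1, 2], name="household_id")
)

store.put("households", households, checkpoint_name="initialize_households")
again = store.get_dataframe("households", checkpoint_name="initialize_households")

zipped = store.make_zip_archive("output/pipeline_archive")  # -> ...pipeline_archive.zip
archived = ParquetStore(zipped, mode="r")  # zipped stores open read-only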
+ """ + output_filename = Path(output_filename) + import zipfile + + if output_filename.suffix != ".zip": + output_filename = output_filename.with_suffix(".zip") + with zipfile.ZipFile(output_filename, "w", zipfile.ZIP_DEFLATED) as zipf: + for root, dirs, files in os.walk(self._directory): + files = [f for f in files if not f[0] == "."] + for f in files: + arcname = Path(root).joinpath(f).relative_to(self._directory) + zipf.write(Path(root).joinpath(f), arcname=arcname) + + return output_filename + + def wipe(self): + """ + Remove this store, including all subdirectories. + """ + if self.is_readonly: + raise ValueError("store is readonly") + walked = list(os.walk(self._directory)) + while walked: + root, dirs, files = walked.pop(-1) + for f in files: + if f.endswith(".parquet"): + os.unlink(os.path.join(root, f)) + # after removing all parquet files, is this directory basically empty? + should_drop_root = True + file_list = {f for f in Path(root).glob("**/*") if f.is_file()} + for f in file_list: + if f not in {".gitignore", ".DS_Store"}: + should_drop_root = False + if should_drop_root: + os.rmdir(root) + + +class NullStore(GenericCheckpointStore): + """ + A NullStore is a dummy that emulates a checkpoint store object. + + It never writes anything to disk and is primarily used to for + temporary data to prevent accidentally overwriting content in + a "real" store. + """ + + def put( + self, + table_name: str, + df: pd.DataFrame, + complib: str = "NOTSET", + checkpoint_name: str = None, + ) -> None: + pass + + def get_dataframe( + self, table_name: str, checkpoint_name: str = None + ) -> pd.DataFrame: + raise ValueError("no data is actually stored in NullStore") + + @property + def is_readonly(self) -> bool: + return False + + @property + def is_open(self) -> bool: + return True + + def close(self) -> None: + """Close this store.""" + pass + + +class Checkpoints(StateAccessor): + """ + State accessor for checkpointing operations. + + See :ref:`State.checkpoint ` for more detailed + documentation. + """ + + last_checkpoint: dict = FromState( + default_init=True, + doc=""" + Metadata about the last saved checkpoint. + + This dictionary contains the name of the checkpoint, a timestamp, and + the checkpoint-lookup for all relevant tables. + """, + ) + checkpoints: list[dict] = FromState( + default_init=True, + doc=""" + Metadata about various saved checkpoint(s). + + Each item in this list is a dictionary similar to the `last_checkpoint`. + """, + ) + _checkpoint_store: GenericCheckpointStore | None = FromState( + default_value=None, + doc=""" + The store where checkpoints are written. 
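Continuing that hypothetical ParquetStore, the archive helpers above can be exercised roughly as follows; a ".zip" store can only be opened read-only, and get_dataframe then reads straight out of the archive.

    from activitysim.core.workflow.checkpoint import ParquetStore

    store = ParquetStore("output/pipeline")    # the writable store sketched earlier
    archive = store.make_zip_archive("output/pipeline_archive")   # -> output/pipeline_archive.zip
    readonly = ParquetStore(archive, mode="r")
    trips = readonly.get_dataframe("trips", checkpoint_name="trip_scheduling")
    store.wipe()   # removes the *.parquet files from the original, writable store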
+ """, + ) + + def __get__(self, instance, objtype=None) -> Checkpoints: + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + def initialize(self): + self.last_checkpoint = {} + self.checkpoints: list[dict] = [] + self._checkpoint_store = None + + @property + def store(self) -> GenericCheckpointStore: + """The store where checkpoints are written.""" + if self._checkpoint_store is None: + self.open_store() + return self._checkpoint_store + + def store_is_open(self) -> bool: + """Whether this checkpoint store is open.""" + if self._checkpoint_store is None: + return False + return self._checkpoint_store.is_open + + def default_pipeline_file_path(self): + if self._obj is None: + # a freestanding accessor not bound to a parent State is not + # typical but does happen when Sphinx generates documentation + return self + prefix = self._obj.get("pipeline_file_prefix", None) + if prefix is None: + return self._obj.filesystem.get_pipeline_filepath() + else: + pipeline_file_name = str(self._obj.filesystem.pipeline_file_name) + pipeline_file_name = f"{prefix}-{pipeline_file_name}" + return self._obj.filesystem.get_output_dir().joinpath(pipeline_file_name) + + def open_store( + self, pipeline_file_name: Optional[Path] = None, overwrite=False, mode="a" + ): + """ + Open the checkpoint store. + + The format for the checkpoint store is determined by the + `checkpoint_format` setting in the top-level Settings. + + Parameters + ---------- + pipeline_file_name : Path-like, optional + An explicit pipeline file path. If not given, the default pipeline + file path is opened. + overwrite : bool, default False + delete file before opening (unless resuming) + mode : {'a', 'w', 'r', 'r+'}, default 'a' + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). + ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist. + """ + + if self._checkpoint_store is not None: + raise RuntimeError("Pipeline store is already open!") + + if pipeline_file_name is None: + pipeline_file_path = self.default_pipeline_file_path() + else: + pipeline_file_path = Path(pipeline_file_name) + + if self._obj.settings.checkpoint_format == "hdf": + if overwrite: + try: + if os.path.isfile(pipeline_file_path): + logger.debug("removing pipeline store: %s" % pipeline_file_path) + os.unlink(pipeline_file_path) + except Exception as e: + print(e) + logger.warning(f"Error removing {pipeline_file_path}: {e}") + + self._checkpoint_store = HdfStore(pipeline_file_path, mode=mode) + else: + self._checkpoint_store = ParquetStore(pipeline_file_path, mode=mode) + + logger.debug(f"opened checkpoint.store {pipeline_file_path}") + + def close_store(self): + """ + Close the checkpoint storage. + """ + if self._checkpoint_store is not None: + self.store.close() + self._checkpoint_store = None + logger.debug("checkpoint.close_store") + + def is_readonly(self): + if self._checkpoint_store is not None: + try: + return self._checkpoint_store.is_readonly + except AttributeError: + return None + return False + + def last_checkpoint_name(self): + if self.last_checkpoint: + try: + return self.last_checkpoint.get("checkpoint_name", None) + except AttributeError: + return None + else: + return None + + def add(self, checkpoint_name: str): + """ + Create a new checkpoint with specified name. 
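In practice the backend is not constructed by hand: the open_store method above picks HdfStore or ParquetStore from the top-level checkpoint_format setting, and the store property opens the default pipeline path lazily on first access. A rough sketch, with a hypothetical model directory:

    from activitysim.core import workflow

    state = workflow.State.make_default("example_model")   # hypothetical working directory
    state.settings.checkpoint_format = "hdf"   # any other value selects the parquet backend
    state.checkpoint.open_store()              # resolves the default pipeline path, append mode
    # ... run steps, which add checkpoints as they complete ...
    state.checkpoint.close_store()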
+ + Adding a checkpoint will write into the checkpoint store + all the data required to restore the simulation to its + current state. + + Parameters + ---------- + checkpoint_name : str + """ + timestamp = dt.datetime.now() + + logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, timestamp)) + + for table_name in self._obj.uncheckpointed_table_names(): + df = self._obj.get_dataframe(table_name) + logger.debug(f"add_checkpoint {checkpoint_name!r} table {table_name!r}") + self._write_df(df, table_name, checkpoint_name) + + # remember which checkpoint it was last written + self.last_checkpoint[table_name] = checkpoint_name + self._obj.existing_table_status[table_name] = False + + self.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name + self.last_checkpoint[TIMESTAMP] = timestamp + + # append to the array of checkpoint history + self.checkpoints.append(self.last_checkpoint.copy()) + + # create a pandas dataframe of the checkpoint history, one row per checkpoint + checkpoints = pd.DataFrame(self.checkpoints) + + # convert empty values to str so PyTables doesn't pickle object types + for c in checkpoints.columns: + checkpoints[c] = checkpoints[c].fillna("") + + # write it to the store, overwriting any previous version (no way to simply extend) + self._write_df(checkpoints, CHECKPOINT_TABLE_NAME) + + def _read_df( + self, table_name, checkpoint_name=None, store: GenericCheckpointStore = None + ): + """ + Read a pandas dataframe from the pipeline store. + + We store multiple versions of all simulation tables, for every checkpoint in which they change, + so we need to know both the table_name and the checkpoint_name of hte desired table. + + The only exception is the checkpoints dataframe, which just has a table_name + + An error will be raised by HDFStore if the table is not found + + Parameters + ---------- + table_name : str + checkpoint_name : str + + Returns + ------- + df : pandas.DataFrame + the dataframe read from the store + + """ + if store is None: + store = self.store + return store.get_dataframe(table_name, checkpoint_name) + + def _write_df( + self, + df: pd.DataFrame, + table_name: str, + checkpoint_name: str = None, + store: GenericCheckpointStore = None, + ): + """ + Write a pandas dataframe to the pipeline store. + + We store multiple versions of all simulation tables, for every checkpoint in which they change, + so we need to know both the table_name and the checkpoint_name to label the saved table + + The only exception is the checkpoints dataframe, which just has a table_name, + although when using the parquet storage format this file is stored as "None.parquet" + to maintain a simple consistent file directory structure. + + Parameters + ---------- + df : pandas.DataFrame + dataframe to store + table_name : str + also conventionally the injected table name + checkpoint_name : str + the checkpoint at which the table was created/modified + store : GenericCheckpointStore, optional + Write to this store instead of the default store. 
+ """ + if store is None: + store = self.store + + # coerce column names to str as unicode names will cause PyTables to pickle them + df.columns = df.columns.astype(str) + + store.put( + table_name, + df, + complib=self._obj.settings.pipeline_complib, + checkpoint_name=checkpoint_name, + ) + + def list_tables(self): + """ + Return a list of the names of all checkpointed tables + """ + return [ + name + for name, checkpoint_name in self.last_checkpoint.items() + if checkpoint_name and name not in NON_TABLE_COLUMNS + ] + + def load(self, checkpoint_name: str, store=None): + """ + Load dataframes and restore random number channel state from pipeline hdf5 file. + This restores the pipeline state that existed at the specified checkpoint in a prior simulation. + This allows us to resume the simulation after the specified checkpoint + + Parameters + ---------- + checkpoint_name : str + model_name of checkpoint to load (resume_after argument to open_pipeline) + """ + + logger.info(f"load_checkpoint {checkpoint_name} from {self.store.filename}") + + try: + checkpoints = self._read_df(CHECKPOINT_TABLE_NAME, store=store) + except FileNotFoundError as err: + raise CheckpointFileNotFoundError(err) from None + + if checkpoint_name == LAST_CHECKPOINT: + checkpoint_name = checkpoints[CHECKPOINT_NAME].iloc[-1] + logger.info(f"loading checkpoint '{checkpoint_name}'") + + try: + # truncate rows after target checkpoint + i = checkpoints[checkpoints[CHECKPOINT_NAME] == checkpoint_name].index[0] + checkpoints = checkpoints.loc[:i] + + # if the store is not open in read-only mode, + # write it to the store to ensure so any subsequent checkpoints are forgotten + if ( + store is None + and not self.is_readonly + and isinstance(self.store, pd.HDFStore) + ): + self._write_df(checkpoints, CHECKPOINT_TABLE_NAME) + + except IndexError: + msg = f"Couldn't find checkpoint '{checkpoint_name}' in checkpoints" + print(checkpoints[CHECKPOINT_NAME]) + logger.error(msg) + raise RuntimeError(msg) from None + + # convert pandas dataframe back to array of checkpoint dicts + checkpoints = checkpoints.to_dict(orient="records") + + # drop tables with empty names + for checkpoint in checkpoints: + for key in list(checkpoint.keys()): + if key not in NON_TABLE_COLUMNS and not checkpoint[key]: + del checkpoint[key] + + if store is None: + # patch _CHECKPOINTS array of dicts + self.checkpoints = checkpoints + + # patch _CHECKPOINTS dict with latest checkpoint info + self.last_checkpoint.clear() + self.last_checkpoint.update(self.checkpoints[-1]) + + logger.info( + "load_checkpoint %s timestamp %s" + % (checkpoint_name, self.last_checkpoint["timestamp"]) + ) + + tables = self.list_tables() + last_checkpoint = self.last_checkpoint + + else: + last_checkpoint = checkpoints[-1] + tables = [ + name + for name, checkpoint_name in last_checkpoint.items() + if checkpoint_name and name not in NON_TABLE_COLUMNS + ] + + loaded_tables = {} + for table_name in tables: + # read dataframe from pipeline store + df = self._read_df( + table_name, checkpoint_name=last_checkpoint[table_name], store=store + ) + logger.info("load_checkpoint table %s %s" % (table_name, df.shape)) + # register it as an orca table + self._obj.add_table(table_name, df) + loaded_tables[table_name] = df + if table_name == "land_use" and "_original_zone_id" in df.columns: + # The presence of _original_zone_id indicates this table index was + # decoded to zero-based, so we need to disable offset + # processing for legacy skim access. 
+ # TODO: this "magic" column name should be replaced with a mechanism + # to write and recover particular settings from the pipeline + # store, but we don't have that mechanism yet + try: + self._obj.settings.offset_preprocessing = True + except StateAccessError: + pass + # self.obj.default_settings() + # self.obj.settings.offset_preprocessing = True + + # register for tracing in order that tracing.register_traceable_table wants us to register them + traceable_tables = self._obj.tracing.traceable_tables + + for table_name in traceable_tables: + if table_name in loaded_tables: + self._obj.tracing.register_traceable_table( + table_name, loaded_tables[table_name] + ) + + # add tables of known rng channels + rng_channels = self._obj.get_injectable("rng_channels", []) + if rng_channels: + logger.debug("loading random channels %s" % rng_channels) + for table_name in rng_channels: + if table_name in loaded_tables: + logger.debug("adding channel %s" % (table_name,)) + self._obj.rng().add_channel(table_name, loaded_tables[table_name]) + + if store is not None: + # we have loaded from an external store, so we make a new checkpoint + # with the same name as the one we just loaded. + self.add(checkpoint_name) + + def get_inventory(self): + """ + Get pandas dataframe of info about all checkpoints stored in pipeline + + pipeline doesn't have to be open + + Returns + ------- + checkpoints_df : pandas.DataFrame + + """ + df = self.store.get_dataframe(CHECKPOINT_TABLE_NAME) + # non-table columns first (column order in df is random because created from a dict) + table_names = [ + name for name in df.columns.values if name not in NON_TABLE_COLUMNS + ] + + df = df[NON_TABLE_COLUMNS + table_names] + + return df + + def restore(self, resume_after=None, mode="a"): + """ + Restore state from checkpoints. + + This can be used with "resume_after" to get the correct checkpoint, + or for a new run. + + If resume_after, then we expect the pipeline hdf5 file to exist and contain + checkpoints from a previous run, including a checkpoint with name specified in resume_after + + Parameters + ---------- + resume_after : str or None + name of checkpoint to load from pipeline store + mode : {'a', 'w', 'r', 'r+'}, default 'a' + same as for typical opening of H5Store. Ignored unless resume_after + is not None. This is here to allow read-only pipeline for benchmarking. + """ + + self._obj.init_state() + + if resume_after: + # open existing pipeline + logger.debug("checkpoint.restore - open existing pipeline") + if self._checkpoint_store is None: + self.open_store(overwrite=False, mode=mode) + try: + self.load(resume_after) + except (KeyError, CheckpointFileNotFoundError) as err: + if ( + isinstance(err, CheckpointFileNotFoundError) + or "checkpoints" in err.args[0] + ): + # no checkpoints initialized, fall back to restart + self.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME + self.add(INITIAL_CHECKPOINT_NAME) + else: + raise + logger.debug(f"restore from checkpoint {resume_after} complete") + else: + # open new, empty pipeline + logger.debug("checkpoint.restore - new, empty pipeline") + if self._checkpoint_store is None: + self.open_store(overwrite=True) + # - not sure why I thought we needed this? + # could have exogenous tables or prng instantiation under some circumstance?? 
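Continuing the hypothetical state from the sketch above, resuming through the restore method assumes a prior run already wrote a checkpoint with the given name to this state's pipeline store:

    # reopen the existing pipeline store and roll state forward to the named checkpoint;
    # with resume_after=None a new, empty pipeline is opened instead
    state.checkpoint.restore(resume_after="trip_mode_choice")
    print(state.checkpoint.last_checkpoint_name())   # -> "trip_mode_choice"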
+ self.last_checkpoint[CHECKPOINT_NAME] = INITIAL_CHECKPOINT_NAME + # empty table, in case they have turned off all checkpointing + self.add(INITIAL_CHECKPOINT_NAME) + + logger.debug(f"restore from tabula rasa complete") + + def restore_from(self, location: Path, checkpoint_name: str = LAST_CHECKPOINT): + """ + Restore state from an alternative pipeline store. + + The checkpoint history is collapsed when reading out of an alternative + store location, given the presumption that if the use wanted to load a + prior intermediate state, that could be done so from the same outside + store, and the history does not need to be also preserved in the active + checkpoint store. + + Parameters + ---------- + location : Path-like + Location of pipeline store to load. + checkpoint_name : str + name of checkpoint to load from pipeline store + """ + self._obj.init_state() + logger.debug(f"checkpoint.restore_from - opening {location}") + if isinstance(location, str): + location = Path(location) + if self._obj.settings.checkpoint_format == "hdf": + from_store = HdfStore(location, mode="r") + else: + from_store = ParquetStore(location, mode="r") + self.load(checkpoint_name, store=from_store) + logger.debug(f"checkpoint.restore_from of {checkpoint_name} complete") + + def check_against(self, location: Path, checkpoint_name: str): + """ + Check that the tables in this State match those in an archived pipeline. + + Parameters + ---------- + location : Path-like + checkpoint_name : str + + Raises + ------ + AssertionError + If any registered table does not match. + """ + __tracebackhide__ = True # don't show this code in pytest outputs + + for table_name in self._obj.registered_tables(): + local_table = self._obj.get_dataframe(table_name) + logger.info(f"table {table_name!r}: shalpe1 {local_table.shape}") + + from .state import State + + ref_state = State() + ref_state.default_settings() + ref_state.checkpoint._checkpoint_store = NullStore() + + if isinstance(location, str): + location = Path(location) + if self._obj.settings.checkpoint_format == "hdf": + from_store = HdfStore(location, mode="r") + else: + from_store = ParquetStore(location, mode="r") + ref_state.checkpoint.load(checkpoint_name, store=from_store) + registered_tables = ref_state.registered_tables() + if len(registered_tables) == 0: + logger.warning("no tables checked") + for table_name in registered_tables: + local_table = self._obj.get_dataframe(table_name) + ref_table = ref_state.get_dataframe(table_name) + cols_in_run_but_not_ref = set(local_table.columns) - set(ref_table.columns) + cols_in_ref_but_not_run = set(ref_table.columns) - set(local_table.columns) + if cols_in_ref_but_not_run: + msg = f"checkpoint {checkpoint_name!r} table {table_name!r} column names mismatch" + if cols_in_run_but_not_ref: + msg += ( + f"\ncolumns found but not expected: {cols_in_run_but_not_ref}" + ) + if cols_in_ref_but_not_run: + msg += ( + f"\ncolumns expected but not found: {cols_in_ref_but_not_run}" + ) + raise AssertionError(msg) + elif cols_in_run_but_not_ref: + # if there are extra columns output that were not expected, but + # we at least have all the column names that were expected, just + # warn, not error + warnings.warn( + f"checkpoint {checkpoint_name!r} table {table_name!r}\n" + f"columns found but not expected: {cols_in_run_but_not_ref}" + ) + if len(ref_table.columns) == 0: + try: + pd.testing.assert_index_equal(local_table.index, ref_table.index) + except Exception as err: + raise AssertionError( + f"checkpoint {checkpoint_name!r} table 
{table_name!r}, {str(err)}" + ) + else: + logger.info(f"table {table_name!r}: ok") + else: + try: + pd.testing.assert_frame_equal( + local_table[ref_table.columns], ref_table, check_dtype=False + ) + except Exception as err: + raise AssertionError( + f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}" + ) + else: + logger.info(f"table {table_name!r}: ok") + + def cleanup(self): + """ + Remove intermediate checkpoints from pipeline. + + These are the steps to clean up: + + - Open main pipeline if not already open (it may be closed if + running with multiprocessing), + - Create a new single-checkpoint pipeline file with the latest + version of all checkpointed tables, + - Delete the original main pipeline and any subprocess pipelines + + This method is generally called at the end of a successful model + run, as it removes the intermediate checkpoint files. + + Called if cleanup_pipeline_after_run setting is True + + """ + # we don't expect to be called unless cleanup_pipeline_after_run setting is True + if not self._obj.settings.cleanup_pipeline_after_run: + logger.warning("will not clean up, `cleanup_pipeline_after_run` is False") + return + + if not self.store_is_open(): + self.restore(LAST_CHECKPOINT) + + assert self.store_is_open(), f"Pipeline is not open." + + FINAL_PIPELINE_FILE_NAME = f"final_{self._obj.filesystem.pipeline_file_name}" + FINAL_CHECKPOINT_NAME = "final" + + if self._obj.settings.checkpoint_format == "hdf": + # constructing the path manually like this will not create a + # subdirectory that competes with the HDF5 filename. + final_pipeline_file_path = self._obj.filesystem.get_output_dir().joinpath( + FINAL_PIPELINE_FILE_NAME + ) + else: + # calling for a subdir ensures that the subdirectory exists. + final_pipeline_file_path = self._obj.filesystem.get_output_dir( + subdir=FINAL_PIPELINE_FILE_NAME + ) + + # keep only the last row of checkpoints and patch the last checkpoint name + checkpoints_df = self.get_inventory().tail(1).copy() + checkpoints_df["checkpoint_name"] = FINAL_CHECKPOINT_NAME + + if self._obj.settings.checkpoint_format == "hdf": + with pd.HDFStore( + str(final_pipeline_file_path), mode="w" + ) as final_pipeline_store: + for table_name in self.list_tables(): + # patch last checkpoint name for all tables + checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME + + table_df = self._obj.get_table(table_name) + logger.debug( + f"cleanup_pipeline - adding table {table_name} {table_df.shape}" + ) + + final_pipeline_store[table_name] = table_df + + final_pipeline_store[CHECKPOINT_TABLE_NAME] = checkpoints_df + self.close_store() + else: + for table_name in self.list_tables(): + # patch last checkpoint name for all tables + checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME + + table_df = self._obj.get_table(table_name) + logger.debug( + f"cleanup_pipeline - adding table {table_name} {table_df.shape}" + ) + table_dir = final_pipeline_file_path.joinpath(table_name) + if not table_dir.exists(): + table_dir.mkdir(parents=True) + ParquetStore._to_parquet( + table_df, table_dir.joinpath(f"{FINAL_CHECKPOINT_NAME}.parquet") + ) + final_pipeline_file_path.joinpath(CHECKPOINT_TABLE_NAME).mkdir( + parents=True, exist_ok=True + ) + ParquetStore._to_parquet( + checkpoints_df, + final_pipeline_file_path.joinpath( + CHECKPOINT_TABLE_NAME, "None.parquet" + ), + ) + + logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") + self._obj.tracing.delete_output_files("h5", ignore=[final_pipeline_file_path]) + + # delete all ParquetStore except final + 
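A sketch of the restore_from and check_against helpers above; the reference paths and the "final" checkpoint name are hypothetical, and the comparison raises AssertionError on any column or value mismatch.

    from pathlib import Path

    # seed a fresh state from an archived pipeline, then compare the registered
    # tables against a zipped reference copy of the same pipeline
    state.checkpoint.restore_from(Path("reference/pipeline.parquetpipeline"))
    state.checkpoint.check_against(Path("reference/pipeline.zip"), checkpoint_name="final")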
pqps = list( + self._obj.filesystem.get_output_dir().glob(f"**/*{ParquetStore.extension}") + ) + for pqp in pqps: + if pqp.name != final_pipeline_file_path.name: + ParquetStore(pqp).wipe() + + def load_dataframe(self, table_name, checkpoint_name=None): + """ + Return pandas dataframe corresponding to table_name + + if checkpoint_name is None, return the current (most recent) version of the table. + The table can be a checkpointed table or any registered orca table (e.g. function table) + + if checkpoint_name is specified, return table as it was at that checkpoint + (the most recently checkpointed version of the table at or before checkpoint_name) + + Parameters + ---------- + table_name : str + checkpoint_name : str or None + + Returns + ------- + df : pandas.DataFrame + """ + + if table_name not in self.last_checkpoint and self._obj.is_table(table_name): + if checkpoint_name is not None: + raise RuntimeError( + f"checkpoint.dataframe: checkpoint_name ({checkpoint_name!r}) not " + f"supported for non-checkpointed table {table_name!r}" + ) + + return self._obj.get_dataframe(table_name) + + # if there is no checkpoint name given, do not attempt to read from store + if checkpoint_name is None: + if table_name not in self.last_checkpoint: + raise RuntimeError("table '%s' never checkpointed." % table_name) + + if not self.last_checkpoint[table_name]: + raise RuntimeError("table '%s' was dropped." % table_name) + + return self._obj.get_dataframe(table_name) + + # find the requested checkpoint + checkpoint = next( + (x for x in self.checkpoints if x["checkpoint_name"] == checkpoint_name), + None, + ) + if checkpoint is None: + raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name) + + # find the checkpoint that table was written to store + last_checkpoint_name = checkpoint.get(table_name, None) + + if not last_checkpoint_name: + raise RuntimeError( + "table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name) + ) + + # if this version of table is same as current + if self.last_checkpoint.get(table_name, None) == last_checkpoint_name: + return self._obj.get_dataframe(table_name) + + return self._read_df(table_name, last_checkpoint_name) diff --git a/activitysim/core/workflow/chunking.py b/activitysim/core/workflow/chunking.py new file mode 100644 index 0000000000..90bc8ee17e --- /dev/null +++ b/activitysim/core/workflow/chunking.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import logging.config +import threading + +from activitysim.core.workflow.accessor import FromState, StateAccessor + +logger = logging.getLogger(__name__) + + +def _init_historian(): + from activitysim.core.chunk import ChunkHistorian + + return ChunkHistorian() + + +class Chunking(StateAccessor): + """ + This accessor provides chunking tools. 
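A short sketch of load_dataframe above, with hypothetical table and checkpoint names; without a checkpoint name it returns the current version of the table, and with one it returns the most recently checkpointed version at or before that checkpoint.

    trips_now = state.checkpoint.load_dataframe("trips")
    trips_then = state.checkpoint.load_dataframe("trips", checkpoint_name="trip_destination")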
+ """ + + def __get__(self, instance, objtype=None) -> Chunking: + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + CHUNK_LEDGERS: list = FromState(default_init=True) + CHUNK_SIZERS: list = FromState(default_init=True) + ledger_lock: threading.Lock = FromState(default_init=True) + HISTORIAN = FromState(default_init=_init_historian) diff --git a/activitysim/core/workflow/dataset.py b/activitysim/core/workflow/dataset.py new file mode 100644 index 0000000000..8cf9b55726 --- /dev/null +++ b/activitysim/core/workflow/dataset.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import logging +from collections.abc import Iterable + +import xarray as xr +from sharrow.dataset import construct + +from .accessor import StateAccessor + +logger = logging.getLogger(__name__) + + +class Datasets(StateAccessor): + """ + This accessor provides easy access to state tables and datasets. + + Named datasets are available as attributes on this accessor, for + example `State.dataset.persons`, which is the equivalent of calling + `State.get_dataset("persons")`. + + Otherwise, there are no "normal" methods for this accessor, but the + `__dir__` method is implemented to provide tab-completion capabilities + inside compatible interactive interpreters (including Jupyter notebooks). + """ + + def __get__(self, instance, objtype=None) -> Datasets: + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + def __dir__(self) -> Iterable[str]: + return ( + self._obj.existing_table_status.keys() | self._obj._LOADABLE_TABLES.keys() + ) + + def __getattr__(self, item) -> xr.Dataset: + if item in self._obj.existing_table_status: + return self._obj.get_dataset(item) + elif item in self._obj._LOADABLE_TABLES: + arg_value = self._obj._LOADABLE_TABLES[item](self._obj._context) + return construct(arg_value) + raise AttributeError(item) diff --git a/activitysim/core/workflow/examples.py b/activitysim/core/workflow/examples.py new file mode 100644 index 0000000000..6214ef45fa --- /dev/null +++ b/activitysim/core/workflow/examples.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from pathlib import Path + +from activitysim.core import workflow + + +def create_example( + example_name: str, + directory: Path | str = None, + temp: bool = False, +) -> workflow.State: + """ + Create an example model. + + Parameters + ---------- + example_name : str + directory : Path-like, optional + Install the example into this directory. + temp : bool, default False + Install the example into a temporary directory tied to the returned + State object. Cannot be set to True if `directory` is given. + + Returns + ------- + State + """ + if temp: + if directory is not None: + raise ValueError("cannot give `directory` and also `temp`") + import tempfile + + temp_dir = tempfile.TemporaryDirectory() + directory = temp_dir.name + else: + temp_dir = None + if directory is None: + directory = Path.cwd() + + directory = Path(directory) + + # import inside function to prevent circular references. 
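A sketch of create_example together with the Datasets accessor above; the example name is assumed to be one that get_example can install, and attribute access assumes the example's tables are loadable from its input data.

    from activitysim.core.workflow.examples import create_example

    state = create_example("prototype_mtc", temp=True)   # installed into a temp directory
    persons = state.dataset.persons    # xarray.Dataset, equivalent to state.get_dataset("persons")
    print(sorted(dir(state.dataset)))  # tab-completion list of registered/loadable tables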
+ from activitysim.examples import get_example + + installed_to, subdirs = get_example( + example_name, destination=directory, with_subdirs=True + ) + state = workflow.State.make_default(installed_to, **subdirs) + if temp: + state.set("_TEMP_DIR_", temp_dir) + return state diff --git a/activitysim/core/workflow/extending.py b/activitysim/core/workflow/extending.py new file mode 100644 index 0000000000..b75829f77b --- /dev/null +++ b/activitysim/core/workflow/extending.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from activitysim.core.workflow.accessor import StateAccessor + + +class Extend(StateAccessor): + """ + Methods to extend the capabilities of ActivitySim. + """ + + def __get__(self, instance, objtype=None) -> "Extend": + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + def declare_table( + self, table_name, traceable=True, random_channel=True, index_name=None + ): + """ + Declare a new table. + + Parameters + ---------- + table_name : str + traceable : bool, default True + random_channel : bool, default True + index_name : str, optional + + """ + + traceable_tables = self._obj.tracing.traceable_tables + if traceable and table_name not in traceable_tables: + traceable_tables.append(table_name) + self._obj.set("traceable_tables", traceable_tables) + + from activitysim.abm.models.util import canonical_ids + + rng_channels = self._obj.get("rng_channels") + if random_channel and table_name not in rng_channels: + rng_channels.append(table_name) + self._obj.set("rng_channels", rng_channels) + + canonical_table_index_names = self._obj.get("canonical_table_index_names") + if index_name is not None and table_name not in canonical_table_index_names: + canonical_table_index_names[table_name] = index_name + self._obj.set("canonical_table_index_names", canonical_table_index_names) diff --git a/activitysim/core/workflow/logging.py b/activitysim/core/workflow/logging.py new file mode 100644 index 0000000000..5f0ddd6fb3 --- /dev/null +++ b/activitysim/core/workflow/logging.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +import logging +import logging.config +import os +import sys +from collections.abc import Mapping, MutableMapping + +import yaml + +from activitysim.core.workflow.accessor import StateAccessor + +logger = logging.getLogger(__name__) + +ASIM_LOGGER = "activitysim" +CSV_FILE_TYPE = "csv" +LOGGING_CONF_FILE_NAME = "logging.yaml" + + +def _rewrite_config_dict(state, x): + if isinstance(x, Mapping): + # When a log config is a mapping of a single key that is `get_log_file_path` + # we apply the get_log_file_path method to the value, which can add a + # prefix (usually for multiprocessing) + if len(x.keys()) == 1 and "get_log_file_path" in x.keys(): + return _rewrite_config_dict( + state, state.get_log_file_path(x["get_log_file_path"]) + ) + # When a log config is a mapping of two keys that are `is_sub_task` + # and `is_not_sub_task`, we check the `is_sub_task` value in this context, + # and choose the appropriate value + elif ( + len(x.keys()) == 2 + and "is_sub_task" in x.keys() + and "is_not_sub_task" in x.keys() + ): + is_sub_task = state.get_injectable("is_sub_task", False) + return _rewrite_config_dict( + state, x["is_sub_task"] if is_sub_task else x["is_not_sub_task"] + ) + # accept alternate spelling "if_sub_task" in addition to "is_sub_task" + elif ( + len(x.keys()) == 2 + and "if_sub_task" in x.keys() + and "if_not_sub_task" in x.keys() + ): + is_sub_task = state.get_injectable("is_sub_task", 
False) + return _rewrite_config_dict( + state, x["if_sub_task"] if is_sub_task else x["if_not_sub_task"] + ) + else: + return {k: _rewrite_config_dict(state, v) for (k, v) in x.items()} + elif isinstance(x, list): + return [_rewrite_config_dict(state, v) for v in x] + elif isinstance(x, tuple): + return tuple(_rewrite_config_dict(state, v) for v in x) + else: + return x + + +class Logging(StateAccessor): + """ + This accessor provides logging tools. + """ + + def __get__(self, instance, objtype=None) -> Logging: + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + def config_logger(self, basic=False): + """ + Configure logger + + look for conf file in configs_dir, if not found use basicConfig + + Parameters + ---------- + basic : bool or int, default False + When set, ignore configuration file and just set logging to + use streaming to stdout. True implies logging level INFO, + or set to a different integer for that level. + + """ + + # look for conf file in configs_dir + if basic: + log_config_file = None + else: + log_config_file = self._obj.filesystem.get_config_file_path( + LOGGING_CONF_FILE_NAME, mandatory=False + ) + + if log_config_file: + try: + with open(log_config_file) as f: + config_dict = yaml.load(f, Loader=yaml.SafeLoader) + except Exception as e: + print(f"Unable to read logging config file {log_config_file}") + raise e + + config_dict = _rewrite_config_dict(self._obj, config_dict) + + try: + config_dict = config_dict["logging"] + config_dict.setdefault("version", 1) + logging.config.dictConfig(config_dict) + except Exception as e: + logging.warning( + f"Unable to config logging as specified in {log_config_file}" + ) + logging.warning( + "ActivitySim now requires YAML files to be loaded in " + "safe mode, check your file for unsafe tags such as " + "`!!python/object/apply`" + ) + raise e + + else: + if basic is True: + basic = logging.INFO + logging.basicConfig(level=basic, stream=sys.stdout) + + logger = logging.getLogger(ASIM_LOGGER) + + if log_config_file: + logger.info("Read logging configuration from: %s" % log_config_file) + else: + logger.log(basic, "Configured logging using basicConfig") + + def rotate_log_directory(self): + + output_dir = self._obj.filesystem.get_output_dir() + log_dir = output_dir.joinpath("log") + if not log_dir.exists(): + return + + from datetime import datetime + from stat import ST_CTIME + + old_log_time = os.stat(log_dir)[ST_CTIME] + rotate_name = os.path.join( + output_dir, + datetime.fromtimestamp(old_log_time).strftime("log--%Y-%m-%d--%H-%M-%S"), + ) + try: + os.rename(log_dir, rotate_name) + except Exception as err: + # if Windows fights us due to permissions or whatever, + print(f"unable to rotate log file, {err!r}") + else: + # on successful rotate, create new empty log directory + os.makedirs(log_dir) diff --git a/activitysim/core/workflow/report.py b/activitysim/core/workflow/report.py new file mode 100644 index 0000000000..97cdca339f --- /dev/null +++ b/activitysim/core/workflow/report.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import logging + +from activitysim.core.workflow.accessor import StateAccessor + +logger = logging.getLogger(__name__) + + +class Reporting(StateAccessor): + """ + Tools for reporting and visualization + """ + + def __get__(self, instance, objtype=None) -> "Reporting": + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) diff --git a/activitysim/core/workflow/runner.py 
b/activitysim/core/workflow/runner.py new file mode 100644 index 0000000000..f4b72cb413 --- /dev/null +++ b/activitysim/core/workflow/runner.py @@ -0,0 +1,410 @@ +from __future__ import annotations + +import logging +import multiprocessing +import time +import warnings +from datetime import timedelta +from typing import Callable, Iterable + +from activitysim.core import tracing +from activitysim.core.exceptions import DuplicateWorkflowNameError +from activitysim.core.workflow.accessor import FromState, StateAccessor +from activitysim.core.workflow.checkpoint import ( + CHECKPOINT_NAME, + CHECKPOINT_TABLE_NAME, + FINAL_CHECKPOINT_NAME, + LAST_CHECKPOINT, +) +from activitysim.core.workflow.steps import run_named_step +from activitysim.core.workflow.util import write_notebook_heading + +# single character prefix for run_list model name to indicate that no checkpoint should be saved +NO_CHECKPOINT_PREFIX = "_" + + +logger = logging.getLogger(__name__) + + +def _format_elapsed_time(t): + t = round(t, 3) + if t < 60: + return f"{round(t, 3)} seconds" + td = str(timedelta(seconds=t)).rstrip("0") + if td.startswith("0:"): + td = td[2:] + return td + + +def split_arg(s: str, sep: str, default="") -> tuple[str, Any]: + """ + Split a string into two parts. + + When the part after the seperator is "true" or "false" (case-insensitive) + the second element of the returned tuple is the matching boolean value, + not a string. + + Parameters + ---------- + s : str + The string to split. + sep : str + The split character. + default : Any, default "" + The second part is by default an empty string, but through this + argument this can be overridden to be some other value. + + """ + r = s.split(sep, 2) + r = list(map(str.strip, r)) + + arg = r[0] + + if len(r) == 1: + val = default + else: + val = r[1] + val = {"true": True, "false": False}.get(val.lower(), val) + + return arg, val + + +class Runner(StateAccessor): + """ + This accessor provides the tools to actually run ActivitySim workflow steps. + """ + + def __get__(self, instance, objtype=None) -> "Runner": + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + def __call__(self, models, resume_after=None, memory_sidecar_process=None): + """ + run the specified list of models, optionally loading checkpoint and resuming after specified + checkpoint. + + Since we use model_name as checkpoint name, the same model may not be run more than once. + + If resume_after checkpoint is specified and a model with that name appears in the models list, + then we only run the models after that point in the list. This allows the user always to pass + the same list of models, but specify a resume_after point if desired. + + Parameters + ---------- + models : list[str] or Callable + A list of the model names to run, which should all have been + registered with the @workflow.step decorator. Alternative, give + a single function that is or could have been so-decorated. 
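split_arg backs the "model_name.key=value;key2" syntax parsed in _pre_run_step below; a few hypothetical invocations:

    split_arg("trace_hh_id=982875", "=")                  # -> ("trace_hh_id", "982875")
    split_arg("compute_settings=false", "=")              # -> ("compute_settings", False)
    split_arg("write_trip_matrices", "=", default=True)   # -> ("write_trip_matrices", True)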
+ resume_after : str or None + model_name of checkpoint to load checkpoint and AFTER WHICH to resume model run + memory_sidecar_process : MemorySidecar, optional + Subprocess that monitors memory usage + + returns: + nothing, but with pipeline open + """ + if isinstance(models, Callable) and models.__name__ is not None: + if models is self._obj._RUNNABLE_STEPS.get(models.__name__, None): + self([models.__name__], resume_after=None, memory_sidecar_process=None) + elif models is self._obj._LOADABLE_OBJECTS.get(models.__name__, None): + self._obj.set(models.__name__, self._obj.get(models.__name__)) + elif models is self._obj._LOADABLE_TABLES.get(models.__name__, None): + self._obj.set(models.__name__, self._obj.get(models.__name__)) + else: + raise DuplicateWorkflowNameError(models.__name__) + return + + if isinstance(models, str): + return self.by_name(models) + + t0 = tracing.print_elapsed_time() + + if resume_after == LAST_CHECKPOINT: + _checkpoints = self._obj.checkpoint.store.list_checkpoint_names() + if len(_checkpoints): + _resume_after = _checkpoints[-1] + else: + # nothing available in the checkpoint.store, cannot resume_after + resume_after = _resume_after = None + else: + _resume_after = resume_after + + if _resume_after: + + if ( + _resume_after != self._obj.checkpoint.last_checkpoint_name() + or self._obj.uncheckpointed_table_names() + ): + logger.debug( + f"last_checkpoint_name = {self._obj.checkpoint.last_checkpoint_name()}" + ) + logger.debug( + f"uncheckpointed_table_names = {self._obj.uncheckpointed_table_names()}" + ) + logger.debug(f"restoring from store with resume_after = {resume_after}") + self._obj.checkpoint.restore(resume_after) + t0 = tracing.print_elapsed_time("checkpoint.restore", t0) + else: + logger.debug(f"good to go with resume_after = {resume_after}") + + if resume_after == LAST_CHECKPOINT: + resume_after = self._obj.checkpoint.last_checkpoint[CHECKPOINT_NAME] + + if resume_after: + logger.info("resume_after %s" % resume_after) + if resume_after in models: + models = models[models.index(resume_after) + 1 :] + + self._obj.trace_memory_info("pipeline.run before preload_injectables") + + # preload any bulky injectables (e.g. 
skims) not in pipeline + if self._obj.get("preload_injectables", None): + if memory_sidecar_process: + memory_sidecar_process.set_event("preload_injectables") + t0 = tracing.print_elapsed_time("preload_injectables", t0) + + self._obj.trace_memory_info("pipeline.run after preload_injectables") + + t0 = tracing.print_elapsed_time() + for model in models: + if memory_sidecar_process: + memory_sidecar_process.set_event(model) + t1 = tracing.print_elapsed_time() + self.by_name(model) + self._obj.trace_memory_info(f"pipeline.run after {model}") + + self.log_runtime(model_name=model, start_time=t1) + + if memory_sidecar_process: + memory_sidecar_process.set_event("finalizing") + + # add checkpoint with final tables even if not intermediate checkpointing + if not self._obj.should_save_checkpoint(): + self._obj.checkpoint.add(FINAL_CHECKPOINT_NAME) + + self._obj.trace_memory_info("pipeline.run after run_models") + + t0 = tracing.print_elapsed_time("run_model (%s models)" % len(models), t0) + + # don't close the pipeline, as the user may want to read intermediate results from the store + + def __dir__(self) -> Iterable[str]: + return self._obj._RUNNABLE_STEPS.keys() | {"all"} + + def __getattr__(self, item): + if item in self._obj._RUNNABLE_STEPS: + + def f(**kwargs): + write_notebook_heading(item, self.heading_level) + return self.by_name(item, **kwargs) + + f.__doc__ = self._obj._RUNNABLE_STEPS[item].__doc__ + return f + raise AttributeError(item) + + timing_notes: set[str] = FromState(default_init=True) + + heading_level: int | None = FromState( + default_value=None, + doc=""" + Individual component heading level to use when running in a notebook. + + When individual components are called in a Jupyter notebook-like environment + using the `state.run.component_name` syntax, an HTML heading for each component + can be displayed in the notebook. These headings can be detected by Jupyter + extensions to enable rapid navigation with an automatically generated table + of contents. + """, + ) + + def log_runtime(self, model_name, start_time=None, timing=None, force=False): + assert (start_time or timing) and not (start_time and timing) + + timing = timing if timing else time.time() - start_time + seconds = round(timing, 1) + minutes = round(timing / 60, 1) + + process_name = multiprocessing.current_process().name + + if self._obj.settings.multiprocess and not force: + # when benchmarking, log timing for each processes in its own log + if self._obj.settings.benchmarking: + header = "component_name,duration" + with self._obj.filesystem.open_log_file( + f"timing_log.{process_name}.csv", "a", header + ) as log_file: + print(f"{model_name},{timing}", file=log_file) + # only continue to log runtime in global timing log for locutor + if not self._obj.get_injectable("locutor", False): + return + + header = "process_name,model_name,seconds,minutes,notes" + note = " ".join(self.timing_notes) + with self._obj.filesystem.open_log_file( + "timing_log.csv", "a", header + ) as log_file: + print( + f"{process_name},{model_name},{seconds},{minutes},{note}", file=log_file + ) + + self.timing_notes.clear() + + def _pre_run_step(self, model_name: str) -> bool | None: + """ + + Parameters + ---------- + model_name + + Returns + ------- + bool + True if the run of this step should be skipped. 
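A hedged sketch of driving the Runner accessor above, assuming the named components are registered workflow steps on a prepared state:

    # run a short list of registered steps; each one is checkpointed under its own name
    state.run(["initialize_landuse", "initialize_households"], resume_after=None)
    # or run a single component by attribute, which dispatches through by_name()
    state.run.school_location()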
+ """ + checkpointed_models = [ + checkpoint[CHECKPOINT_NAME] + for checkpoint in self._obj.checkpoint.checkpoints + ] + if model_name in checkpointed_models: + if self._obj.settings.duplicate_step_execution == "error": + checkpointed_model_bullets = "\n - ".join(checkpointed_models) + raise RuntimeError( + f"Checkpointed Models:\n - {checkpointed_model_bullets}\n" + f"Cannot run model '{model_name}' more than once" + ) + + self._obj.rng().begin_step(model_name) + + # check for args + if "." in model_name: + step_name, arg_string = model_name.split(".", 1) + args = dict( + (k, v) + for k, v in ( + split_arg(item, "=", default=True) for item in arg_string.split(";") + ) + ) + else: + step_name = model_name + args = {} + + # check for no_checkpoint prefix + if step_name[0] == NO_CHECKPOINT_PREFIX: + step_name = step_name[1:] + checkpoint = False + else: + checkpoint = self._obj.should_save_checkpoint(model_name) + + self._obj.add_injectable("step_args", args) + + self._obj.trace_memory_info(f"pipeline.run_model {model_name} start") + + logger.info(f"#run_model running step {step_name}") + + # these values are cached in the runner object itself, not in the context. + self.step_name = step_name + self.checkpoint = checkpoint + + def by_name(self, model_name, **kwargs): + """ + Run the specified model and add checkpoint for model_name + + Since we use model_name as checkpoint name, the same model may not be run more than once. + + Parameters + ---------- + model_name : str + model_name is assumed to be the name of a registered orca step + """ + self.t0 = time.time() + try: + should_skip = self._pre_run_step(model_name) + if should_skip: + return + + instrument = self._obj.settings.instrument + if instrument is not None: + try: + from pyinstrument import Profiler + except ImportError: + instrument = False + if isinstance(instrument, list | set | tuple): + if self.step_name not in instrument: + instrument = False + else: + instrument = True + + if instrument: + from pyinstrument import Profiler + + with Profiler() as profiler: + self._obj._context = run_named_step( + self.step_name, self._obj._context, **kwargs + ) + out_file = self._obj.filesystem.get_profiling_file_path( + f"{self.step_name}.html" + ) + with open(out_file, "w") as f: + f.write(profiler.output_html()) + else: + self._obj._context = run_named_step( + self.step_name, self._obj._context, **kwargs + ) + + except Exception: + self.t0 = self._log_elapsed_time(f"run.{model_name} UNTIL ERROR", self.t0) + self._obj.add_injectable("step_args", None) + self._obj.rng().end_step(model_name) + raise + + else: + # no error, finish as normal + self.t0 = self._log_elapsed_time(f"run.{model_name}", self.t0) + self._obj.trace_memory_info(f"pipeline.run_model {model_name} finished") + + self._obj.add_injectable("step_args", None) + + self._obj.rng().end_step(model_name) + if self.checkpoint: + self._obj.checkpoint.add(model_name) + else: + logger.info( + f"##### skipping {self.step_name} checkpoint for {model_name}" + ) + + def all(self, resume_after=LAST_CHECKPOINT, memory_sidecar_process=None): + try: + t0 = tracing.print_elapsed_time() + + if self._obj.settings.multiprocess: + logger.info("run multiprocess simulation") + + from activitysim.cli.run import INJECTABLES + from activitysim.core import mp_tasks + + injectables = {k: self._obj.get_injectable(k) for k in INJECTABLES} + injectables["settings"] = self._obj.settings + # injectables["settings_package"] = state.settings.dict() + mp_tasks.run_multiprocess(self._obj, injectables) + + else: + 
logger.info("run single process simulation") + self( + models=self._obj.settings.models, + resume_after=resume_after, + memory_sidecar_process=memory_sidecar_process, + ) + + except Exception: + # log time until error and the error traceback + tracing.print_elapsed_time("all models until this error", t0) + logger.exception("activitysim run encountered an unrecoverable error") + raise + + def _log_elapsed_time(self, msg, t0=None, level=25): + t1 = time.time() + assert t0 is not None + t = t1 - (t0 or t1) + msg = f" time to execute {msg} : {_format_elapsed_time(t)}" + logger.log(level, msg) + return t1 diff --git a/activitysim/core/workflow/state.py b/activitysim/core/workflow/state.py new file mode 100644 index 0000000000..550b339350 --- /dev/null +++ b/activitysim/core/workflow/state.py @@ -0,0 +1,1153 @@ +from __future__ import annotations + +import importlib +import io +import logging +import os +import sys +import textwrap +import warnings +from collections.abc import Iterable +from pathlib import Path +from typing import Any, Optional + +import pandas as pd +import pyarrow as pa +import xarray as xr +from pypyr.context import Context +from sharrow.dataset import construct as _dataset_construct + +import activitysim.core.random +from activitysim.core.configuration import FileSystem, NetworkSettings, Settings +from activitysim.core.exceptions import StateAccessError +from activitysim.core.workflow.checkpoint import LAST_CHECKPOINT, Checkpoints +from activitysim.core.workflow.chunking import Chunking +from activitysim.core.workflow.dataset import Datasets +from activitysim.core.workflow.extending import Extend +from activitysim.core.workflow.logging import Logging +from activitysim.core.workflow.report import Reporting +from activitysim.core.workflow.runner import Runner +from activitysim.core.workflow.steps import step as workflow_step +from activitysim.core.workflow.tracing import Tracing + +# ActivitySim +# See full license in LICENSE.txt. + + +logger = logging.getLogger(__name__) + +# name of the checkpoint dict keys +# (which are also columns in the checkpoints dataframe stored in hte pipeline store) + +# single character prefix for run_list model name to indicate that no checkpoint should be saved +NO_CHECKPOINT_PREFIX = "_" + +NO_DEFAULT = "throw error if missing" + + +class StateAttr: + """ + Convenience class for defining a context value as an attribute on a State. + + The name of the attribute defined in the `State` object is the key used + to find the attribute in the context. The primary use case is to make + a Pydantic BaseModel available as an attribute. + + Parameters + ---------- + member_type : type + default_init : bool, default False + When this attribute is accessed but the underlying key is not + found in the state's context, the default constructor can be called + to initialize the object. If this is False, accessing a missing + key raises a StateAccessError. 
+ + See Also + -------- + activitysim.core.workflow.accessor.StateAccessor + """ + + def __init__(self, member_type, default_init=False, doc=None): + self.member_type = member_type + self._default_init = default_init + if doc: + self.__doc__ = textwrap.dedent(doc).strip() + else: + self.__doc__ = member_type.__doc__ + + def __set_name__(self, owner, name): + """Captures the attribute name when assigned in the State class.""" + self.name = name + + def __get__(self, instance, objtype=None): + """Access the value from the State's context.""" + if instance is None: + return self + try: + return instance._context[self.name] + except (KeyError, AttributeError): + if self._default_init: + instance._context[self.name] = self.member_type() + return instance._context[self.name] + raise StateAccessError( + f"{self.name} not initialized for this state" + ) from None + + def __set__(self, instance, value): + """Write a value into the State's context.""" + if not isinstance(value, self.member_type): + raise TypeError(f"{self.name} must be {self.member_type} not {type(value)}") + instance._context[self.name] = value + + def __delete__(self, instance): + """Remove a value from the State's context.""" + self.__set__(instance, None) + + +class State: + """ + The encapsulated state of an ActivitySim model. + """ + + def __init__(self, context=None): + """ + Initialize the encapsulated state of an ActivitySim model. + + Parameters + ---------- + context : pypyr.Context, optional + An initial context can be provided when the State is created. + """ + + self.open_files: dict[str, io.TextIOBase] = {} + """Files to close when state is destroyed or re-initialized.""" + + if context is None: + self._context = Context() + self.init_state() + elif isinstance(context, Context): + self._context = context + else: + raise TypeError(f"cannot init {type(self)} with {type(context)}") + + def __del__(self): + self.close_open_files() + + def __contains__(self, key): + """ + Check if a key is already stored in this state's context. + + This does *not* check if the key is automatically loadable, it only + checks if a cached value has already been stored. + + Parameters + ---------- + key : str + + Returns + ------- + bool + """ + return self._context.__contains__(key) + + def copy(self): + """ + Create a copy of this State. + + The copy will share the memory space for most arrays and tables with + the original state. + """ + return self.__class__(context=Context(self._context.copy())) + + def init_state(self) -> None: + """ + Initialize this state. + + - All checkpoints are wiped out. + - All open file objects connected to this state are closed. + - The status of all random number generators is cleared. + - The set of traceable table id's is emptied. + """ + self.checkpoint.initialize() + + self.close_open_files() + + self._initialize_prng() + + self.tracing.initialize() + self._context["_salient_tables"] = {} + + def _initialize_prng(self, base_seed=None): + from activitysim.core.random import Random + + self._context["prng"] = Random() + if base_seed is None: + try: + self.settings + except StateAccessError: + base_seed = 0 + else: + base_seed = self.settings.rng_base_seed + self._context["prng"].set_base_seed(base_seed) + + def import_extensions(self, ext: str | Iterable[str] = None, append=True) -> None: + """ + Import one or more extension modules for use with this model. + + This method isn't really necessary for single-process model + runs, as extension modules can be imported in the normal manner + for python. 
The real reason this methid is here is to support + multiprocessing. The names of extension modules imported with + this method will be saved and passed through to subtask workers, + which will also import the extensions and make them available as + model steps within the workers. + + Parameters + ---------- + ext : str | Iterable[str] + Names of extension modules to import. They should be module + or package names that can be imported from this state's working + directory. If they need to be imported from elsewhere, the + name should be the relative path to the extension module from + the working directory. + append : bool, default True + Extension names will be appended to the "imported_extensions" list + in this State's context (creating it if needed). Setting this + argument to false will remove references to any existing extensions, + before adding this new extension to the list. + """ + if ext is None: + return + if isinstance(ext, str): + ext = [ext] + if append: + extensions = self.get("imported_extensions", []) + else: + extensions = [] + if self.filesystem.working_dir: + working_dir = self.filesystem.working_dir + else: + working_dir = Path.cwd() + for e in ext: + basepath, extpath = os.path.split(working_dir.joinpath(e)) + if not basepath: + basepath = "." + sys.path.insert(0, os.path.abspath(basepath)) + try: + importlib.import_module(extpath) + except ImportError: + logger.exception("ImportError") + raise + except Exception as err: + logger.exception(f"Error {err}") + raise + finally: + del sys.path[0] + extensions.append(e) + self.set("imported_extensions", extensions) + + filesystem: FileSystem = StateAttr(FileSystem) + settings: Settings = StateAttr(Settings) + network_settings: NetworkSettings = StateAttr(NetworkSettings) + + checkpoint = Checkpoints() + logging = Logging() + tracing = Tracing() + extend = Extend() + report = Reporting() + dataset = Datasets() + chunk = Chunking() + + @property + def this_step(self): + step_list = self._context.get("_this_step", []) + if not step_list: + raise ValueError("not in a step") + return step_list[-1] + + @this_step.setter + def this_step(self, x): + assert isinstance(x, workflow_step) + step_list = self._context.get("_this_step", []) + step_list.append(x) + self._context["_this_step"] = step_list + + @this_step.deleter + def this_step(self): + step_list = self._context.get("_this_step", []) + step_list.pop() + self._context["_this_step"] = step_list + + @classmethod + def make_default( + cls, working_dir: Path = None, settings: dict[str, Any] = None, **kwargs + ) -> "State": + """ + Convenience constructor for mostly default States. + + Parameters + ---------- + working_dir : Path-like + If a directory, then this directory is the working directory. Or, + if the given path is actually a file, then the directory where the + file lives is the working directory (typically as a convenience for + using __file__ in testing). + settings : Mapping[str, Any] + Override settings values. + **kwargs + All other keyword arguments are forwarded to the + initialize_filesystem method. 
+ + Returns + ------- + State + """ + if working_dir: + working_dir = Path(working_dir) + if working_dir.is_file(): + working_dir = working_dir.parent + self = cls().initialize_filesystem(working_dir, **kwargs) + settings_file = self.filesystem.get_config_file_path( + self.filesystem.settings_file_name, mandatory=False + ) + if settings_file is not None and settings_file.exists(): + self.load_settings() + else: + self.default_settings() + if settings: + for k, v in settings.items(): + setattr(self.settings, k, v) + return self + + @classmethod + def make_temp( + cls, source: Path = None, checkpoint_name: str = LAST_CHECKPOINT + ) -> "State": + """ + Initialize state with a temporary directory. + + Parameters + ---------- + source : Path-like, optional + Location of pipeline store to use to initialize this object. + checkpoint_name : str, optional + name of checkpoint to load from source store, defaults to + the last checkpoint found + + Returns + ------- + State + """ + import tempfile + + temp_dir = tempfile.TemporaryDirectory() + temp_dir_path = Path(temp_dir.name) + temp_dir_path.joinpath("configs").mkdir() + temp_dir_path.joinpath("data").mkdir() + temp_dir_path.joinpath("configs/settings.yaml").write_text("# empty\n") + state = cls.make_default(temp_dir_path) + state._context["_TEMP_DIR_"] = temp_dir + if source is not None: + state.checkpoint.restore_from(source, checkpoint_name) + return state + + def initialize_filesystem( + self, + working_dir: Path | None = None, + *, + configs_dir: Path | tuple[Path, ...] = ("configs",), + data_dir: Path | tuple[Path, ...] = ("data",), + output_dir: str | Path = "output", + profile_dir: Path | None = None, + cache_dir: Path | None = None, + settings_file_name: str = "settings.yaml", + pipeline_file_name: str = "pipeline", + **silently_ignored_kwargs, + ) -> State: + """ + Initialize the state's filesystem. + + ActivitySim has a number of features to extract settings, model configs, + data, and other inputs automatically from various files, and to write + outputs to the file system in various locations. These directories + are defined very early in a model run, and other settings are loaded + based on them, so for convenience the filesystem settings are collected + together in a single pydantic validated object separate from all other + settings. + + All arguments to this function beyond `working_dir` are keyword-only. + Keyword arguments other than those listed are silently ignored. + + Parameters + ---------- + working_dir : path-like, optional + The top-level working directory for the model. When other + directories are given as relative paths, those relative paths are + evaluated relative to this directory. + configs_dir : path-like or tuple of path-like, default "configs" + Directories containing model configurations and settings. ActivitySim + model runs can be configured with settings file inheritance to avoid + duplicating settings across model configurations, e.g. to allow for + single-process and multiprocess configurations that share most of + their inputs and settings. + data_dir : path-like or tuple of path-like, default "data" + The directories where input data files can be found. + output_dir : path-like, default "output" + Most ActivitySim output will be written to this directory (or + subdirectories within it). + profile_dir : path-like, optional + If runtime instrumentation is turned on, pyinstrument profiling + output will be written in this directory. 
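A short sketch of the two convenience constructors defined here; the working directory name is hypothetical and would need to contain the usual configs/ and data/ folders.

from activitysim.core import workflow

# make_default: point at a model folder (hypothetical path) holding configs/ and data/,
# load settings.yaml if present, then apply keyword overrides with setattr.
state = workflow.State.make_default(
    working_dir="example_model",
    settings={"trace_hh_id": 982875},
)

# make_temp: a throwaway State in a TemporaryDirectory with an empty settings.yaml,
# handy for tests; it can optionally restore tables from an existing pipeline store.
tmp_state = workflow.State.make_temp()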
If not given, a unique + time-stamped subdirectory will be created in the `output` directory. + cache_dir : path-like, optional + Various intermediate files may be stored in the cache directory. This + should be a writable and readable location, and cached files may + persist and be re-used by various different model runs. It should + always be safe to simply delete everything in the cache directory, + as everything saved here should be recreated automatically from other + inputs if it is missing (although it may take some time to do so). + settings_file_name : str, default "settings.yaml" + Top level settings are defined in this file, which should be found + in one or more `configs_dir` locations. + pipeline_file_name : str, default "pipeline" + The base filename for checkpointed intermediate outputs. + + Returns + ------- + self : State + """ + if isinstance(configs_dir, str | Path): + configs_dir = (configs_dir,) + if isinstance(data_dir, str | Path): + data_dir = (data_dir,) + + fs = dict( + configs_dir=configs_dir, + data_dir=data_dir, + output_dir=output_dir, + settings_file_name=settings_file_name, + pipeline_file_name=pipeline_file_name, + ) + if working_dir is not None: + fs["working_dir"] = working_dir + if profile_dir is not None: + fs["profile_dir"] = profile_dir + if cache_dir is not None: + fs["cache_dir"] = cache_dir + try: + self.filesystem: FileSystem = FileSystem.parse_obj(fs) + except Exception as err: + print(err) + raise + return self + + def default_settings(self, force=False) -> State: + """ + Initialize with all default settings, rather than reading from a file. + + Parameters + ---------- + force : bool, default False + If settings are already loaded, this method does nothing unless + this argument is true, in which case all existing settings are + discarded in favor of the defaults. + """ + try: + _ = self.settings + if force: + raise StateAccessError + except StateAccessError: + self.settings = Settings() + self.init_state() + return self + + def load_settings(self) -> State: + """ + Read and parse settings file(s) from config dirs. + + Returns + ------- + self : State + """ + # read settings file + raw_settings = self.filesystem.read_settings_file( + self.filesystem.settings_file_name, + mandatory=True, + include_stack=False, + ) + + # the settings can redefine the cache directories. 
+ cache_dir = raw_settings.pop("cache_dir", None) + if cache_dir: + if self.filesystem.cache_dir != cache_dir: + logger.warning(f"settings file changes cache_dir to {cache_dir}") + self.filesystem.cache_dir = cache_dir + self.settings: Settings = Settings.parse_obj(raw_settings) + + extra_settings = set(self.settings.__dict__) - set(Settings.__fields__) + + if extra_settings: + warnings.warn( + "Writing arbitrary model values as top-level key in settings.yaml " + "is deprecated, make them sub-keys of `other_settings` instead.", + DeprecationWarning, + ) + logger.warning("Found the following unexpected settings:") + if self.settings.other_settings is None: + self.settings.other_settings = {} + for k in extra_settings: + logger.warning(f" - {k}") + self.settings.other_settings[k] = getattr(self.settings, k) + delattr(self.settings, k) + + self.init_state() + return self + + _RUNNABLE_STEPS = {} + _LOADABLE_TABLES = {} + _LOADABLE_OBJECTS = {} + _PREDICATES = {} + _TEMP_NAMES = set() # never checkpointed + + @property + def known_table_names(self): + return self._LOADABLE_TABLES.keys() | self.existing_table_names + + @property + def existing_table_names(self): + return self.existing_table_status.keys() + + @property + def existing_table_status(self) -> dict: + return self._context["_salient_tables"] + + def uncheckpointed_table_names(self): + uncheckpointed = [] + for tablename, table_status in self.existing_table_status.items(): + if table_status and tablename not in self._TEMP_NAMES: + uncheckpointed.append(tablename) + return uncheckpointed + + def _load_or_create_dataset( + self, table_name, overwrite=False, swallow_errors=False + ): + """ + Load a table from disk or otherwise programmatically create it. + + Parameters + ---------- + table_name : str + overwrite : bool + swallow_errors : bool + + Returns + ------- + xarray.Dataset + """ + if table_name in self.existing_table_names and not overwrite: + if swallow_errors: + return self.get_dataframe(table_name) + raise ValueError(f"table {table_name} already loaded") + if table_name not in self._LOADABLE_TABLES: + if swallow_errors: + return + raise ValueError(f"table {table_name} has no loading function") + logger.debug(f"loading table {table_name}") + try: + t = self._LOADABLE_TABLES[table_name](self._context) + except StateAccessError: + if not swallow_errors: + raise + else: + t = None + if t is not None: + self.add_table(table_name, t) + return t + + def get_dataset( + self, + table_name: str, + column_names: list[str] | None = None, + as_copy: bool = False, + ) -> xr.Dataset: + """ + Get a workflow table or dataset as a xarray.Dataset. + + Parameters + ---------- + table_name : str + Name of table or dataset to get. + column_names : list[str], optional + Include only these columns or variables in the dataset. + as_copy : bool, default False + Return a copy of the dataset instead of the original. + + Returns + ------- + xarray.Dataset + """ + t = self._context.get(table_name, None) + if t is None: + t = self._load_or_create_dataset(table_name, swallow_errors=False) + if t is None: + raise KeyError(table_name) + t = _dataset_construct(t) + if isinstance(t, xr.Dataset): + if column_names is not None: + t = t[column_names] + if as_copy: + return t.copy() + else: + return t + raise TypeError(f"cannot convert {table_name} to Dataset") + + def get_dataframe( + self, + tablename: str, + columns: Optional[list[str]] = None, + as_copy: bool = True, + ) -> pd.DataFrame: + """ + Get a workflow table as a pandas.DataFrame. 
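The table getters defined in this section hand back the same registered content in different containers; a minimal sketch with dummy data (the table name and values are made up):

import pandas as pd
from activitysim.core import workflow

state = workflow.State.make_temp()
state.add_table(
    "persons",
    pd.DataFrame({"Age": [14, 25]}, index=pd.Index([441, 445], name="person_id")),
)

df = state.get_dataframe("persons", columns=["Age"])   # pandas copy by default
ds = state.get_dataset("persons")                      # xarray.Dataset
age = state.get_dataarray("persons", "Age")            # single xarray.DataArray
pat = state.get_pyarrow("persons", columns="Age")      # pyarrow.Table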
+ + Parameters + ---------- + tablename : str + Name of table to get. + columns : list[str], optional + Include only these columns in the dataframe. + as_copy : bool, default True + Return a copy of the dataframe instead of the original. + + Returns + ------- + DataFrame + """ + t = self._context.get(tablename, None) + if t is None: + t = self._load_or_create_dataset(tablename, swallow_errors=False) + if t is None: + raise KeyError(tablename) + if isinstance(t, pd.DataFrame): + if columns is not None: + t = t[columns] + if as_copy: + return t.copy() + else: + return t + elif isinstance(t, xr.Dataset): + # this route through pyarrow is generally faster than xarray.to_pandas + return t.single_dim.to_pyarrow().to_pandas() + raise TypeError(f"cannot convert {tablename} to DataFrame") + + def get_dataarray( + self, + tablename: str, + item: str, + as_copy: bool = True, + ) -> xr.DataArray: + """ + Get a workflow table item as a xarray.DataArray. + + Parameters + ---------- + tablename : str + Name of table to get. + item : str + Name of item within table. + as_copy : bool, default True + Return a copy of the data instead of the original. + + Returns + ------- + DataArray + """ + return self.get_dataset(tablename, column_names=[item])[item] + + def get_dataframe_index_name(self, tablename: str) -> str: + """ + Get the index name for a workflow table. + + Parameters + ---------- + tablename : str + Name of table to get. + + Returns + ------- + str + """ + t = self._context.get(tablename, None) + if t is None: + t = self._load_or_create_dataset(tablename, swallow_errors=False) + if t is None: + raise KeyError(tablename) + if isinstance(t, pd.DataFrame): + return t.index.name + raise TypeError(f"cannot get index name for {tablename}") + + def get_pyarrow( + self, tablename: str, columns: Optional[list[str] | str] = None + ) -> pa.Table: + """ + Get a workflow table as a pyarrow.Table. + + Parameters + ---------- + tablename : str + Name of table to get. + columns : list[str] or str, optional + Include only these columns in the dataframe. + + Returns + ------- + pyarrow.Table + """ + if isinstance(columns, str): + columns = [columns] + t = self._context.get(tablename, None) + if t is None: + t = self._load_or_create_dataset(tablename, swallow_errors=False) + if t is None: + raise KeyError(tablename) + if isinstance(t, pd.DataFrame): + t = pa.Table.from_pandas(t, preserve_index=True, columns=columns) + if isinstance(t, pa.Table): + if columns is not None: + t = t.select(columns) + return t + raise TypeError(f"cannot convert {tablename} to pyarrow.Table") + + def access(self, key: str, initializer: Any = NO_DEFAULT) -> Any: + """ + Raw access to values stored in this state's context. + + This method short-circuits all of ActivitySim's machinery to provide + or build missing context values automatically -- only values already + stored can be accessed. + + Parameters + ---------- + key : str + The name of the variable to access. + initializer : Any, optional + If the variable is not already in the state's context, it will + be set with this value, or if it is not provided a KeyError is + raised instead. + + Returns + ------- + Any + """ + if key not in self._context: + if initializer != NO_DEFAULT: + self.set(key, initializer) + else: + raise KeyError(key) + return self._context[key] + + def get(self, key, default: Any = NO_DEFAULT) -> Any: + """ + Automated access to values stored in this state's context. 
+ + This method takes advantage of ActivitySim's machinery to provide + or build missing context values automatically. If a value is already + present in this state's context it is returned, otherwise the set of + defined methods for loadable objects and tables will be used to + create values if possible. + + Parameters + ---------- + key : str + The name of the variable to access. + default : Any, optional + If the variable is not already in the state's context, and cannot + be otherwise created automatically, it will be set with this value, + or else a KeyError is raised instead. + + Returns + ------- + Any + """ + + if not isinstance(key, str): + key_name = getattr(key, "__name__", None) + if key_name in self._LOADABLE_TABLES or key_name in self._LOADABLE_OBJECTS: + key = key_name + if key_name in self._RUNNABLE_STEPS: + raise ValueError( + f"cannot `get` {key_name}, it is a step, try State.run.{key_name}()" + ) + result = self._context.get(key, None) + if result is None: + try: + result = getattr(self.filesystem, key, None) + except StateAccessError: + result = None + if result is None: + if key in self._LOADABLE_TABLES: + result = self._LOADABLE_TABLES[key](self._context) + elif key in self._LOADABLE_OBJECTS: + result = self._LOADABLE_OBJECTS[key](self._context) + if result is None: + if default != NO_DEFAULT: + result = default + else: + self._context.assert_key_has_value( + key=key, caller=self.__class__.__name__ + ) + raise KeyError(key) + if not isinstance(result, xr.Dataset | xr.DataArray | pd.DataFrame | pd.Series): + result = self._context.get_formatted_value(result) + return result + + def set(self, key, value): + """ + Set a new value for a key in the context. + + Also removes from the context all other keys predicated on this key. + They can be regenerated later (from fresh inputs) if needed. + + Parameters + ---------- + key : str + """ + self._context[key] = value + for i in self._PREDICATES.get(key, []): + if i in self._context: + logger.debug(f"update of {key} clears cached {i}") + self.drop(i) + + def drop(self, key): + """ + Remove a key from the context. + + Also removes from the context all other keys predicated on this key. 
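A sketch of the get/set/access/drop contract; the key names are illustrative, not real ActivitySim keys.

from activitysim.core import workflow

state = workflow.State.make_temp()

state.set("scenario_name", "base")             # raw write into the context
assert state.get("scenario_name") == "base"    # found directly, no loaders consulted
assert state.get("not_there", default=42) == 42

state.access("run_counter", initializer=0)     # set-if-missing, bypasses loader machinery
state.drop("scenario_name")                    # also clears anything predicated on this key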
+ + Parameters + ---------- + key : str + """ + del self._context[key] + for i in self._PREDICATES.get(key, []): + if i in self._context: + logger.debug(f"dropping {key} clears cached {i}") + self.drop(i) + + def extract(self, func): + return func(self) + + get_injectable = get # legacy function name + """Alias for :meth:`State.get`.""" + + add_injectable = set # legacy function name + """Alias for :meth:`State.set`.""" + + def rng(self) -> activitysim.core.random.Random: + if "prng" not in self._context: + self._initialize_prng() + return self._context["prng"] + + def pipeline_table_key(self, table_name, checkpoint_name): + if checkpoint_name: + key = f"{table_name}/{checkpoint_name}" + else: + key = f"/{table_name}" + return key + + def close_on_exit(self, file, name): + assert name not in self.open_files + self.open_files[name] = file + + def close_open_files(self): + for name, file in self.open_files.items(): + print("Closing %s" % name) + file.close() + self.open_files.clear() + + def get_rn_generator(self) -> activitysim.core.random.Random: + """ + Return the singleton random number object + + Returns + ------- + activitysim.random.Random + """ + return self.rng() + + def get_global_constants(self): + """ + Read global constants from settings file + + Returns + ------- + constants : dict + dictionary of constants to add to locals for use by expressions in model spec + """ + try: + filesystem = self.filesystem + except StateAccessError: + return {} + else: + return filesystem.read_settings_file("constants.yaml", mandatory=False) + + def add_table( + self, name: str, content: pd.DataFrame | xr.Dataset, salient: bool | None = None + ): + """ + Add a data table to this context, and potentially mark it for checkpointing. + + The table added completely replaces any existing table of the same + name. In part because checkpointing currently manages tables only in their + entirety, there is no mechanism to incrementally update a table by adding + data (columns and/or rows) in-place, although nothing prevents the user + of this method from partially re-using data content from an existing + table via a zero-copy transformation. + + Parameters + ---------- + name : str + The name of the table being added to this state's context. + content : pandas.DataFrame or xarray.Dataset + The new data content to write. + salient : bool, optional + Explicitly mark this table as salient or not. Salient tables + are marked to be checkpointed the next time a checkpoint operation + happens. If not set explicitly tables are presumed to be salient + unless they elsewhere defined as temporary tables. + + See Also + -------- + State.set + """ + if salient is None: + salient = name not in self._TEMP_NAMES + if salient: + # mark this salient table as edited, so it can be checkpointed + # at some later time if desired. + self.existing_table_status[name] = True + self.set(name, content) + + def is_table(self, name: str): + """ + Check if a name corresponds to a table in this state's context. 
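How add_table feeds the checkpointing machinery, sketched with dummy data and the temporary-state constructor from above:

import pandas as pd
from activitysim.core import workflow
from activitysim.core.workflow.checkpoint import INITIAL_CHECKPOINT_NAME

state = workflow.State.make_temp()
state.checkpoint.add(INITIAL_CHECKPOINT_NAME)

households = pd.DataFrame(
    {"income": [45, 88]}, index=pd.Index([1, 2], name="household_id")
)
state.add_table("households", households)            # salient by default
assert state.is_table("households")
assert "households" in state.uncheckpointed_table_names()

state.checkpoint.add("init_households")              # salient tables get written to the store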
+ + Parameters + ---------- + name : str + + Returns + ------- + bool + """ + return name in self.existing_table_status + + def registered_tables(self): + """ + Return a list of the names of all currently registered dataframe tables + """ + return [name for name in self.existing_table_status if name in self._context] + + @property + def current_model_name(self) -> str: + """Name of the currently running model.""" + return self.rng().step_name + + def close_pipeline(self): + """ + Close any known open files + """ + self.close_open_files() + self.checkpoint.close_store() + self.init_state() + logger.debug("close_pipeline") + + def should_save_checkpoint(self, checkpoint_name=None) -> bool: + checkpoints = self.settings.checkpoints + + if checkpoints is True or checkpoints is False: + return checkpoints + + assert isinstance( + checkpoints, list + ), "setting 'checkpoints'' should be True or False or a list" + + return checkpoint_name in checkpoints + + def trace_memory_info(self, event, trace_ticks=0): + from activitysim.core.mem import trace_memory_info + + return trace_memory_info(event, state=self, trace_ticks=trace_ticks) + + run = Runner() + + def get_table(self, table_name, checkpoint_name=None): + """ + Return pandas dataframe corresponding to table_name + + if checkpoint_name is None, return the current (most recent) version of the table. + The table can be a checkpointed table or any registered orca table (e.g. function table) + + if checkpoint_name is specified, return table as it was at that checkpoint + (the most recently checkpointed version of the table at or before checkpoint_name) + + Parameters + ---------- + table_name : str + checkpoint_name : str or None + + Returns + ------- + df : pandas.DataFrame + """ + + if table_name not in self.checkpoint.last_checkpoint and self.is_table( + table_name + ): + if checkpoint_name is not None: + raise RuntimeError( + f"get_table: checkpoint_name ({checkpoint_name!r}) not " + f"supported for non-checkpointed table {table_name!r}" + ) + + return self._context.get(table_name) + + # if they want current version of table, no need to read from pipeline store + if checkpoint_name is None: + if table_name not in self.checkpoint.last_checkpoint: + raise RuntimeError("table '%s' never checkpointed." % table_name) + + if not self.checkpoint.last_checkpoint[table_name]: + raise RuntimeError("table '%s' was dropped." % table_name) + + return self._context.get(table_name) + + # find the requested checkpoint + checkpoint = next( + ( + x + for x in self.checkpoint.checkpoints + if x["checkpoint_name"] == checkpoint_name + ), + None, + ) + if checkpoint is None: + raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name) + + # find the checkpoint that table was written to store + last_checkpoint_name = checkpoint.get(table_name, None) + + if not last_checkpoint_name: + raise RuntimeError( + "table '%s' not in checkpoint '%s'." 
% (table_name, checkpoint_name) + ) + + # if this version of table is same as current + if ( + self.checkpoint.last_checkpoint.get(table_name, None) + == last_checkpoint_name + ): + return self._context.get(table_name) + + return self.checkpoint._read_df(table_name, last_checkpoint_name) + + def extend_table(self, table_name, df, axis=0): + """ + add new table or extend (add rows) to an existing table + + Parameters + ---------- + table_name : str + orca/inject table name + df : pandas DataFrame + """ + assert axis in [0, 1] + + if self.is_table(table_name): + table_df = self.get_dataframe(table_name) + + if axis == 0: + # don't expect indexes to overlap + assert len(table_df.index.intersection(df.index)) == 0 + missing_df_str_columns = [ + c + for c in table_df.columns + if c not in df.columns and table_df[c].dtype == "O" + ] + else: + # expect indexes be same + assert table_df.index.equals(df.index) + new_df_columns = [c for c in df.columns if c not in table_df.columns] + df = df[new_df_columns] + missing_df_str_columns = [] + + # preserve existing column order + df = pd.concat([table_df, df], sort=False, axis=axis) + + # backfill missing df columns that were str (object) type in table_df + if axis == 0: + for c in missing_df_str_columns: + df[c] = df[c].fillna("") + + self.add_table(table_name, df) + + return df + + def drop_table(self, table_name): + if self.is_table(table_name): + logger.debug("drop_table dropping orca table '%s'" % table_name) + self._context.pop(table_name, None) + self.existing_table_status.pop(table_name) + + if table_name in self.checkpoint.last_checkpoint: + logger.debug( + "drop_table removing table %s from last_checkpoint" % table_name + ) + self.checkpoint.last_checkpoint[table_name] = "" + + def get_output_file_path(self, file_name: str, prefix: str | bool = None) -> Path: + if prefix is None or prefix is True: + prefix = self.get_injectable("output_file_prefix", None) + if prefix: + file_name = f"{prefix}-{file_name}" + return self.filesystem.get_output_dir().joinpath(file_name) + + def get_log_file_path(self, file_name: str, prefix: bool = True) -> Path: + """ + Get the log file path for this process. + + This method is not purely a pass-through to this state's `filesystem`, + as it also potentially adds a prefix to the filename based on the state. + + Parameters + ---------- + file_name : str + The name of the desired log file. + prefix : bool, default True + Whether to add a prefix to the desired log file name. This is + simply a boolean flag for whether to add the prefix, the actual + value of the prefix id drawn from the "log_file_prefix" key within + this state. If that key is not set, no prefix is added regardless + of the value of this argument. 
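A sketch of extend_table with dummy data: axis=0 appends rows and requires non-overlapping indexes, axis=1 appends columns and requires identical indexes.

import pandas as pd
from activitysim.core import workflow

state = workflow.State.make_temp()
base = pd.DataFrame({"Age": [14, 25]}, index=pd.Index([441, 445], name="person_id"))
state.add_table("persons", base)

# add rows: new index values must not overlap the existing ones
more = pd.DataFrame({"Age": [33]}, index=pd.Index([552], name="person_id"))
state.extend_table("persons", more, axis=0)

# add columns: the index must match the existing table exactly
extra = pd.DataFrame(
    {"Income": [45, 88, 56]}, index=pd.Index([441, 445, 552], name="person_id")
)
state.extend_table("persons", extra, axis=1)

assert list(state.get_dataframe("persons").columns) == ["Age", "Income"]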
+ + Returns + ------- + Path + """ + prefix = prefix and self.get_injectable("log_file_prefix", None) + if prefix: + file_name = f"{prefix}-{file_name}" + return self.filesystem.get_log_file_path(file_name) + + def set_step_args(self, args=None): + assert isinstance(args, dict) or args is None + self.add_injectable("step_args", args) + + def get_step_arg(self, arg_name, default=NO_DEFAULT): + args = self.get_injectable("step_args") + + assert isinstance(args, dict) + if arg_name not in args and default == NO_DEFAULT: + raise "step arg '%s' not found and no default" % arg_name + + return args.get(arg_name, default) diff --git a/activitysim/core/workflow/steps.py b/activitysim/core/workflow/steps.py new file mode 100644 index 0000000000..39d038a95d --- /dev/null +++ b/activitysim/core/workflow/steps.py @@ -0,0 +1,502 @@ +from __future__ import annotations + +import abc +import importlib +import importlib.machinery +import importlib.util +import logging +import time +from collections import namedtuple +from collections.abc import Container +from inspect import get_annotations, getfullargspec +from typing import Callable, Collection, Mapping, NamedTuple + +import numpy as np # noqa: 401 +import pandas as pd # noqa: 401 +import xarray as xr # noqa: 401 +from pypyr.context import Context +from pypyr.errors import KeyNotInContextError + +from activitysim.core import workflow +from activitysim.core.exceptions import ( + DuplicateWorkflowNameError, + DuplicateWorkflowTableError, +) +from activitysim.core.workflow.util import ( + get_formatted_or_default, + get_formatted_or_raw, + get_override_or_formatted_or_default, + is_notebook, +) + +logger = logging.getLogger(__name__) + +_STEP_LIBRARY = {} + +ExtendedArgSpec = namedtuple( + "ExtendedArgSpec", + "args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, " + "annotations, ndefault, required_args", +) + + +def error_logging(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as err: + logging.error(f"===== ERROR IN {func.__name__} =====") + logging.exception(f"{err}") + logging.error(f"===== / =====") + raise + + return wrapper + + +def _new_module(mod_name): + spec = importlib.machinery.ModuleSpec(mod_name, None) + return importlib.util.module_from_spec(spec) + + +def _create_module(mod_name, content): + mod = _new_module(mod_name) + for k, v in content.items(): + setattr(mod, k, v) + return mod + + +def _create_step(step_name, step_func): + # the module version of each step is for pypyr, and it always mutates + # context in-place instead of making updates to copies + _create_module(f"{__package__}.{step_name}", {"run_step": step_func}) + _STEP_LIBRARY[step_name] = step_func + + +def run_named_step(name, context, **kwargs): + try: + step_func = _STEP_LIBRARY[name] + except KeyError: + logger.error(f"Unknown step {name}, the known steps are:") + for n in sorted(_STEP_LIBRARY.keys()): + logger.error(f" - {n}") + raise + step_func(context, **kwargs) + return context + + +class StepArgInit(abc.ABC): + """ + Base class for things that initialize default workflow.step args from state. 
+ """ + + @abc.abstractmethod + def __call__(self, state: workflow.State, **other_overrides): + raise NotImplementedError + + +class ModelSettingsFromYaml(StepArgInit): + def __init__(self, model_settings_file_name): + self.model_settings_file_name = model_settings_file_name + + def __call__(self, state: workflow.State, **other_overrides): + return state.filesystem.read_model_settings(self.model_settings_file_name) + + +class step: + """ + Decorator for ActivitySim model components and related functions. + + See the documentation on :ref:`workflow-steps` for more details. + + Parameters + ---------- + wrapped_func : Callable + The function being wrapped. + step_name : str, optional + The name of the step. This is usually just inferred from the name of + the function being wrapped, but it can be explicitly set to some other + value if needed. + cache : bool, default False + If true, this function is only run if the named value is not + already stored in the context. Also, the return value should + not be a mapping but instead just a single Python object that + will be stored in the context with a key given by the step_name. + kind : {"step", "table", "temp_table", "cached_object"} + The kind of workflow function being wrapped. + copy_tables : bool or Container[str], default True + If this evaluates to true, access to tables as a DataFrame is + always via a copy operation on any registered table instead of the + original. If given as a container, only table names in the container + are copied. + overloading : bool, default False + Allow this step definition to overload an existing wrapped workflow + function. This permits a user to overload ActivitySim functions with + bespoke alternatives. To ensure that the reverse never occurs (i.e. + the user creates a bespoke alternative implementation and then allows + it to be overwritten by ActivitySim's default by importing things in + the wrong order) steps defined and delivered within the ActivitySim + package itself should never set this flag. + + Returns + ------- + Callable + """ + + def __new__( + cls, + wrapped_func=None, + *, + step_name=None, + cache=False, + kind="step", + copy_tables=True, + overloading=False, + ): + if wrapped_func is not None and not isinstance(wrapped_func, Callable): + raise TypeError("workflow step must decorate a callable") + if step_name is None and wrapped_func is not None: + step_name = wrapped_func.__name__ + self = super().__new__(cls) + self._step_name = step_name + self._cache = cache + self._kind = kind + self._copy_tables = copy_tables + self._overloading = overloading + if wrapped_func is not None: + return self(wrapped_func) + else: + return self + + def __call__(self, wrapped_func): + """ + Initialize a workflow.step wrapper. + + Parameters + ---------- + wrapped_func : Callable + The function being decorated. It should return a dictionary + of context updates. 
+ """ + from activitysim.core.workflow import State + + _validate_workflow_function(wrapped_func) + if self._step_name is None: + self._step_name = wrapped_func.__name__ + logger.debug(f"found workflow_{self._kind}: {self._step_name}") + docstring = wrapped_func.__doc__ + + # overloading of existing steps is only allowed when the user + # sets overloading=True, which should never be done for steps + # defined and delivered within the ActivitySim package itself + def warn_overload(): + if self._overloading: + logger.warning( + f"workflow.step {wrapped_func.__module__}.{self._step_name} " + f"overloading existing {self._step_name}" + ) + else: + raise DuplicateWorkflowNameError(self._step_name) + + # check for duplicate workflow function names + if self._step_name in State._LOADABLE_OBJECTS: + warn_overload() + if self._step_name in State._LOADABLE_TABLES: + warn_overload() + if self._step_name in State._RUNNABLE_STEPS: + warn_overload() + + ( + _args, + _varargs, + _varkw, + _defaults, + _kwonlyargs, + _kwonlydefaults, + _annotations, + ) = getfullargspec(wrapped_func) + + # getfullargspec does not eval stringized annotations, so re-get those + _annotations = get_annotations(wrapped_func, eval_str=True) + + if _defaults is None: + _ndefault = 0 + _required_args = _args + else: + _ndefault = len(_defaults) + _required_args = _args[:-_ndefault] + + self._fullargspec = ExtendedArgSpec( + _args, + _varargs, + _varkw, + _defaults, + _kwonlyargs, + _kwonlydefaults, + _annotations, + _ndefault, + _required_args, + ) + + if not _required_args or _required_args[0] != "state": + raise TypeError( + f"the first argument of a workflow_{self._kind} must be the state" + ) + + def run_step(context: Context = None, **override_kwargs) -> None: + if ( + self._cache + and (context is not None) + and (self._step_name in context) + and len(override_kwargs) == 0 + ): + return context.get_formatted(self._step_name) + assert isinstance(context, Context) + state = State(context) + + # initialize step-specific arguments if they are not provided in override_kwargs + if _ndefault: + for arg, default in zip(_args[-_ndefault:], _defaults): + if isinstance(default, StepArgInit): + override_kwargs[arg] = default(state, **override_kwargs) + else: + override_kwargs[arg] = default + if _kwonlydefaults: + for karg in _kwonlyargs: + karg_default = _kwonlydefaults.get(karg, None) + if isinstance(karg_default, StepArgInit): + override_kwargs[karg] = karg_default(state, **override_kwargs) + else: + override_kwargs[karg] = karg_default + + caption = get_override_or_formatted_or_default( + override_kwargs, context, "caption", None + ) + progress_tag = get_override_or_formatted_or_default( + override_kwargs, context, "progress_tag", caption + ) + # if progress_tag is not None: + # reset_progress_step(description=progress_tag) + + return_type = _annotations.get("return", "") + + caption_type = get_override_or_formatted_or_default( + override_kwargs, context, "caption_type", "fig" + ) + caption_maker = get_override_or_formatted_or_default( + override_kwargs, context, caption_type, None + ) + # parse and run function itself + args = [] + for arg in _required_args[1:]: + # first arg is always state + if arg in override_kwargs: + arg_value = override_kwargs[arg] + elif arg in context: + arg_value = context.get(arg) + else: + if arg in state._LOADABLE_TABLES: + arg_value = state._LOADABLE_TABLES[arg](context) + elif arg in state._LOADABLE_OBJECTS: + arg_value = state._LOADABLE_OBJECTS[arg](context) + else: + context.assert_key_has_value( + 
key=arg, caller=wrapped_func.__module__ + ) + raise KeyError(arg) + if ( + self._copy_tables + and arg in state.existing_table_status + and arg not in override_kwargs + ): + is_df = _annotations.get(arg) is pd.DataFrame + if is_df: + if isinstance(self._copy_tables, Container): + if arg in self._copy_tables: + arg_value = arg_value.copy() + else: + # copy_tables is truthy + arg_value = arg_value.copy() + if _annotations.get(arg) is pd.DataFrame and isinstance( + arg_value, xr.Dataset + ): + # convert to dataframe if asking for that + arg_value = arg_value.single_dim.to_pandas() + if _annotations.get(arg) is xr.Dataset and isinstance( + arg_value, pd.DataFrame + ): + # convert to dataset if asking for that + from sharrow.dataset import construct + + arg_value = construct(arg_value) + try: + args.append(arg_value) + except Exception as err: + raise ValueError(f"extracting {arg} from context") from err + if _ndefault: + # step arguments with defaults are never taken from the context + # they use the defaults always unless overridden manually + for arg, default in zip(_args[-_ndefault:], _defaults): + if arg in override_kwargs: + args.append(override_kwargs[arg]) + else: + args.append(default) + kwargs = {} + for karg in _kwonlyargs: + if karg in _kwonlydefaults: + # step arguments with defaults are never taken from the context + # they use the defaults always unless overridden manually + kwargs[karg] = override_kwargs.get(karg, _kwonlydefaults[karg]) + else: + if karg in override_kwargs: + kwargs[karg] = override_kwargs[karg] + else: + context.assert_key_has_value( + key=karg, caller=wrapped_func.__module__ + ) + try: + kwargs[karg] = get_formatted_or_raw(context, karg) + except Exception as err: + raise ValueError(f"extracting {karg} from context") from err + if _varkw: + kwargs.update(context) + for arg in _required_args: + if arg in kwargs: + kwargs.pop(arg) + try: + state.this_step = self + outcome = error_logging(wrapped_func)(state, *args, **kwargs) + finally: + del state.this_step + if self._kind == "table": + context[self._step_name] = outcome + if "_salient_tables" not in context: + context["_salient_tables"] = {} + context["_salient_tables"][self._step_name] = time.time() + return outcome + elif self._kind == "temp_table": + context[self._step_name] = outcome + return outcome + elif self._kind == "cached_object": + context[self._step_name] = outcome + return outcome + elif self._kind == "step": + if outcome is not None: + if not isinstance(outcome, Mapping): + raise ValueError( + f"workflow step {wrapped_func.__name__} should return a mapping or None" + ) + context.update(outcome) + + run_step.__doc__ = docstring + _create_step(self._step_name, run_step) + + def update_with_cache(state: State, *args, **kwargs): + ignore_cache = kwargs.pop("_ignore_cache_", False) + if self._step_name not in state._context or ignore_cache: + state._context[self._step_name] = wrapped_func(state, *args, **kwargs) + return state._context[self._step_name] + + update_with_cache.__doc__ = docstring + update_with_cache.__name__ = self._step_name + + if self._kind == "cached_object": + State._LOADABLE_OBJECTS[self._step_name] = run_step + return update_with_cache + elif self._kind == "table": + State._LOADABLE_TABLES[self._step_name] = run_step + return update_with_cache + elif self._kind == "temp_table": + State._TEMP_NAMES.add(self._step_name) + State._LOADABLE_TABLES[self._step_name] = run_step + for i in _args[1:]: + if i not in State._PREDICATES: + State._PREDICATES[i] = {self._step_name} + else: + 
State._PREDICATES[i].add(self._step_name) + return update_with_cache + elif self._kind == "step": + State._RUNNABLE_STEPS[self._step_name] = run_step + return wrapped_func + else: + raise ValueError(self._kind) + + +class cached_object(step): + """ + Decorator for functions that deliver objects that should be cached. + + The function is called to initialize or otherwise generate the value of + an object to be cached, but only if the matching name is not already stored + in the state's context. + + :py:class:`@workflow.cached_object ` is equivalent to + :py:class:`@workflow.step(cache=True, kind="cached_object") `. + """ + + def __new__(cls, wrapped_func=None, *, step_name=None): + return super().__new__( + cls, wrapped_func, step_name=step_name, cache=True, kind="cached_object" + ) + + +class table(step): + """ + Decorator for functions that deliver a data table. + + The function is called to initialize or otherwise generate the content of + a named data table, but only if the matching name is not already stored + in the state's context. + + :py:class:`@workflow.table ` is equivalent to + :py:class:`@workflow.step(cache=True, kind="table") `. + """ + + def __new__(cls, wrapped_func=None, *, step_name=None): + return super().__new__( + cls, wrapped_func, step_name=step_name, cache=True, kind="table" + ) + + +class temp_table(step): + """ + Decorator for functions that deliver a temporary data table. + + The function is called to initialize or otherwise generate the content of + a named temp table, but only if the matching name is not already stored + in the state's context. + + :py:class:`@workflow.temp_table ` is equivalent to + :py:class:`@workflow.step(cache=True, kind="temp_table") `. + """ + + def __new__(cls, wrapped_func=None, *, step_name=None): + return super().__new__( + cls, wrapped_func, step_name=step_name, cache=True, kind="temp_table" + ) + + +def _validate_workflow_function(f): + annot = get_annotations(f, eval_str=True) + argspec = getfullargspec(f) + if argspec.args[0] != "state": + raise SyntaxError("workflow.func must have `state` as the first argument") + if annot.get("state") is not workflow.State: + raise SyntaxError( + "workflow.func must have `State` as the first argument annotation" + ) + + +def func(function): + """ + Wrapper for a simple workflow function. + """ + _validate_workflow_function(function) + + def wrapper(state, *args, **kwargs): + if not isinstance(state, workflow.State): + raise TypeError( + "workflow functions must have a State as the first argument" + ) + return function(state, *args, **kwargs) + + return wrapper diff --git a/activitysim/core/workflow/test/conftest.py b/activitysim/core/workflow/test/conftest.py new file mode 100644 index 0000000000..7c123dacf4 --- /dev/null +++ b/activitysim/core/workflow/test/conftest.py @@ -0,0 +1,136 @@ +# The conftest.py file serves as a means of providing fixtures for an entire directory. +# Fixtures defined in a conftest.py can be used by any test in that package without +# needing to import them (pytest will automatically discover them). 
+# https://docs.pytest.org/en/7.2.x/reference/fixtures.html#conftest-py-sharing-fixtures-across-multiple-files + +from __future__ import annotations + +from pathlib import Path + +import pandas as pd +import pytest + +from activitysim.core.workflow import State +from activitysim.core.workflow.checkpoint import INITIAL_CHECKPOINT_NAME, ParquetStore + + +def _person_df() -> pd.DataFrame: + df = pd.DataFrame( + { + "Income": [45, 88, 56, 15, 71], + "Name": ["Andre", "Bruce", "Carol", "David", "Eugene"], + "Age": [14, 25, 55, 8, 21], + "WorkMode": ["Car", "Bus", "Car", "Car", "Walk"], + }, + index=pd.Index([441, 445, 552, 556, 934], name="person_id"), + ) + df["WorkMode"] = df["WorkMode"].astype("category") + return df + + +@pytest.fixture +def person_df() -> pd.DataFrame: + """ + Sample persons dataframe with dummy data. + """ + return _person_df() + + +def _los_df() -> pd.DataFrame: + return pd.DataFrame( + { + "Speed": {"Car": 60, "Bus": 20, "Walk": 3}, + "Cost": {"Car": 3.25, "Bus": 1.75, "Walk": 0}, + } + ) + + +@pytest.fixture +def los_df() -> pd.DataFrame: + """ + Sample LOS dataframe with dummy data. + """ + return _los_df() + + +@pytest.fixture +def los_messy_df() -> pd.DataFrame: + """ + Sample LOS dataframe with messy data. + """ + return _los_messy_df() + + +def _los_messy_df() -> pd.DataFrame: + los_df = _los_df() + los_df["int_first"] = [123, "text", 456.7] + los_df["text_first"] = ["klondike", 5, 12.34] + los_df["float_first"] = [456.7, "text", 555] + return los_df + + +@pytest.fixture(scope="session") +def sample_parquet_store(tmp_path_factory: pytest.TempPathFactory) -> Path: + """ + Generate sample parquet store for testing. + + Parameters + ---------- + tmp_path_factory : pytest.TempPathFactory + PyTest's own temporary path fixture, the sample parquet store + will be created in a temporary directory here. + + Returns + ------- + Path + Location of zip archive + """ + t = tmp_path_factory.mktemp("core-workflow") + + s = t.joinpath("sample-1") + s.joinpath("configs").mkdir(parents=True, exist_ok=True) + s.joinpath("data").mkdir(exist_ok=True) + + state = State.make_default(s) + state.checkpoint.add(INITIAL_CHECKPOINT_NAME) + + # a table to store + person_df = _person_df() + state.add_table("persons", person_df) + state.checkpoint.add("init_persons") + + # a second table + state.add_table("level_of_service", _los_df()) + state.checkpoint.add("init_los") + + # modify table + person_df["status"] = [11, 22, 33, 44, 55] + state.add_table("persons", person_df) + state.checkpoint.add("mod_persons") + + # modify table messy + state.add_table("level_of_service", _los_messy_df()) + state.checkpoint.add("mod_los") + + return state.checkpoint.store.filename + + +@pytest.fixture(scope="session") +def sample_parquet_zip(sample_parquet_store: Path) -> Path: + """ + Copy the sample parquet store into a read-only Zip archive. + + Parameters + ---------- + sample_parquet_store : Path + Location of original ParquetStore files. 
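A condensed sketch of the round-trip these fixtures exercise: write checkpointed tables through a State, then reopen the on-disk ParquetStore read-only. Paths here are temporary and the table content is dummy data.

import tempfile
from pathlib import Path

import pandas as pd
from activitysim.core.workflow import State
from activitysim.core.workflow.checkpoint import INITIAL_CHECKPOINT_NAME, ParquetStore

workdir = Path(tempfile.mkdtemp())
workdir.joinpath("configs").mkdir()
workdir.joinpath("data").mkdir()

state = State.make_default(workdir)
state.checkpoint.add(INITIAL_CHECKPOINT_NAME)
state.add_table(
    "persons", pd.DataFrame({"Age": [14]}, index=pd.Index([441], name="person_id"))
)
state.checkpoint.add("init_persons")
store_path = state.checkpoint.store.filename
state.close_pipeline()

# reopen the on-disk store read-only and pull the checkpointed frame back out
store = ParquetStore(store_path, mode="r")
df = store.get_dataframe("persons", "init_persons")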
+ + Returns + ------- + Path + Location of zip archive + """ + ps = ParquetStore(sample_parquet_store, mode="r") + return ps.make_zip_archive( + output_filename=ps.filename.parent.joinpath("samplepipeline") + ) diff --git a/activitysim/core/workflow/test/test_parquet_store.py b/activitysim/core/workflow/test/test_parquet_store.py new file mode 100644 index 0000000000..75889a1284 --- /dev/null +++ b/activitysim/core/workflow/test/test_parquet_store.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd +import pytest + +from activitysim.core import exceptions +from activitysim.core.workflow.checkpoint import GenericCheckpointStore, ParquetStore + + +def _test_parquet_store(store: GenericCheckpointStore, person_df, los_df, los_messy_df): + assert isinstance(store, GenericCheckpointStore) + assert store.list_checkpoint_names() == [ + "init", + "init_persons", + "init_los", + "mod_persons", + "mod_los", + ] + with pytest.raises(exceptions.TableNameNotFound): + store.get_dataframe("missing-tablename", "init_persons") + with pytest.raises(exceptions.CheckpointNameNotFoundError): + store.get_dataframe("persons", "bad-checkpoint-name") + pd.testing.assert_frame_equal( + store.get_dataframe("persons", "init_persons"), person_df + ) + pd.testing.assert_frame_equal(store.get_dataframe("persons", "init_los"), person_df) + with pytest.raises(AssertionError, match="DataFrame shape mismatch"): + pd.testing.assert_frame_equal( + store.get_dataframe("persons", "mod_persons"), person_df + ) + pd.testing.assert_frame_equal( + store.get_dataframe("persons", "mod_persons"), + person_df.assign(status=[11, 22, 33, 44, 55]), + ) + # call for last checkpoint explicitly + pd.testing.assert_frame_equal( + store.get_dataframe("persons", "_"), + person_df.assign(status=[11, 22, 33, 44, 55]), + ) + # call for last checkpoint implicitly + pd.testing.assert_frame_equal( + store.get_dataframe("persons"), + person_df.assign(status=[11, 22, 33, 44, 55]), + ) + + pd.testing.assert_frame_equal( + store.get_dataframe("level_of_service", "init_los"), + los_df, + ) + pd.testing.assert_frame_equal( + store.get_dataframe("level_of_service", "mod_persons"), + los_df, + ) + # messy data has mixed dtypes, falls back to pickle instead of parquet + pd.testing.assert_frame_equal( + store.get_dataframe("level_of_service"), + los_messy_df, + ) + + +def test_parquet_store(sample_parquet_store: Path, person_df, los_df, los_messy_df): + ps = ParquetStore(sample_parquet_store, mode="r") + _test_parquet_store(ps, person_df, los_df, los_messy_df) + + +def test_parquet_store_zip(sample_parquet_zip: Path, person_df, los_df, los_messy_df): + ps = ParquetStore(sample_parquet_zip, mode="r") + _test_parquet_store(ps, person_df, los_df, los_messy_df) diff --git a/activitysim/core/workflow/tracing.py b/activitysim/core/workflow/tracing.py new file mode 100644 index 0000000000..384e10f59b --- /dev/null +++ b/activitysim/core/workflow/tracing.py @@ -0,0 +1,745 @@ +from __future__ import annotations + +import ast +import csv +import logging +import logging.config +import os +import struct +import sys +import tarfile +import tempfile +import time +from collections.abc import Mapping, MutableMapping, Sequence +from pathlib import Path +from typing import Any, Optional + +import numpy as np +import pandas as pd +import yaml + +from activitysim.core import tracing +from activitysim.core.test import assert_equal, assert_frame_substantively_equal +from activitysim.core.workflow.accessor import FromState, 
StateAccessor + +logger = logging.getLogger(__name__) + +CSV_FILE_TYPE = "csv" + +DEFAULT_TRACEABLE_TABLES = [ + "households", + "persons", + "tours", + "joint_tour_participants", + "trips", + "vehicles", +] + + +class RunId(str): + def __new__(cls, x=None): + if x is None: + return cls( + hex(struct.unpack(" Path | None: + if self._obj is None: + return None + result = self._obj._context.get("tracing_validation_directory", None) + if isinstance(result, tempfile.TemporaryDirectory): + return Path(result.name) + return result + + @validation_directory.setter + def validation_directory(self, directory: Path | None): + if directory is None: + self._obj._context.pop("tracing_validation_directory", None) + else: + directory = Path(directory) + # decompress cache file into working directory + if directory.suffixes[-2:] == [".tar", ".gz"]: + tempdir = tempfile.TemporaryDirectory() + with tarfile.open(directory) as tfile: + tfile.extractall(tempdir.name) + self._obj._context["tracing_validation_directory"] = tempdir + else: + self._obj._context["tracing_validation_directory"] = directory + + def __get__(self, instance, objtype=None) -> "Tracing": + # derived __get__ changes annotation, aids in type checking + return super().__get__(instance, objtype) + + def initialize(self): + self.traceable_table_ids = {} + + def register_traceable_table(self, table_name: str, df: pd.DataFrame) -> None: + """ + Register traceable table + + Parameters + ---------- + table_name : str + df: pandas.DataFrame + The traced dataframe. + """ + + # add index name to traceable_table_indexes + + logger.debug(f"register_traceable_table {table_name}") + + traceable_tables = self.traceable_tables + if table_name not in traceable_tables: + logger.error("table '%s' not in traceable_tables" % table_name) + return + + idx_name = df.index.name + if idx_name is None: + logger.error("Can't register table '%s' without index name" % table_name) + return + + traceable_table_ids = self.traceable_table_ids + traceable_table_indexes = self.traceable_table_indexes + + if ( + idx_name in traceable_table_indexes + and traceable_table_indexes[idx_name] != table_name + ): + logger.error( + "table '%s' index name '%s' already registered for table '%s'" + % (table_name, idx_name, traceable_table_indexes[idx_name]) + ) + return + + # update traceable_table_indexes with this traceable_table's idx_name + if idx_name not in traceable_table_indexes: + traceable_table_indexes[idx_name] = table_name + logger.debug( + "adding table %s.%s to traceable_table_indexes" % (table_name, idx_name) + ) + self.traceable_table_indexes = traceable_table_indexes + + # add any new indexes associated with trace_hh_id to traceable_table_ids + + trace_hh_id = self._obj.settings.trace_hh_id + if trace_hh_id is None: + return + + new_traced_ids = [] + # if table_name == "households": + if table_name in ["households", "proto_households"]: + if trace_hh_id not in df.index: + logger.warning("trace_hh_id %s not in dataframe" % trace_hh_id) + new_traced_ids = [] + else: + logger.info( + "tracing household id %s in %s households" + % (trace_hh_id, len(df.index)) + ) + new_traced_ids = [trace_hh_id] + else: + # find first already registered ref_col we can use to slice this table + ref_col = next( + (c for c in traceable_table_indexes if c in df.columns), None + ) + + if ref_col is None: + logger.error( + "can't find a registered table to slice table '%s' index name '%s'" + " in traceable_table_indexes: %s" + % (table_name, idx_name, traceable_table_indexes) + ) + return + + # 
get traceable_ids for ref_col table + ref_col_table_name = traceable_table_indexes[ref_col] + ref_col_traced_ids = traceable_table_ids.get(ref_col_table_name, []) + + # inject list of ids in table we are tracing + # this allows us to slice by id without requiring presence of a household id column + traced_df = df[df[ref_col].isin(ref_col_traced_ids)] + new_traced_ids = traced_df.index.tolist() + if len(new_traced_ids) == 0: + logger.warning( + "register %s: no rows with %s in %s." + % (table_name, ref_col, ref_col_traced_ids) + ) + + # update the list of trace_ids for this table + prior_traced_ids = traceable_table_ids.get(table_name, []) + + if new_traced_ids: + assert not set(prior_traced_ids) & set(new_traced_ids) + traceable_table_ids[table_name] = prior_traced_ids + new_traced_ids + self.traceable_table_ids = traceable_table_ids + + logger.debug( + "register %s: added %s new ids to %s existing trace ids" + % (table_name, len(new_traced_ids), len(prior_traced_ids)) + ) + logger.debug( + "register %s: tracing new ids %s in %s" + % (table_name, new_traced_ids, table_name) + ) + + def deregister_traceable_table(self, table_name: str) -> None: + """ + un-register traceable table + + Parameters + ---------- + table_name : str + """ + traceable_table_ids = self.traceable_table_ids + traceable_table_indexes = self.traceable_table_indexes + + if table_name not in self.traceable_tables: + logger.error("table '%s' not in traceable_tables" % table_name) + + else: + self.traceable_table_ids = { + k: v for k, v in traceable_table_ids.items() if k != table_name + } + self.traceable_table_indexes = { + k: v for k, v in traceable_table_indexes.items() if v != table_name + } + + def write_csv( + self, + df, + file_name, + index_label=None, + columns=None, + column_labels=None, + transpose=True, + ): + """ + Print write_csv + + Parameters + ---------- + df: pandas.DataFrame or pandas.Series or dict + traced dataframe + file_name: str + output file name + index_label: str + index name + columns: list + columns to write + transpose: bool + whether to transpose dataframe (ignored for series) + Returns + ------- + Nothing + """ + + assert len(file_name) > 0 + + if not file_name.endswith(".%s" % CSV_FILE_TYPE): + file_name = "%s.%s" % (file_name, CSV_FILE_TYPE) + + file_path = self._obj.filesystem.get_trace_file_path( + file_name, tail=self.run_id + ) + + if os.name == "nt": + abs_path = os.path.abspath(file_path) + if len(abs_path) > 255: + msg = f"path length ({len(abs_path)}) may exceed Windows maximum length unless LongPathsEnabled: {abs_path}" + logger.warning(msg) + + if os.path.isfile(file_path): + logger.debug("write_csv file exists %s %s" % (type(df).__name__, file_name)) + + if isinstance(df, pd.DataFrame): + # logger.debug("dumping %s dataframe to %s" % (df.shape, file_name)) + tracing.write_df_csv( + df, file_path, index_label, columns, column_labels, transpose=transpose + ) + elif isinstance(df, pd.Series): + # logger.debug("dumping %s element series to %s" % (df.shape[0], file_name)) + tracing.write_series_csv(df, file_path, index_label, columns, column_labels) + elif isinstance(df, dict): + df = pd.Series(data=df) + # logger.debug("dumping %s element dict to %s" % (df.shape[0], file_name)) + tracing.write_series_csv(df, file_path, index_label, columns, column_labels) + else: + logger.error( + "write_csv object for file_name '%s' of unexpected type: %s" + % (file_name, type(df)) + ) + + def trace_df( + self, + df: pd.DataFrame, + label: str, + slicer=None, + columns: Optional[list[str]] = None, 
+ index_label=None, + column_labels=None, + transpose=True, + warn_if_empty=False, + ): + """ + Slice dataframe by traced household or person id dataframe and write to CSV + + Parameters + ---------- + state: workflow.State + df: pandas.DataFrame + traced dataframe + label: str + tracer name + slicer: Object + slicer for subsetting + columns: list + columns to write + index_label: str + index name + column_labels: [str, str] + labels for columns in csv + transpose: boolean + whether to transpose file for legibility + warn_if_empty: boolean + write warning if sliced df is empty + + Returns + ------- + Nothing + """ + + target_ids, column = self.get_trace_target(df, slicer) + + if target_ids is not None: + df = tracing.slice_ids(df, target_ids, column) + + if warn_if_empty and df.shape[0] == 0 and target_ids != []: + column_name = column or slicer + logger.warning( + "slice_canonically: no rows in %s with %s == %s" + % (label, column_name, target_ids) + ) + + if df.shape[0] > 0: + self.write_csv( + df, + file_name=label, + index_label=(index_label or slicer), + columns=columns, + column_labels=column_labels, + transpose=transpose, + ) + + if self.validation_directory: + skip_validation = False + if label.endswith("constants"): + skip_validation = ( + True # contants sometimes has skimwrapper objects added + ) + if not skip_validation: + try: + that_path = self._obj.filesystem.find_trace_file_path( + label, trace_dir=self.validation_directory, file_type="csv" + ) + except FileNotFoundError as err: + logger.warning( + f"trace validation file not found: {err}\n" + f" in validation_directory: {self.validation_directory}" + ) + else: + if transpose: + # wreaks havoc with pandas dtypes and column names + # check as a simple list of lists instead + def literal_eval(x): + try: + return ast.literal_eval(x) + except Exception: + return x + + def read_csv_as_list_of_lists(finame): + with open(finame, newline="") as csvfile: + return [ + list(map(literal_eval, row)) + for row in csv.reader(csvfile) + ] + + that_blob = read_csv_as_list_of_lists(that_path) + this_path = self._obj.filesystem.get_trace_file_path( + label, tail=self.run_id, file_type="csv" + ) + this_blob = read_csv_as_list_of_lists(this_path) + + _this_index = [i[0] for i in this_blob] + if len(set(_this_index)) == len(_this_index): + # indexes are unique, convert to dict + this_dict = dict( + zip( + [i[0] for i in this_blob], + [i[1:] for i in this_blob], + ) + ) + that_dict = dict( + zip( + [i[0] for i in that_blob], + [i[1:] for i in that_blob], + ) + ) + assert_equal(this_dict, that_dict) + else: + try: + assert_equal(this_blob, that_blob) + except: + logger.error(f"trace validation BAD: {label}") + raise + else: + logger.debug(f"trace validation OK: {label}") + else: + that_df = pd.read_csv(that_path) + # check against the file we just wrote + this_path = self._obj.filesystem.get_trace_file_path( + label, tail=self.run_id, file_type="csv" + ) + this_df = pd.read_csv(this_path) + assert_frame_substantively_equal(this_df, that_df) + logger.debug(f"trace validation OK: {label}") + + def trace_interaction_eval_results(self, trace_results, trace_ids, label): + """ + Trace model design eval results for interaction_simulate + + Parameters + ---------- + trace_results: pandas.DataFrame + traced model_design dataframe + trace_ids : tuple (str, numpy.ndarray) + column name and array of trace_ids from interaction_trace_rows() + used to filter the trace_results dataframe by traced hh or person id + label: str + tracer name + + Returns + ------- + 
Nothing + """ + + assert type(trace_ids[1]) == np.ndarray + + slicer_column_name = trace_ids[0] + + try: + trace_results[slicer_column_name] = trace_ids[1] + except ValueError: + trace_results[slicer_column_name] = int(trace_ids[1]) + + targets = np.unique(trace_ids[1]) + + if len(trace_results.index) == 0: + return + + # write out the raw dataframe + + file_path = self._obj.filesystem.get_trace_file_path( + "%s.raw.csv" % label, tail=self.run_id + ) + trace_results.to_csv(file_path, mode="a", index=True, header=True) + + # if there are multiple targets, we want them in separate tables for readability + for target in targets: + df_target = trace_results[trace_results[slicer_column_name] == target] + + # we want the transposed columns in predictable order + df_target.sort_index(inplace=True) + + # # remove the slicer (person_id or hh_id) column? + # del df_target[slicer_column_name] + + target_label = "%s.%s.%s" % (label, slicer_column_name, target) + + self.trace_df( + df_target, + label=target_label, + slicer="NONE", + transpose=True, + column_labels=["expression", None], + warn_if_empty=False, + ) + + def interaction_trace_rows(self, interaction_df, choosers, sample_size=None): + """ + Trace model design for interaction_simulate + + Parameters + ---------- + interaction_df: pandas.DataFrame + traced model_design dataframe + choosers: pandas.DataFrame + interaction_simulate choosers + (needed to filter the model_design dataframe by traced hh or person id) + sample_size int or None + int for constant sample size, or None if choosers have different numbers of alternatives + Returns + ------- + trace_rows : numpy.ndarray + array of booleans to flag which rows in interaction_df to trace + + trace_ids : tuple (str, numpy.ndarray) + column name and array of trace_ids mapping trace_rows to their target_id + for use by trace_interaction_eval_results which needs to know target_id + so it can create separate tables for each distinct target for readability + """ + + # slicer column name and id targets to use for chooser id added to model_design dataframe + # currently we only ever slice by person_id, but that could change, so we check here... 
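+        # A hypothetical illustration (the ids here are made up): traceable_table_ids
+        # maps a traced table name to the ids being traced, e.g.
+        #     {"households": [982875], "persons": [1888694, 1888695]}
+        # Given such a mapping, this method returns a boolean mask over the rows of
+        # interaction_df plus a (slicer_column_name, ids) tuple such as
+        #     ("person_id", array([1888694, 1888694, ...]))
+        # which trace_interaction_eval_results uses to split output by target id.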
+ + traceable_table_ids = self.traceable_table_ids + + # Determine whether actual tables or proto_ tables for disaggregate accessibilities + persons_table_name = set(traceable_table_ids).intersection( + ["persons", "proto_persons"] + ) + households_table_name = set(traceable_table_ids).intersection( + ["households", "proto_households"] + ) + + assert len(persons_table_name) == 1 and len(persons_table_name) == 1 + persons_table_name, households_table_name = ( + persons_table_name.pop(), + households_table_name.pop(), + ) + + if ( + choosers.index.name in ["person_id", "proto_person_id"] + ) and persons_table_name in traceable_table_ids: + slicer_column_name = choosers.index.name + targets = traceable_table_ids[persons_table_name] + elif ( + choosers.index.name in ["household_id", "proto_household_id"] + ) and households_table_name in traceable_table_ids: + slicer_column_name = choosers.index.name + targets = traceable_table_ids[households_table_name] + elif "household_id" in choosers.columns and "households" in traceable_table_ids: + slicer_column_name = "household_id" + targets = traceable_table_ids[households_table_name] + elif ( + "person_id" in choosers.columns + and persons_table_name in traceable_table_ids + ): + slicer_column_name = "person_id" + targets = traceable_table_ids[persons_table_name] + elif ( + choosers.index.name == "proto_tour_id" + and "proto_tours" in traceable_table_ids + ): + slicer_column_name = choosers.index.name + targets = traceable_table_ids["proto_tours"] + else: + print(choosers.columns) + raise RuntimeError( + "interaction_trace_rows don't know how to slice index '%s'" + % choosers.index.name + ) + + if sample_size is None: + # if sample size not constant, we count on either + # slicer column being in itneraction_df + # or index of interaction_df being same as choosers + if slicer_column_name in interaction_df.columns: + trace_rows = np.in1d(interaction_df[slicer_column_name], targets) + trace_ids = interaction_df.loc[trace_rows, slicer_column_name].values + else: + assert interaction_df.index.name == choosers.index.name + trace_rows = np.in1d(interaction_df.index, targets) + trace_ids = interaction_df[trace_rows].index.values + + else: + if slicer_column_name == choosers.index.name: + trace_rows = np.in1d(choosers.index, targets) + trace_ids = np.asanyarray(choosers[trace_rows].index) + elif slicer_column_name == "person_id": + trace_rows = np.in1d(choosers["person_id"], targets) + trace_ids = np.asanyarray(choosers[trace_rows].person_id) + elif slicer_column_name == "household_id": + trace_rows = np.in1d(choosers["household_id"], targets) + trace_ids = np.asanyarray(choosers[trace_rows].household_id) + else: + assert False + + # simply repeat if sample size is constant across choosers + assert sample_size == len(interaction_df.index) / len(choosers.index) + trace_rows = np.repeat(trace_rows, sample_size) + trace_ids = np.repeat(trace_ids, sample_size) + + assert type(trace_rows) == np.ndarray + assert type(trace_ids) == np.ndarray + + trace_ids = (slicer_column_name, trace_ids) + + return trace_rows, trace_ids + + def get_trace_target(self, df: pd.DataFrame, slicer: str, column: Any = None): + """ + get target ids and column or index to identify target trace rows in df + + Parameters + ---------- + df: pandas.DataFrame + This dataframe is to be sliced + slicer: str + name of column or index to use for slicing + column : Any + + Returns + ------- + target : int or list of ints + id or ids that identify tracer target rows + column : str + name of column to 
search for targets or None to search index + """ + + target_ids = ( + None # id or ids to slice by (e.g. hh_id or person_ids or tour_ids) + ) + + # special do-not-slice code for dumping entire df + if slicer == "NONE": + return target_ids, column + + if slicer is None: + slicer = df.index.name + + if isinstance(df, pd.DataFrame): + # always slice by household id if we can + if "household_id" in df.columns: + slicer = "household_id" + if slicer in df.columns: + column = slicer + + if column is None and df.index.name != slicer: + raise RuntimeError( + "bad slicer '%s' for df with index '%s'" % (slicer, df.index.name) + ) + + traceable_table_indexes = self.traceable_table_indexes + traceable_table_ids = self.traceable_table_ids + + if df.empty: + target_ids = None + elif slicer in traceable_table_indexes: + # maps 'person_id' to 'persons', etc + table_name = traceable_table_indexes[slicer] + target_ids = traceable_table_ids.get(table_name, []) + elif slicer == "zone_id": + target_ids = self._obj.settings.trace_od + + return target_ids, column + + def trace_targets(self, df, slicer=None, column=None): + target_ids, column = self.get_trace_target(df, slicer, column) + + if target_ids is None: + targets = None + else: + if column is None: + targets = df.index.isin(target_ids) + else: + # convert to numpy array for consistency since that is what index.isin returns + targets = df[column].isin(target_ids).to_numpy() + + return targets + + def has_trace_targets(self, df, slicer=None, column=None): + target_ids, column = self.get_trace_target(df, slicer, column) + + if target_ids is None: + found = False + else: + if column is None: + found = df.index.isin(target_ids).any() + else: + found = df[column].isin(target_ids).any() + + return found + + def dump_df(self, dump_switch, df, trace_label, fname): + if dump_switch: + trace_label = tracing.extend_trace_label(trace_label, "DUMP.%s" % fname) + self.trace_df( + df, + trace_label, + index_label=df.index.name, + slicer="NONE", + transpose=False, + ) + + def delete_output_files(self, file_type, ignore=None, subdir=None): + """ + Delete files in output directory of specified type. + + Parameters + ---------- + file_type : str + File extension to delete. + ignore : list[Path-like] + Specific files to leave alone. + subdir : list[Path-like], optional + Subdirectories to scrub. If not given, the top level output directory + plus the 'log' and 'trace' directories will be scrubbed. 
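+
+        Examples
+        --------
+        A minimal usage sketch, mirroring ``delete_trace_files`` below
+        (``CSV_FILE_TYPE`` is a constant defined elsewhere in this module)::
+
+            self.delete_output_files(CSV_FILE_TYPE, subdir="trace")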
+ """ + + output_dir = self._obj.filesystem.get_output_dir() + + subdir = [subdir] if subdir else None + directories = subdir or ["", "log", "trace"] + + for subdir in directories: + dir = output_dir.joinpath(output_dir, subdir) if subdir else output_dir + + if not dir.exists(): + continue + + if ignore: + ignore = [os.path.realpath(p) for p in ignore] + + # logger.debug("Deleting %s files in output dir %s" % (file_type, dir)) + + for the_file in os.listdir(dir): + if the_file.endswith(file_type): + file_path = os.path.join(dir, the_file) + + if ignore and os.path.realpath(file_path) in ignore: + continue + + try: + if os.path.isfile(file_path): + logger.debug("delete_output_files deleting %s" % file_path) + os.unlink(file_path) + except Exception as e: + print(e) + + def delete_trace_files(self): + """ + Delete CSV files in output_dir + """ + self.delete_output_files(CSV_FILE_TYPE, subdir="trace") + self.delete_output_files(CSV_FILE_TYPE, subdir="log") + + active_log_files = [ + h.baseFilename + for h in logger.root.handlers + if isinstance(h, logging.FileHandler) + ] + + self.delete_output_files("log", ignore=active_log_files) diff --git a/activitysim/core/workflow/util.py b/activitysim/core/workflow/util.py new file mode 100644 index 0000000000..37f327944f --- /dev/null +++ b/activitysim/core/workflow/util.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import logging +import os + +from pypyr.context import Context, KeyNotInContextError + +logger = logging.getLogger(__name__) + + +def get_formatted_or_raw(self: Context, key: str): + try: + return self.get_formatted(key) + except TypeError: + return self.get(key) + except Exception as err: + raise ValueError(f"extracting {key} from context") from err + + +def get_formatted_or_default(self: Context, key: str, default): + try: + return self.get_formatted(key) + except (KeyNotInContextError, KeyError): + return default + except TypeError: + return self.get(key) + except Exception as err: + raise ValueError(f"extracting {key} from context") from err + + +def get_override_or_formatted_or_default( + overrides: dict, self: Context, key: str, default +): + if key in overrides: + return overrides[key] + else: + return get_formatted_or_default(self, key, default) + + +def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + +def write_notebook_heading(text: str, heading_level: int | None = None) -> None: + """ + If running in a jupyter-like environment, display a heading. + + Parameters + ---------- + text : str + The heading to display + heading_level : int, optional + The heading level to use. Should be an integer from 1 to 6. + If omitted or zero, no heading is not displayed. 
+ """ + if heading_level and is_notebook(): + if heading_level < 0: + raise ValueError("negative heading levels not allowed") + if heading_level > 6: + # heading levels greater than 6 are not allowed + heading_level = 6 + import IPython.display + + IPython.display.display_markdown("#" * heading_level + f" {text}", raw=True) + + +def remove_empty_folders(path_abs): + walk = list(os.walk(path_abs)) + for path, _, _ in walk[::-1]: + if len(os.listdir(path)) == 0: + os.remove(path) diff --git a/activitysim/estimation/larch/cdap.py b/activitysim/estimation/larch/cdap.py index fdb801de03..761cce05b2 100644 --- a/activitysim/estimation/larch/cdap.py +++ b/activitysim/estimation/larch/cdap.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib import itertools import logging @@ -326,7 +328,7 @@ def read_yaml(filename, **kwargs): if person_type_map is None: raise KeyError("PERSON_TYPE_MAP missing from cdap_settings.yaml") - person_rank = cdap.assign_cdap_rank(persons, person_type_map) + person_rank = cdap.assign_cdap_rank(None, persons, person_type_map) coefficients = read_csv( coefficients_file, diff --git a/activitysim/estimation/test/test_larch_estimation.py b/activitysim/estimation/test/test_larch_estimation.py index b148009f3f..ec38c2a014 100644 --- a/activitysim/estimation/test/test_larch_estimation.py +++ b/activitysim/estimation/test/test_larch_estimation.py @@ -1,43 +1,35 @@ +from __future__ import annotations + +import datetime import os -import subprocess -import tempfile +from pathlib import Path import pandas as pd +import platformdirs import pytest -from activitysim.cli.create import get_example +from activitysim.core import workflow @pytest.fixture(scope="module") def est_data(): - cwd = os.getcwd() - tempdir = tempfile.TemporaryDirectory() - os.chdir(tempdir.name) - - get_example("example_estimation_sf", "_test_est") - os.chdir("_test_est") - - # !activitysim run -c configs_estimation/configs -c configs -o output -d data_sf - print(f"List of files now in {os.getcwd()}") - subprocess.run(["find", "."]) - print(f"\n\nrunning activitysim estimation mode in {os.getcwd()}") - subprocess.run( - [ - "activitysim", - "run", - "-c", - "configs_estimation/configs", - "-c", - "configs", - "-o", - "output", - "-d", - "data_sf", - ], + working_dir = Path(platformdirs.user_cache_dir(appname="ActivitySim")).joinpath( + f"estimation-test-base" ) + working_dir.mkdir(parents=True, exist_ok=True) + os.chdir(working_dir) + if not working_dir.joinpath("success.txt").exists(): + import activitysim.abm + + state = workflow.create_example("example_estimation_sf", directory=working_dir) + state.run.all() + working_dir.joinpath("success.txt").write_text( + datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S.%f") + ) - yield os.getcwd() + os.chdir(working_dir.joinpath("example_estimation_sf")) + yield str(working_dir.joinpath("example_estimation_sf")) os.chdir(cwd) diff --git a/activitysim/estimation/test/test_larch_estimation/test_cdap_model.csv b/activitysim/estimation/test/test_larch_estimation/test_cdap_model.csv index 2ce2d1807a..a9e237b235 100644 --- a/activitysim/estimation/test/test_larch_estimation/test_cdap_model.csv +++ b/activitysim/estimation/test/test_larch_estimation/test_cdap_model.csv @@ -1,162 +1,162 @@ ,value,initvalue,nullvalue,minimum,maximum,best coef_UNAVAILABLE,-999,-999,0,-999,-999,-999 -coef_child_who_is_in_school_or_too_young_for_school_interaction_with_off_peak_accessibility_to_retail_N,0.85692976124501019,0.08233,0,,,0.85692976124501019 
-coef_driving_age_child_who_is_in_school_asc_M,0.73848046972910941,2.3309186849999999,0,,,0.73848046972910941 -coef_driving_age_child_who_is_in_school_asc_N,-8.9308855274487637,-0.59911911200000001,0,,,-8.9308855274487637 -coef_driving_age_child_who_is_in_school_interaction_income_between_50k_and_100k_H,-0.93853079335916489,-0.50309999999999999,0,,,-0.93853079335916489 -coef_driving_age_child_who_is_in_school_interaction_with_fewer_cars_than_workers_H,0.4088477129872149,0.64749999999999996,0,,,0.4088477129872149 -coef_driving_age_child_who_is_in_school_interaction_with_income_more_than_100k_H,-1.7505782057913633,-2.0459999999999998,0,,,-1.7505782057913633 -coef_driving_age_child_who_is_in_school_interaction_with_less_than_20k_H,0.45641220632252377,1.3069999999999999,0,,,0.45641220632252377 -coef_full_time_worker_asc_M,1.0349870898895677,1.3787345790000001,0,,,1.0349870898895677 -coef_full_time_worker_asc_N,0.74205343594968753,0.62266239099999998,0,,,0.74205343594968753 -coef_full_time_worker_interaction_with_age_less_than_40_M,0.43851597545706389,0.20910000000000001,0,,,0.43851597545706389 -coef_full_time_worker_interaction_with_female_gender_M,0.052938784327757844,-0.12590000000000001,0,,,0.052938784327757844 -coef_full_time_worker_interaction_with_fewer_cars_than_workers_H,0.41848366916078727,0.50390000000000001,0,,,0.41848366916078727 -coef_full_time_worker_interaction_with_income_less_than_20k_H,0.35894137920266034,0.53129999999999999,0,,,0.35894137920266034 -coef_full_time_worker_intraction_with_peak_accessibility_to_all_employment_M,0.14689866226825507,0.1212,0,,,0.14689866226825507 -coef_non_working_adult_asc_N,0.87387238182319815,0.59464538600000005,0,,,0.87387238182319815 +coef_child_who_is_in_school_or_too_young_for_school_interaction_with_off_peak_accessibility_to_retail_N,0.87159546592932291,0.08233,0,,,0.87159546592932291 +coef_driving_age_child_who_is_in_school_asc_M,0.74815228797618816,2.3309186849999999,0,,,0.74815228797618816 +coef_driving_age_child_who_is_in_school_asc_N,-9.0640177490324412,-0.59911911200000001,0,,,-9.0640177490324412 +coef_driving_age_child_who_is_in_school_interaction_income_between_50k_and_100k_H,-0.90601686275009163,-0.50309999999999999,0,,,-0.90601686275009163 +coef_driving_age_child_who_is_in_school_interaction_with_fewer_cars_than_workers_H,0.42727506400961712,0.64749999999999996,0,,,0.42727506400961712 +coef_driving_age_child_who_is_in_school_interaction_with_income_more_than_100k_H,-1.7272824609972235,-2.0459999999999998,0,,,-1.7272824609972235 +coef_driving_age_child_who_is_in_school_interaction_with_less_than_20k_H,0.45396678614268526,1.3069999999999999,0,,,0.45396678614268526 +coef_full_time_worker_asc_M,0.98435866822120488,1.3787345790000001,0,,,0.98435866822120488 +coef_full_time_worker_asc_N,0.73324510331920412,0.62266239099999998,0,,,0.73324510331920412 +coef_full_time_worker_interaction_with_age_less_than_40_M,0.44443952843908197,0.20910000000000001,0,,,0.44443952843908197 +coef_full_time_worker_interaction_with_female_gender_M,0.038410812572387439,-0.12590000000000001,0,,,0.038410812572387439 +coef_full_time_worker_interaction_with_fewer_cars_than_workers_H,0.41538570763618882,0.50390000000000001,0,,,0.41538570763618882 +coef_full_time_worker_interaction_with_income_less_than_20k_H,0.44991051165021323,0.53129999999999999,0,,,0.44991051165021323 +coef_full_time_worker_intraction_with_peak_accessibility_to_all_employment_M,0.15132046128679377,0.1212,0,,,0.15132046128679377 
+coef_non_working_adult_asc_N,0.61315184259117572,0.59464538600000005,0,,,0.61315184259117572 coef_non_working_adult_interaction_with_female_gender_M,-0.74299999999999999,-0.74299999999999999,0,,,-0.74299999999999999 -coef_non_working_adult_interaction_with_fewer_cars_than_workers_H,0.79722787615405288,0.89649999999999996,0,,,0.79722787615405288 -coef_non_working_adult_interaction_with_income_between_50k_and_100k_H,-1.2296785020118925,-0.56020000000000003,0,,,-1.2296785020118925 -coef_non_working_adult_interaction_with_income_more_than_100k_H,-0.68256354997790902,-0.71879999999999999,0,,,-0.68256354997790902 +coef_non_working_adult_interaction_with_fewer_cars_than_workers_H,0.80008083982324651,0.89649999999999996,0,,,0.80008083982324651 +coef_non_working_adult_interaction_with_income_between_50k_and_100k_H,-1.135877229516298,-0.56020000000000003,0,,,-1.135877229516298 +coef_non_working_adult_interaction_with_income_more_than_100k_H,-0.53695636446511075,-0.71879999999999999,0,,,-0.53695636446511075 coef_non_working_adult_interaction_with_more_cars_than_workers_M,0.65149999999999997,0.65149999999999997,0,,,0.65149999999999997 -coef_non_working_adult_interaction_with_more_cars_than_workers_N,1.3261941742938397,0.81679999999999997,0,,,1.3261941742938397 +coef_non_working_adult_interaction_with_more_cars_than_workers_N,1.4756696612117433,0.81679999999999997,0,,,1.4756696612117433 coef_non_working_adult_interaction_with_peak_accessibility_to_all_employment_M,0.23139999999999999,0.23139999999999999,0,,,0.23139999999999999 -coef_non_working_adult_retired_or_univ_student_interaction_with_off_peak_accessibility_to_all_employment_N,0.017589208634449675,0.072069999999999995,0,,,0.017589208634449675 -coef_part_time_worker_asc_M,4.9000426829635204,-0.71882373799999999,0,,,4.9000426829635204 -coef_part_time_worker_asc_N,0.65921416326577464,0.63603246700000005,0,,,0.65921416326577464 -coef_part_time_worker_interaction_with_income_between_50k_and_100k_H,0.17186717026517481,-0.4032,0,,,0.17186717026517481 -coef_part_time_worker_interaction_with_income_less_than_20k_H,0.38041904791748155,0.32319999999999999,0,,,0.38041904791748155 -coef_part_time_worker_interaction_with_income_more_than_100k_H,-1.362929826508287,-0.35339999999999999,0,,,-1.362929826508287 -coef_part_time_worker_interaction_with_income_more_than_100k_N,0.49250876189647425,0.42070000000000002,0,,,0.49250876189647425 -coef_part_time_worker_interaction_with_peak_accessibility_to_all_employment_M,-0.24708976803064769,0.20039999999999999,0,,,-0.24708976803064769 -coef_pre_driving_age_child_who_is_in_school_asc_M,3.9459941307419402,3.295863529,0,,,3.9459941307419402 -coef_pre_driving_age_child_who_is_in_school_asc_N,-6.8110611320385024,0.57142433999999998,0,,,-6.8110611320385024 -coef_pre_driving_age_child_who_is_in_school_interaction_with_age_13_to_15_M,-1.5479969903363138,-0.71409999999999996,0,,,-1.5479969903363138 -coef_pre_driving_age_child_who_is_in_school_interaction_with_age_13_to_15_N,-1.1527677554511755,-0.67200000000000004,0,,,-1.1527677554511755 -coef_pre_driving_age_child_who_is_in_school_interaction_with_age_6_to_9_M,-0.67028640157584041,-0.29430000000000001,0,,,-0.67028640157584041 -coef_pre_driving_age_child_who_is_in_school_interaction_with_fewer_cars_than_workers_H,1.1016412307552537,0.58620000000000005,0,,,1.1016412307552537 -coef_pre_driving_age_child_who_is_too_young_for_school_asc_M,0.93761313981906003,1.052531189,0,,,0.93761313981906003 
-coef_pre_driving_age_child_who_is_too_young_for_school_asc_N,-8.4726758731572183,-0.83756777599999999,0,,,-8.4726758731572183 -coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_age_0_to_1_M,-0.94370983129521369,-0.45150000000000001,0,,,-0.94370983129521369 -coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_age_4_to_5_M,0.23942677869103898,0.61070000000000002,0,,,0.23942677869103898 -coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_fewer_cars_than_workers_H,-0.062626330292437762,0.50609999999999999,0,,,-0.062626330292437762 -coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_income_between_50k_and_100k_H,-0.72315508703120213,-0.57079999999999997,0,,,-0.72315508703120213 -coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_income_more_than_100k_H,-0.78255504212379234,-0.61860000000000004,0,,,-0.78255504212379234 -coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_more_cars_than_workers_N,0.77202970270645432,0.29909999999999998,0,,,0.77202970270645432 -coef_retired_asc_N,1.5364416553612539,0.408202071,0,,,1.5364416553612539 -coef_retired_interaction_with_age_more_than_80_H,1.1681808203688446,0.76659999999999995,0,,,1.1681808203688446 +coef_non_working_adult_retired_or_univ_student_interaction_with_off_peak_accessibility_to_all_employment_N,0.05369614335005668,0.072069999999999995,0,,,0.05369614335005668 +coef_part_time_worker_asc_M,4.6632375293425374,-0.71882373799999999,0,,,4.6632375293425374 +coef_part_time_worker_asc_N,0.63327669200378511,0.63603246700000005,0,,,0.63327669200378511 +coef_part_time_worker_interaction_with_income_between_50k_and_100k_H,0.084764438455744495,-0.4032,0,,,0.084764438455744495 +coef_part_time_worker_interaction_with_income_less_than_20k_H,0.38471102017852971,0.32319999999999999,0,,,0.38471102017852971 +coef_part_time_worker_interaction_with_income_more_than_100k_H,-1.3396802216835464,-0.35339999999999999,0,,,-1.3396802216835464 +coef_part_time_worker_interaction_with_income_more_than_100k_N,0.57320315896108665,0.42070000000000002,0,,,0.57320315896108665 +coef_part_time_worker_interaction_with_peak_accessibility_to_all_employment_M,-0.22927275954114743,0.20039999999999999,0,,,-0.22927275954114743 +coef_pre_driving_age_child_who_is_in_school_asc_M,3.9697740814213565,3.295863529,0,,,3.9697740814213565 +coef_pre_driving_age_child_who_is_in_school_asc_N,-6.9669814176755258,0.57142433999999998,0,,,-6.9669814176755258 +coef_pre_driving_age_child_who_is_in_school_interaction_with_age_13_to_15_M,-1.5862536567965162,-0.71409999999999996,0,,,-1.5862536567965162 +coef_pre_driving_age_child_who_is_in_school_interaction_with_age_13_to_15_N,-1.1444471156694567,-0.67200000000000004,0,,,-1.1444471156694567 +coef_pre_driving_age_child_who_is_in_school_interaction_with_age_6_to_9_M,-0.71694326641444928,-0.29430000000000001,0,,,-0.71694326641444928 +coef_pre_driving_age_child_who_is_in_school_interaction_with_fewer_cars_than_workers_H,1.0962667615224149,0.58620000000000005,0,,,1.0962667615224149 +coef_pre_driving_age_child_who_is_too_young_for_school_asc_M,0.92486435043901694,1.052531189,0,,,0.92486435043901694 +coef_pre_driving_age_child_who_is_too_young_for_school_asc_N,-8.6130259294995177,-0.83756777599999999,0,,,-8.6130259294995177 +coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_age_0_to_1_M,-0.94468281293415513,-0.45150000000000001,0,,,-0.94468281293415513 
+coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_age_4_to_5_M,0.23268035055770159,0.61070000000000002,0,,,0.23268035055770159 +coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_fewer_cars_than_workers_H,-0.056007934757943807,0.50609999999999999,0,,,-0.056007934757943807 +coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_income_between_50k_and_100k_H,-0.71502868941234388,-0.57079999999999997,0,,,-0.71502868941234388 +coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_income_more_than_100k_H,-0.7760438496909059,-0.61860000000000004,0,,,-0.7760438496909059 +coef_pre_driving_age_child_who_is_too_young_for_school_interaction_with_more_cars_than_workers_N,0.77447196962877585,0.29909999999999998,0,,,0.77447196962877585 +coef_retired_asc_N,1.1425900456454805,0.408202071,0,,,1.1425900456454805 +coef_retired_interaction_with_age_more_than_80_H,1.1363693439466673,0.76659999999999995,0,,,1.1363693439466673 coef_retired_interaction_with_female_M,0.47689999999999999,0.47689999999999999,0,,,0.47689999999999999 -coef_retired_interaction_with_fewer_cars_than_workers_H,0.84933419475176275,0.54959999999999998,0,,,0.84933419475176275 -coef_retired_interaction_with_income_less_than_20k_H,0.63470799837285252,0.53300000000000003,0,,,0.63470799837285252 +coef_retired_interaction_with_fewer_cars_than_workers_H,1.1295952477348739,0.54959999999999998,0,,,1.1295952477348739 +coef_retired_interaction_with_income_less_than_20k_H,0.65398902348188781,0.53300000000000003,0,,,0.65398902348188781 coef_retired_interaction_with_more_cars_than_workers_M,2.992,2.992,0,,,2.992 -coef_retired_interaction_with_more_cars_than_workers_N,0.71261516449491702,1.056,0,,,0.71261516449491702 +coef_retired_interaction_with_more_cars_than_workers_N,0.76262176538305015,1.056,0,,,0.76262176538305015 coef_retired_interaction_with_peak_accessibility_to_all_employment_M,0.2792,0.2792,0,,,0.2792 -coef_university_student_asc_M,2.1168963081096361,2.3535951759999998,0,,,2.1168963081096361 -coef_university_student_asc_N,0.80965567152181694,0.609709846,0,,,0.80965567152181694 +coef_university_student_asc_M,2.1206612172447477,2.3535951759999998,0,,,2.1206612172447477 +coef_university_student_asc_N,0.47323909115586493,0.609709846,0,,,0.47323909115586493 -999.0,-999,-999,-999,-999,-999,-999 -coef_H_11,1.4760964655352482,1.6259999999999999,0,,,1.4760964655352482 -coef_H_12,-0.064158230056873197,0.74070000000000003,0,,,-0.064158230056873197 -coef_H_13,1.493250407209171,1.1830000000000001,0,,,1.493250407209171 -coef_H_14,1.2067361944629116,0.94359999999999999,0,,,1.2067361944629116 -coef_H_15,0.96634010744463039,1.298,0,,,0.96634010744463039 -coef_H_16,1.5407052927368943,2.0640000000000001,0,,,1.5407052927368943 -coef_H_17,1.4643057546116216,1.5009999999999999,0,,,1.4643057546116216 -coef_H_18,1.2137468768191544,0.99119999999999997,0,,,1.2137468768191544 -coef_H_22,-20.048185633111903,0.8911,0,,,-20.048185633111903 -coef_H_23,0.76133088237279201,1.6419999999999999,0,,,0.76133088237279201 -coef_H_24,0.965915261743025,0.70569999999999999,0,,,0.965915261743025 -coef_H_25,-18.390425378629139,0.46300000000000002,0,,,-18.390425378629139 -coef_H_26,4.1358263879148796,3.0569999999999999,0,,,4.1358263879148796 -coef_H_27,0.40240985829557219,0.76849999999999996,0,,,0.40240985829557219 -coef_H_28,-17.787339403083198,1.0700000000000001,0,,,-17.787339403083198 -coef_H_33,0.9552059166675555,1.018,0,,,0.9552059166675555 
-coef_H_34,1.5575240015522469,1.7809999999999999,0,,,1.5575240015522469 -coef_H_35,-18.276728044549817,0.48349999999999999,0,,,-18.276728044549817 +coef_H_11,1.5446242163709172,1.6259999999999999,0,,,1.5446242163709172 +coef_H_12,0.15833568776463425,0.74070000000000003,0,,,0.15833568776463425 +coef_H_13,1.5389002358533881,1.1830000000000001,0,,,1.5389002358533881 +coef_H_14,1.3436422025992563,0.94359999999999999,0,,,1.3436422025992563 +coef_H_15,0.62460727406650574,1.298,0,,,0.62460727406650574 +coef_H_16,1.521735383660767,2.0640000000000001,0,,,1.521735383660767 +coef_H_17,1.4606474415665514,1.5009999999999999,0,,,1.4606474415665514 +coef_H_18,1.2139109701831139,0.99119999999999997,0,,,1.2139109701831139 +coef_H_22,-15.923116744497655,0.8911,0,,,-15.923116744497655 +coef_H_23,0.73052600611455265,1.6419999999999999,0,,,0.73052600611455265 +coef_H_24,1.0651655384578942,0.70569999999999999,0,,,1.0651655384578942 +coef_H_25,-16.202420121437097,0.46300000000000002,0,,,-16.202420121437097 +coef_H_26,4.156194778954351,3.0569999999999999,0,,,4.156194778954351 +coef_H_27,0.40222213291144671,0.76849999999999996,0,,,0.40222213291144671 +coef_H_28,-15.469749038901073,1.0700000000000001,0,,,-15.469749038901073 +coef_H_33,0.99710939823398292,1.018,0,,,0.99710939823398292 +coef_H_34,1.6743259141961955,1.7809999999999999,0,,,1.6743259141961955 +coef_H_35,-17.763631508602064,0.48349999999999999,0,,,-17.763631508602064 coef_H_36,1.546,1.546,0,,,1.546 coef_H_37,1.552,1.552,0,,,1.552 coef_H_38,1.3400000000000001,1.3400000000000001,0,,,1.3400000000000001 -coef_H_44,1.5193676847366135,1.3520000000000001,0,,,1.5193676847366135 -coef_H_45,2.052832778022585,1.2090000000000001,0,,,2.052832778022585 +coef_H_44,1.7326164145508278,1.3520000000000001,0,,,1.7326164145508278 +coef_H_45,2.1749474138597793,1.2090000000000001,0,,,2.1749474138597793 coef_H_46,0.52429999999999999,0.52429999999999999,0,,,0.52429999999999999 coef_H_47,0.81120000000000003,0.81120000000000003,0,,,0.81120000000000003 coef_H_48,1.167,1.167,0,,,1.167 -coef_H_55,1.0130996289124303,1.407,0,,,1.0130996289124303 +coef_H_55,0.98410006600474542,1.407,0,,,0.98410006600474542 coef_H_56_57_58,0.86319999999999997,0.86319999999999997,0,,,0.86319999999999997 -coef_H_66,20.828610446733244,2.198,0,,,20.828610446733244 -coef_H_67,-22.167021114601525,0.97699999999999998,0,,,-22.167021114601525 +coef_H_66,18.485095476410191,2.198,0,,,18.485095476410191 +coef_H_67,-18.838145375909068,0.97699999999999998,0,,,-18.838145375909068 coef_H_68,1.4670000000000001,1.4670000000000001,0,,,1.4670000000000001 -coef_H_77,2.4238673331441256,2.7999999999999998,0,,,2.4238673331441256 -coef_H_78,18.7395851650442,1.4339999999999999,0,,,18.7395851650442 -coef_H_88,1.1842097491232364,1.3779999999999999,0,,,1.1842097491232364 -coef_M_11,-0.10315745218079309,0.14099999999999999,0,,,-0.10315745218079309 -coef_M_12,0.24289066416954588,0.088450000000000001,0,,,0.24289066416954588 -coef_M_13,0.23512628556691934,0.42730000000000001,0,,,0.23512628556691934 -coef_M_16,0.37436260076978561,0.38419999999999999,0,,,0.37436260076978561 -coef_M_17,-0.12351627151201074,0.26229999999999998,0,,,-0.12351627151201074 -coef_M_18,0.26156298152357188,0.51180000000000003,0,,,0.26156298152357188 -coef_M_22,0.8876822133090766,1.135,0,,,0.8876822133090766 -coef_M_23,0.16967023848026888,0.17299999999999999,0,,,0.16967023848026888 -coef_M_26,1.9043268450318949,1.103,0,,,1.9043268450318949 -coef_M_27,0.098747793809281739,0.30790000000000001,0,,,0.098747793809281739 
-coef_M_28,0.37788788825830905,0.50739999999999996,0,,,0.37788788825830905 -coef_M_33,0.46951340963465871,0.87260000000000004,0,,,0.46951340963465871 +coef_H_77,2.4644108765258532,2.7999999999999998,0,,,2.4644108765258532 +coef_H_78,16.353710938881033,1.4339999999999999,0,,,16.353710938881033 +coef_H_88,1.189203293091496,1.3779999999999999,0,,,1.189203293091496 +coef_M_11,-0.069163059806229873,0.14099999999999999,0,,,-0.069163059806229873 +coef_M_12,0.27916204731988919,0.088450000000000001,0,,,0.27916204731988919 +coef_M_13,0.29314801601836593,0.42730000000000001,0,,,0.29314801601836593 +coef_M_16,0.39262282583718466,0.38419999999999999,0,,,0.39262282583718466 +coef_M_17,-0.10503260073739486,0.26229999999999998,0,,,-0.10503260073739486 +coef_M_18,0.28872595185423811,0.51180000000000003,0,,,0.28872595185423811 +coef_M_22,0.94589050141866338,1.135,0,,,0.94589050141866338 +coef_M_23,0.1879379616749213,0.17299999999999999,0,,,0.1879379616749213 +coef_M_26,1.9267776692819061,1.103,0,,,1.9267776692819061 +coef_M_27,0.1161255949291411,0.30790000000000001,0,,,0.1161255949291411 +coef_M_28,0.40930716183312305,0.50739999999999996,0,,,0.40930716183312305 +coef_M_33,0.42731234923519629,0.87260000000000004,0,,,0.42731234923519629 coef_M_36,-0.0020999999999999999,-0.0020999999999999999,0,,,-0.0020999999999999999 coef_M_37,0.29749999999999999,0.29749999999999999,0,,,0.29749999999999999 coef_M_38,0.22539999999999999,0.22539999999999999,0,,,0.22539999999999999 -coef_M_66,19.233945823086177,0.47939999999999999,0,,,19.233945823086177 -coef_M_67,-20.972852325241863,0.5151,0,,,-20.972852325241863 +coef_M_66,16.866971034774586,0.47939999999999999,0,,,16.866971034774586 +coef_M_67,-18.934085377461077,0.5151,0,,,-18.934085377461077 coef_M_68,0.55159999999999998,0.55159999999999998,0,,,0.55159999999999998 -coef_M_77,1.1716655129786113,0.97309999999999997,0,,,1.1716655129786113 -coef_M_78,-1.6161731715056533,0.59609999999999996,0,,,-1.6161731715056533 -coef_M_88,1.1434842530496023,1.651,0,,,1.1434842530496023 -coef_N_11,0.77649157176524208,1.123,0,,,0.77649157176524208 -coef_N_12,0.488124079368988,0.49469999999999997,0,,,0.488124079368988 -coef_N_13,0.49743796422025144,0.55230000000000001,0,,,0.49743796422025144 -coef_N_14,-0.35982728457022323,0.021860000000000001,0,,,-0.35982728457022323 -coef_N_15,0.36564167259654867,0.3115,0,,,0.36564167259654867 -coef_N_16,0.86415804495299897,0.40949999999999998,0,,,0.86415804495299897 -coef_N_17,-0.15493465142131949,0.6008,0,,,-0.15493465142131949 -coef_N_18,0.061553453384430544,0.751,0,,,0.061553453384430544 -coef_N_22,0.78153353062803688,1.032,0,,,0.78153353062803688 -coef_N_23,0.51787026611388409,0.33550000000000002,0,,,0.51787026611388409 -coef_N_24,1.207010392275131,0.74770000000000003,0,,,1.207010392275131 -coef_N_25,-0.30432121041501858,0.098309999999999995,0,,,-0.30432121041501858 -coef_N_26,-17.935679569781499,0.495,0,,,-17.935679569781499 -coef_N_27,0.29124572708554775,0.89839999999999998,0,,,0.29124572708554775 -coef_N_28,0.93187705384090591,1.452,0,,,0.93187705384090591 -coef_N_33,0.97616884807409698,1.054,0,,,0.97616884807409698 -coef_N_34,-0.19667396077248783,0.193,0,,,-0.19667396077248783 -coef_N_35,0.072084224013530651,0.40649999999999997,0,,,0.072084224013530651 +coef_M_77,1.1689185377597398,0.97309999999999997,0,,,1.1689185377597398 +coef_M_78,-1.5331233891804061,0.59609999999999996,0,,,-1.5331233891804061 +coef_M_88,1.1505839302746146,1.651,0,,,1.1505839302746146 +coef_N_11,0.87318887937108214,1.123,0,,,0.87318887937108214 
+coef_N_12,0.24183915201361481,0.49469999999999997,0,,,0.24183915201361481 +coef_N_13,0.59192245194065063,0.55230000000000001,0,,,0.59192245194065063 +coef_N_14,-0.11520399026943709,0.021860000000000001,0,,,-0.11520399026943709 +coef_N_15,0.35837177201195769,0.3115,0,,,0.35837177201195769 +coef_N_16,0.87560206874032775,0.40949999999999998,0,,,0.87560206874032775 +coef_N_17,-0.12677865514794309,0.6008,0,,,-0.12677865514794309 +coef_N_18,0.10704226969734577,0.751,0,,,0.10704226969734577 +coef_N_22,0.72707556913323979,1.032,0,,,0.72707556913323979 +coef_N_23,0.54094331873208856,0.33550000000000002,0,,,0.54094331873208856 +coef_N_24,1.2778673813277723,0.74770000000000003,0,,,1.2778673813277723 +coef_N_25,-0.44876565834828996,0.098309999999999995,0,,,-0.44876565834828996 +coef_N_26,-14.897952319514477,0.495,0,,,-14.897952319514477 +coef_N_27,0.3031034553268101,0.89839999999999998,0,,,0.3031034553268101 +coef_N_28,0.90141537238007852,1.452,0,,,0.90141537238007852 +coef_N_33,0.94126692083673891,1.054,0,,,0.94126692083673891 +coef_N_34,-0.17640412257232874,0.193,0,,,-0.17640412257232874 +coef_N_35,0.10306375422257623,0.40649999999999997,0,,,0.10306375422257623 coef_N_36,1.6200000000000001,1.6200000000000001,0,,,1.6200000000000001 coef_N_37,0.51649999999999996,0.51649999999999996,0,,,0.51649999999999996 coef_N_38,0.89729999999999999,0.89729999999999999,0,,,0.89729999999999999 -coef_N_44,0.49234460682467146,0.69840000000000002,0,,,0.49234460682467146 -coef_N_45,0.079463133627298746,0.18640000000000001,0,,,0.079463133627298746 +coef_N_44,0.18753988004006014,0.69840000000000002,0,,,0.18753988004006014 +coef_N_45,0.10139966319988514,0.18640000000000001,0,,,0.10139966319988514 coef_N_46,0.68010000000000004,0.68010000000000004,0,,,0.68010000000000004 coef_N_47,0.56459999999999999,0.56459999999999999,0,,,0.56459999999999999 coef_N_48,1.1639999999999999,1.1639999999999999,0,,,1.1639999999999999 -coef_N_55,0.71170975542229609,0.72909999999999997,0,,,0.71170975542229609 +coef_N_55,0.71481848079980614,0.72909999999999997,0,,,0.71481848079980614 coef_N_56_57_58,0.29189999999999999,0.29189999999999999,0,,,0.29189999999999999 -coef_N_66,-3.0799970068090738,1.512,0,,,-3.0799970068090738 -coef_N_67,-17.056285405105893,1.4219999999999999,0,,,-17.056285405105893 +coef_N_66,-2.4730624946698345,1.512,0,,,-2.4730624946698345 +coef_N_67,-14.740556739194025,1.4219999999999999,0,,,-14.740556739194025 coef_N_68,1.2729999999999999,1.2729999999999999,0,,,1.2729999999999999 -coef_N_77,2.2686582220936948,1.5529999999999999,0,,,2.2686582220936948 -coef_N_78,-0.3962846317912857,0.61839999999999995,0,,,-0.3962846317912857 -coef_N_88,-0.18416905539843373,0.87709999999999999,0,,,-0.18416905539843373 -coef_H_124_122_144,0.69278315447063155,0.95730000000000004,0,,,0.69278315447063155 -coef_H_126_146,1.7014712183693104,0.29389999999999999,0,,,1.7014712183693104 -coef_H_222_224_244,-9.0699293225103421,0.98809999999999998,0,,,-9.0699293225103421 -coef_H_226_246_446,39.460567637960175,0.43740000000000001,0,,,39.460567637960175 +coef_N_77,2.3130640609159334,1.5529999999999999,0,,,2.3130640609159334 +coef_N_78,-0.30407334622872023,0.61839999999999995,0,,,-0.30407334622872023 +coef_N_88,-0.15881741099470906,0.87709999999999999,0,,,-0.15881741099470906 +coef_H_124_122_144,0.33133065098679593,0.95730000000000004,0,,,0.33133065098679593 +coef_H_126_146,1.5514557242593745,0.29389999999999999,0,,,1.5514557242593745 +coef_H_222_224_244,-9.203498518007823,0.98809999999999998,0,,,-9.203498518007823 
+coef_H_226_246_446,34.311085923988777,0.43740000000000001,0,,,34.311085923988777 coef_H_266_466,0.47470000000000001,0.47470000000000001,0,,,0.47470000000000001 -coef_H_xxxxx,-4.4491970695320449,-8.6210000000000004,0,,,-4.4491970695320449 -coef_M_111,0.26484774815280648,0.31330000000000002,0,,,0.26484774815280648 -coef_M_112_114,-0.13344931506337007,0.34949999999999998,0,,,-0.13344931506337007 +coef_H_xxxxx,-4.3950349906415909,-8.6210000000000004,0,,,-4.3950349906415909 +coef_M_111,0.22029613237472445,0.31330000000000002,0,,,0.22029613237472445 +coef_M_112_114,-0.0040326112064052802,0.34949999999999998,0,,,-0.0040326112064052802 coef_M_666,-0.3906,-0.3906,0,,,-0.3906 -coef_M_xxxxx,-0.040213814658512113,-1.528,0,,,-0.040213814658512113 -coef_N_112_114,-0.45158995385066814,0.4637,0,,,-0.45158995385066814 -coef_N_124_122_144,0.99671035413350717,0.34910000000000002,0,,,0.99671035413350717 -coef_N_166,-1.9761012859626266,0.3553,0,,,-1.9761012859626266 -coef_N_222_224_444,-2.0462327181587683,-1.3859999999999999,0,,,-2.0462327181587683 -coef_N_246_226_446,-0.85937641201738035,-0.85709999999999997,0,,,-0.85937641201738035 -coef_N_xxxxx,-0.82569094329321668,-3.4529999999999998,0,,,-0.82569094329321668 +coef_M_xxxxx,-0.094419014924773184,-1.528,0,,,-0.094419014924773184 +coef_N_112_114,0.38961790319791489,0.4637,0,,,0.38961790319791489 +coef_N_124_122_144,0.58723976265654321,0.34910000000000002,0,,,0.58723976265654321 +coef_N_166,-1.6454258220246625,0.3553,0,,,-1.6454258220246625 +coef_N_222_224_444,-0.73749437161692599,-1.3859999999999999,0,,,-0.73749437161692599 +coef_N_246_226_446,-0.85860667485009734,-0.85709999999999997,0,,,-0.85860667485009734 +coef_N_xxxxx,-1.0332955358547824,-3.4529999999999998,0,,,-1.0332955358547824 diff --git a/activitysim/estimation/test/test_larch_estimation/test_cdap_model_loglike.csv b/activitysim/estimation/test/test_larch_estimation/test_cdap_model_loglike.csv index 1e2087566a..dcd755572e 100644 --- a/activitysim/estimation/test/test_larch_estimation/test_cdap_model_loglike.csv +++ b/activitysim/estimation/test/test_larch_estimation/test_cdap_model_loglike.csv @@ -1,2 +1,2 @@ ,loglike_prior,loglike_converge -0,-2477.2003381662212,-2354.458830163564 +0,-2460.72797581254872057,-2335.77494762967762654 diff --git a/activitysim/examples/__init__.py b/activitysim/examples/__init__.py index e69de29bb2..642b315907 100644 --- a/activitysim/examples/__init__.py +++ b/activitysim/examples/__init__.py @@ -0,0 +1,5 @@ +from pathlib import Path + +from activitysim.cli.create import get_example # noqa: F401 + +path = Path(__file__).parent diff --git a/activitysim/examples/example_estimation/configs/logging.yaml b/activitysim/examples/example_estimation/configs/logging.yaml index 9531f40bbe..f4902943d6 100644 --- a/activitysim/examples/example_estimation/configs/logging.yaml +++ b/activitysim/examples/example_estimation/configs/logging.yaml @@ -33,14 +33,16 @@ logging: elogfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['estimation.log'] + filename: + get_log_file_path: 'estimation.log' mode: w formatter: fileFormatter level: NOTSET logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -63,4 +65,3 @@ logging: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git 
a/activitysim/examples/example_estimation/scripts/infer.py b/activitysim/examples/example_estimation/scripts/infer.py index f60b94900f..6b6991992f 100644 --- a/activitysim/examples/example_estimation/scripts/infer.py +++ b/activitysim/examples/example_estimation/scripts/infer.py @@ -10,7 +10,7 @@ import yaml from activitysim.abm.models.util import canonical_ids as cid -from activitysim.abm.models.util import tour_frequency as tf +from activitysim.core import workflow from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -69,7 +69,6 @@ def unmangle_ids(ids): def infer_cdap_activity(persons, tours, joint_tour_participants): - mandatory_tour_types = ["work", "school"] non_mandatory_tour_types = [ "escort", @@ -118,7 +117,6 @@ def infer_cdap_activity(persons, tours, joint_tour_participants): def infer_mandatory_tour_frequency(persons, tours): - num_work_tours = ( tours[tours.tour_type == "work"] .groupby("person_id") @@ -402,9 +400,8 @@ def read_tdd_alts(): return tdds.tdd -def patch_tour_ids(persons, tours, joint_tour_participants): - def set_tour_index(tours, parent_tour_num_col, is_joint): - +def patch_tour_ids(state: workflow.State, persons, tours, joint_tour_participants): + def set_tour_index(state, tours, parent_tour_num_col, is_joint): group_cols = ["person_id", "tour_category", "tour_type"] if "parent_tour_num" in tours: @@ -415,7 +412,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ) return cid.set_tour_index( - tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint + state, tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint ) assert "mandatory_tour_frequency" in persons @@ -426,6 +423,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # mandatory tours ##################### mandatory_tours = set_tour_index( + state, tours[tours.tour_category == "mandatory"], parent_tour_num_col=None, is_joint=False, @@ -450,7 +448,9 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): temp_point_persons.person_id, joint_tours.household_id ) - joint_tours = set_tour_index(joint_tours, parent_tour_num_col=None, is_joint=True) + joint_tours = set_tour_index( + state, joint_tours, parent_tour_num_col=None, is_joint=True + ) joint_tours["person_id"] = joint_tours["cache_point_person_id"] del joint_tours["cache_point_person_id"] @@ -475,6 +475,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ##################### non_mandatory_tours = set_tour_index( + state, tours[tours.tour_category == "non_mandatory"], parent_tour_num_col=None, is_joint=False, @@ -523,7 +524,7 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): ) atwork_tours = set_tour_index( - atwork_tours, parent_tour_num_col="parent_tour_num", is_joint=False + state, atwork_tours, parent_tour_num_col="parent_tour_num", is_joint=False ) del atwork_tours["parent_tour_num"] @@ -565,7 +566,6 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): def infer_atwork_subtour_frequency(configs_dir, tours): - # first column is 'atwork_subtour_frequency' nickname, remaining columns are trip type counts alts = pd.read_csv( os.path.join(configs_dir, "atwork_subtour_frequency_alternatives.csv"), @@ -640,7 +640,7 @@ def infer_atwork_subtour_frequency(configs_dir, tours): return atwork_subtour_frequency -def patch_trip_ids(tours, trips): +def patch_trip_ids(state: workflow.State, tours, trips): """ replace survey trip_ids with asim standard trip_id replace survey tour_id foreign key with asim standard tour_id @@ -672,7 +672,7 @@ def 
patch_trip_ids(tours, trips): + 1 ) - cid.set_trip_index(trips) + cid.set_trip_index(state, trips) assert trips.index.name == ASIM_TRIP_ID trips = trips.reset_index().rename(columns={"trip_id": ASIM_TRIP_ID}) @@ -681,7 +681,6 @@ def patch_trip_ids(tours, trips): def infer_stop_frequency(configs_dir, tours, trips): - # alt,out,in # 0out_0in,0,0 # 0out_1in,0,1 @@ -707,7 +706,6 @@ def infer_stop_frequency(configs_dir, tours, trips): def read_tables(input_dir, tables): - for table, info in tables.items(): table = pd.read_csv( os.path.join(input_dir, info["file_name"]), index_col=info.get("index") @@ -730,7 +728,6 @@ def read_tables(input_dir, tables): def check_controls(table_name, column_name): - table = survey_tables[table_name].get("table") c_table = control_tables[table_name].get("table") @@ -754,8 +751,7 @@ def check_controls(table_name, column_name): return True -def infer(configs_dir, input_dir, output_dir): - +def infer(state: workflow.State, configs_dir, input_dir, output_dir): households, persons, tours, joint_tour_participants, trips = read_tables( input_dir, survey_tables ) @@ -801,7 +797,7 @@ def infer(configs_dir, input_dir, output_dir): # patch_tour_ids tours, joint_tour_participants = patch_tour_ids( - persons, tours, joint_tour_participants + state, persons, tours, joint_tour_participants ) survey_tables["tours"]["table"] = tours survey_tables["joint_tour_participants"]["table"] = joint_tour_participants @@ -864,4 +860,4 @@ def infer(configs_dir, input_dir, output_dir): if apply_controls: read_tables(input_dir, control_tables) -infer(configs_dir, input_dir, output_dir) +infer(state, configs_dir, input_dir, output_dir) diff --git a/activitysim/examples/example_manifest.yaml b/activitysim/examples/example_manifest.yaml index d8fc462156..f2d9174513 100644 --- a/activitysim/examples/example_manifest.yaml +++ b/activitysim/examples/example_manifest.yaml @@ -48,6 +48,10 @@ - prototype_mtc_extended/configs_mp - prototype_mtc_extended/output - prototype_mtc_extended/README.MD + subdirs: + configs_dir: + - configs_extended/configs + - configs - name: prototype_mtc_extended_full description: Prototype MTC example model using data from the full 1475-zone MTC region with 2.8M households and 7.5M persons @@ -126,6 +130,11 @@ data_sf/skims.omx 579d6007266db3b055d0f9e4814004f4d5ccfae27a36e40f4881e3662bc3d3f1 - prototype_mtc/output + subdirs: + configs_dir: + - configs_estimation/configs + - configs + data_dir: data_sf - name: placeholder_2_zone description: 2 zone system test example based on prototype MTC @@ -741,6 +750,12 @@ - placeholder_sandag/configs_3_zone - placeholder_sandag/configs_skip_accessibility - placeholder_sandag/output_3 + subdirs: + configs_dir: + - configs_3_zone + - prototype_mtc/configs + data_dir: data_3 + output_dir: output_3 - name: placeholder_sandag_3_zone_full description: full 3-zone example for the SANDAG region diff --git a/activitysim/examples/external.py b/activitysim/examples/external.py new file mode 100644 index 0000000000..447cbe0ced --- /dev/null +++ b/activitysim/examples/external.py @@ -0,0 +1,334 @@ +""" +Tools to download and use example models from external sources. + +The tools in this module allow for automated access to *external* example models, +which are not necessarily maintained or supported by the ActivitySim Consortium. +These models can be test-sized or full scale representations of models operated +by various agencies, and can contain thousands of zones and/or millions of simulated +households. 
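+
+As an illustrative sketch (the working directory below is arbitrary, and the
+example name must appear in ``external_example_manifest.yaml``), a registered
+example can be installed into a working directory with::
+
+    from activitysim.examples.external import registered_external_example
+
+    model_dir = registered_external_example("prototype_mtc", working_dir="/tmp/asim")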
+""" + +from __future__ import annotations + +import logging +import os +import tarfile +import zipfile +from pathlib import Path + +import platformdirs +import yaml + +from activitysim.cli.create import download_asset + +logger = logging.getLogger(__name__) + + +def registered_external_example( + name: str, working_dir: Path, registry: Path | None = None +) -> Path: + """ + Download a registered external example and copy into a working directory. + + Parameters + ---------- + name : str + The unique name for the registered external example. See + `activitysim/examples/external_example_manifest.yaml` or run + `list_registered_examples()` for the names of the built-in registered + examples. + working_dir : path-like + The location to install the external example. + registry : path-like, optional + Provide the file location of an alternative example registry. This + should be a yaml file with information about the location of examples. + When not provided, the default external example registry is used, + which is found at `activitysim/examples/external_example_manifest.yaml`. + + Returns + ------- + Path + The location where the example was installed, generally a subdirectory + of `working_dir`. + """ + if registry is None: + registry = Path(__file__).parent.joinpath("external_example_manifest.yaml") + with open(registry) as eem: + registered_examples = yaml.load(eem, yaml.SafeLoader) + if name not in registered_examples: + raise KeyError(f"{name!r} is not a registered external example") + if "name" not in registered_examples[name]: + registered_examples[name]["name"] = name + return download_external_example( + working_dir, + **registered_examples[name], + ) + + +def list_registered_examples(registry: Path | None = None) -> list[str]: + """ + Read a list of registered example names. + + Parameters + ---------- + registry : path-like, optional + Provide the file location of an alternative example registry. This + should be a yaml file with information about the location of examples. + When not provided, the default external example registry is used, + which is found at `activitysim/examples/external_example_manifest.yaml`. + + Returns + ------- + list[str] + """ + if registry is None: + registry = Path(__file__).parent.joinpath("external_example_manifest.yaml") + with open(registry) as eem: + registered_examples = yaml.load(eem, yaml.SafeLoader) + return list(registered_examples.keys()) + + +def exercise_external_example( + name: str, + working_dir: Path, + maxfail: int = None, + verbose: int = 2, + durations: int = 0, + registry: Path | None = None, +) -> int: + """ + Use pytest to ensure that an external example is functioning correctly. + + Parameters + ---------- + name : str + The unique name for the registered external example. See + `activitysim/examples/external_example_manifest.yaml` or run + `list_registered_examples()` for the names of the registered examples. + working_dir : path-like + The location to install a copy of the external example for testing. + maxfail : int, optional + Stop testing after this many failures have been detected. + verbose : int, default 2 + Verbosity level given to pytest. + durations : int, default 0 + Report the durations of this many of the slowest tests conducted. + Leave as 0 to report all durations, or set to None to report no + durations. + + Returns + ------- + int + The result code returned by pytest. 
+ """ + try: + directory = registered_external_example(name, working_dir, registry) + except Exception as err: + logger.exception(err) + raise + import pytest + + args = [] + if verbose: + args.append("-" + "v" * verbose) + if maxfail: + args.append(f"--maxfail={int(maxfail)}") + if durations is not None: + args.append(f"--durations={int(durations)}") + args.append(os.path.relpath(os.path.normpath(os.path.realpath(directory)))) + return pytest.main(args) + + +def _run_tests_on_example(name): + import tempfile + + tempdir = tempfile.TemporaryDirectory() + resultcode = exercise_external_example(name, tempdir.name) + return resultcode + + +def default_cache_dir() -> Path: + """ + Get the default external example cache directory. + + Returns + ------- + Path + """ + return Path(platformdirs.user_cache_dir(appname="ActivitySim")).joinpath( + "External-Examples" + ) + + +def _decompress_archive(archive_path: Path, target_location: Path): + # decompress archive file into working directory + if archive_path.suffixes[-2:] == [".tar", ".gz"]: + with tarfile.open(archive_path) as tfile: + common_prefix = os.path.commonprefix(tfile.getnames()) + if common_prefix in {"", ".", "./", None}: + working_dir = target_location + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + else: + working_subdir = target_location.joinpath(common_prefix) + tfile.extractall(working_dir) + elif archive_path.suffixes[-2:] == [".tar", ".zst"]: + working_dir = target_location + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + from sharrow.utils.tar_zst import extract_zst + + extract_zst(archive_path, working_dir) + elif archive_path.suffix == ".zip": + with zipfile.ZipFile(archive_path, "r") as zf: + common_prefix = os.path.commonprefix(zf.namelist()) + if common_prefix in {"", ".", "./", None}: + working_dir = target_location + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + else: + working_subdir = target_location.joinpath(common_prefix) + zf.extractall(working_dir) + else: + raise ValueError(f"unknown archive file type {''.join(archive_path.suffixes)}") + return working_subdir + + +def download_external_example( + working_dir: Path, + url: str | None = None, + cache_dir: Path | None = None, + cache_file_name: str | None = None, + sha256: str | None = None, + name=None, + assets: dict = None, + link_assets: bool = True, +) -> Path: + """ + Download an external example. + + Parameters + ---------- + working_dir : Path + The working directory where the external example files will be installed. + The `name` subdirectory of this directory will be created if it does not + exist, and downloaded files will be installed there. + url : str, optional + The main url for the example to download. This should point to an + archive file (e.g. blah.tar.gz) that will be unpacked into the target + working subdirectory. + cache_dir : Path, optional + The compressed archive(s) will be downloaded and cached in this + directory. If not provided, a suitable cache location is chosen based + on the suggested user cache locations from platformdirs library. + cache_file_name : str, optional + The archive at the primary url will be cached with this filename. It + is typically not necessary to provide this file name explicitly, as a + file name will be generated automatically from the url if not given. + sha256 : str, optional + This checksum is used to validate the download and/or the cached + archive file. 
If the cached file exists but the checksum does not match, + the file will be re-downloaded. + name : str, optional + The name of the external example. This will become the working + subdirectory name where files are installed, unless the main archive + has an embedded name (as a common prefix) in which case the name is + ignored. + assets : dict, optional + Instructions for additional files to be downloaded to support this + external example (e.g. large data files not in the main archive). + link_assets : bool, default True + If set to True, this function will attempt to symlink assets from the + cache into the target directory instead of copying them. This can + save disk space when the same external example is installed multiple + times. + + Returns + ------- + Path + The working subdirectory name where files are installed. + """ + # set up cache dir + if cache_dir is None: + cache_dir = default_cache_dir() + else: + cache_dir = Path(cache_dir) + if name: + cache_dir = cache_dir.joinpath(name) + cache_dir.mkdir(parents=True, exist_ok=True) + + working_dir = Path(working_dir) + working_dir.mkdir(parents=True, exist_ok=True) + common_prefix = "." + + if url: + # check if target file exists in cache dir + if cache_file_name is None: + cache_file_name = url + if cache_file_name.startswith("https://github.com/"): + cache_file_name = cache_file_name.replace("https://github.com/", "") + if "//" in cache_file_name: + cache_file_name = cache_file_name.split("//", 1)[1] + cache_file_name = cache_file_name.replace("/", "_").replace("\\", "_") + + target_path = cache_dir.joinpath(cache_file_name) + + download_asset(url, target_path, sha256, link=False) + + # decompress cache file into working directory + if target_path.suffixes[-2:] == [".tar", ".gz"]: + with tarfile.open(target_path) as tfile: + common_prefix = os.path.commonprefix(tfile.getnames()) + if name is not None and common_prefix in {"", ".", "./", None}: + common_prefix = name + working_dir = working_dir.joinpath(name) + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + else: + working_subdir = working_dir.joinpath(common_prefix) + tfile.extractall(working_dir) + elif target_path.suffixes[-2:] == [".tar", ".zst"]: + working_dir = working_dir.joinpath(name) + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + from sharrow.utils.tar_zst import extract_zst + + extract_zst(target_path, working_dir) + elif target_path.suffix == ".zip": + with zipfile.ZipFile(target_path, "r") as zf: + common_prefix = os.path.commonprefix(zf.namelist()) + if name is not None and common_prefix in {"", ".", "./", None}: + common_prefix = name + working_dir = working_dir.joinpath(name) + working_dir.mkdir(parents=True, exist_ok=True) + working_subdir = working_dir + else: + working_subdir = working_dir.joinpath(common_prefix) + zf.extractall(working_dir) + else: + raise ValueError( + f"unknown archive file type {''.join(target_path.suffixes)}" + ) + + # download assets if any: + if assets: + for asset_name, asset_info in assets.items(): + if link_assets or asset_info.get("unpack", False): + asset_target_path = working_subdir.joinpath(asset_name) + download_asset( + asset_info.get("url"), + asset_target_path, + sha256=asset_info.get("sha256", "deadbeef"), + link=cache_dir, + base_path=working_subdir, + unpack=asset_info.get("unpack"), + ) + else: + # TODO should cache and copy, this just downloads to new locations + download_asset( + asset_info.get("url"), + working_subdir.joinpath(asset_name), + 
sha256=asset_info.get("sha256", "deadbeef"), + ) + + return working_subdir diff --git a/activitysim/examples/external_example_manifest.yaml b/activitysim/examples/external_example_manifest.yaml new file mode 100644 index 0000000000..0be2706364 --- /dev/null +++ b/activitysim/examples/external_example_manifest.yaml @@ -0,0 +1,43 @@ +# +# Registered External Examples +# +# +# : -- a unique name for the example +# url: -- the public url for the archive +# sha256: -- the checksum for the archive +# assets: -- additional files for testing (optional) +# : -- filename for asset relative to common root +# url: -- url for each extra file +# sha256: -- the checksum for extra file +# unpack: -- (optional) decompress asset archive file here +# + +prototype_mtc: + url: https://github.com/jpn--/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz + sha256: ec53c6e72da1444bd5808de8c644cea75db284dfcc419b776575ba532b3ccb87 + assets: + test/prototype_mtc_reference_pipeline.zip: + url: https://github.com/jpn--/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip + sha256: 394e5b403d4c61d5214493cefe161432db840ba4967c23c999d914178d43a1f0 + +estimation_example: + url: https://github.com/ActivitySim/activitysim-estimation-example/archive/refs/tags/v0.0.2.tar.gz + sha256: 88c8208ee250a20e7d77036d77bc71122f21cbfeaba1eaf3b644120799f9d023 + assets: + data_sf.tar.zst: + url: https://github.com/ActivitySim/activitysim-estimation-example/releases/download/v0.0.2/data_sf.tar.zst + sha256: e0775ee2211367e25de541fa210350009d7ac142d49deba41c5e154fc908146e + unpack: data_sf + +legacy_mtc: + url: https://github.com/camsys/legacy_mtc_tm2_data/releases/download/v1.2/legacy-mtc.tar.zst + sha256: b62b462eb5178d01e61506274272508f163e3921777f34ba9d04fdb2c4be0fdf + +legacy_mtc_skims: + url: https://github.com/camsys/legacy_mtc_tm2_data/releases/download/v1.2/skims.zarr.tar.zst + sha256: 8c5ae22ce28b52bb5633627986f114b88df7fee27a42931a2aa0e454f0dd84b9 + name: legacy_mtc/skims.zarr + +prototype_psrc_in_development: + url: https://github.com/jpn--/psrc_activitysim/archive/refs/tags/v23.06.21.tar.gz + sha256: 2241816746559cd13d125e59a80deb1bf144519fec41383c891ff1fe28d7b5a2 diff --git a/activitysim/examples/placeholder_multiple_zone/configs_3_zone_marin/logging.yaml b/activitysim/examples/placeholder_multiple_zone/configs_3_zone_marin/logging.yaml index df20cf0c7e..9addd30706 100755 --- a/activitysim/examples/placeholder_multiple_zone/configs_3_zone_marin/logging.yaml +++ b/activitysim/examples/placeholder_multiple_zone/configs_3_zone_marin/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -51,4 +52,3 @@ logging: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/examples/placeholder_multiple_zone/test/output/.gitignore b/activitysim/examples/placeholder_multiple_zone/test/output/.gitignore index bf5bf15e3e..31230d824c 100644 --- a/activitysim/examples/placeholder_multiple_zone/test/output/.gitignore +++ b/activitysim/examples/placeholder_multiple_zone/test/output/.gitignore @@ -1,3 +1,4 @@ +** *.csv *.log *.prof diff --git a/activitysim/examples/placeholder_multiple_zone/test/reference_pipeline_2_zone.zip 
b/activitysim/examples/placeholder_multiple_zone/test/reference_pipeline_2_zone.zip new file mode 100644 index 0000000000..a41d46f00f Binary files /dev/null and b/activitysim/examples/placeholder_multiple_zone/test/reference_pipeline_2_zone.zip differ diff --git a/activitysim/examples/placeholder_multiple_zone/test/reference_trace_2_zone.tar.gz b/activitysim/examples/placeholder_multiple_zone/test/reference_trace_2_zone.tar.gz new file mode 100644 index 0000000000..3a92c473ea Binary files /dev/null and b/activitysim/examples/placeholder_multiple_zone/test/reference_trace_2_zone.tar.gz differ diff --git a/activitysim/examples/placeholder_multiple_zone/test/regress/.gitignore b/activitysim/examples/placeholder_multiple_zone/test/regress/.gitignore new file mode 100644 index 0000000000..c5200d0c42 --- /dev/null +++ b/activitysim/examples/placeholder_multiple_zone/test/regress/.gitignore @@ -0,0 +1 @@ +*_last_run.csv diff --git a/activitysim/examples/placeholder_multiple_zone/test/test_multiple_zone.py b/activitysim/examples/placeholder_multiple_zone/test/test_multiple_zone.py index 40d4a41a06..f0c9e1587f 100644 --- a/activitysim/examples/placeholder_multiple_zone/test/test_multiple_zone.py +++ b/activitysim/examples/placeholder_multiple_zone/test/test_multiple_zone.py @@ -1,20 +1,17 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. import os import subprocess import sys +from pathlib import Path import pandas as pd -import pandas.testing as pdt import pkg_resources import pytest -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import test, workflow def example_path(dirname): @@ -57,7 +54,9 @@ def regress(zone): test_path(f"regress/final_tours_{zone}_zone_last_run.csv"), index=False ) print("regress tours") - pdt.assert_frame_equal(tours_df, regress_tours_df, rtol=1e-03) + test.assert_frame_substantively_equal( + tours_df, regress_tours_df, rtol=1e-03, check_dtype=False + ) # regress trips regress_trips_df = pd.read_csv( @@ -68,7 +67,9 @@ def regress(zone): test_path(f"regress/final_trips_{zone}_zone_last_run.csv"), index=False ) print("regress trips") - pdt.assert_frame_equal(trips_df, regress_trips_df, rtol=1e-03) + test.assert_frame_substantively_equal( + trips_df, regress_trips_df, rtol=1e-03, check_dtype=False + ) file_path = os.path.join(os.path.dirname(__file__), "simulation.py") @@ -116,6 +117,93 @@ def test_3_zone_mp(data): run_test(zone="3", multiprocess=True) +EXPECTED_MODELS = [ + "initialize_landuse", + "initialize_households", + "compute_accessibility", + "school_location", + "workplace_location", + "auto_ownership_simulate", + "free_parking", + "cdap_simulate", + "mandatory_tour_frequency", + "mandatory_tour_scheduling", + "joint_tour_frequency", + "joint_tour_composition", + "joint_tour_participation", + "joint_tour_destination", + "joint_tour_scheduling", + "non_mandatory_tour_frequency", + "non_mandatory_tour_destination", + "non_mandatory_tour_scheduling", + "tour_mode_choice_simulate", + "atwork_subtour_frequency", + "atwork_subtour_destination", + "atwork_subtour_scheduling", + "atwork_subtour_mode_choice", + "stop_frequency", + "trip_purpose", + "trip_destination", + "trip_purpose_and_destination", + "trip_scheduling", + "trip_mode_choice", + "write_data_dictionary", + "track_skim_usage", + "write_trip_matrices", + "write_tables", + "summarize", +] + + +@test.run_if_exists("reference_pipeline_2_zone.zip") +def 
test_multizone_progressive(zone="2"): + + zone = str(zone) + + import activitysim.abm # register components + + def test_path(dirname): + return os.path.join(os.path.dirname(__file__), dirname) + + if zone == "3": + settings_file_name = "settings_static.yaml" + else: + settings_file_name = "settings.yaml" + + state = workflow.State.make_default( + configs_dir=( + test_path(f"configs_{zone}_zone"), + example_path(f"configs_{zone}_zone"), + mtc_example_path("configs"), + ), + data_dir=(example_path(f"data_{zone}"),), + output_dir=test_path("output"), + settings_file_name=settings_file_name, + ) + + assert state.settings.models == EXPECTED_MODELS + assert state.settings.chunk_size == 0 + assert state.settings.sharrow == False + + state.settings.trace_hh_id = 1099626 + state.tracing.validation_directory = ( + Path(__file__).parent / "reference_trace_2_zone" + ) + + for step_name in EXPECTED_MODELS: + state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent.joinpath("reference_pipeline_2_zone.zip"), + checkpoint_name=step_name, + ) + except Exception: + print(f"> {zone} zone {step_name}: ERROR") + raise + else: + print(f"> {zone} zone {step_name}: ok") + + if __name__ == "__main__": build_data() diff --git a/activitysim/examples/placeholder_psrc/configs/logging.yaml b/activitysim/examples/placeholder_psrc/configs/logging.yaml index 46838a7a2a..2921a73d27 100755 --- a/activitysim/examples/placeholder_psrc/configs/logging.yaml +++ b/activitysim/examples/placeholder_psrc/configs/logging.yaml @@ -40,7 +40,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/placeholder_psrc/configs/settings_mp.yaml b/activitysim/examples/placeholder_psrc/configs/settings_mp.yaml index 5a95144a4c..c43f78b65e 100644 --- a/activitysim/examples/placeholder_psrc/configs/settings_mp.yaml +++ b/activitysim/examples/placeholder_psrc/configs/settings_mp.yaml @@ -27,7 +27,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_households begin: school_location slice: @@ -50,4 +50,3 @@ output_tables: - persons - tours - trips - diff --git a/activitysim/examples/placeholder_psrc/configs_accessibility/settings_mp.yaml b/activitysim/examples/placeholder_psrc/configs_accessibility/settings_mp.yaml index fca3c67cbf..e1aad2b430 100644 --- a/activitysim/examples/placeholder_psrc/configs_accessibility/settings_mp.yaml +++ b/activitysim/examples/placeholder_psrc/configs_accessibility/settings_mp.yaml @@ -35,7 +35,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_summarize begin: write_data_dictionary @@ -48,4 +48,3 @@ output_tables: - checkpoints - accessibility - land_use - diff --git a/activitysim/examples/placeholder_psrc/test/test_psrc.py b/activitysim/examples/placeholder_psrc/test/test_psrc.py index b265d7d5f9..3d30d1ea3c 100644 --- a/activitysim/examples/placeholder_psrc/test/test_psrc.py +++ b/activitysim/examples/placeholder_psrc/test/test_psrc.py @@ -1,25 +1,30 @@ +from __future__ import annotations + +import atexit +import importlib.resources + # ActivitySim # See full license in LICENSE.txt. 
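+ # Note: example_path() below resolves packaged example files with
+ # importlib.resources.as_file (replacing the pkg_resources.resource_filename
+ # lookup used previously); as_file may materialize the resource at a
+ # temporary path, so the ExitStack is registered with atexit to keep that
+ # path alive for the whole test session.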
import os import subprocess import sys +from contextlib import ExitStack import pandas as pd -import pandas.testing as pdt -import pkg_resources - -from activitysim.core import inject +import pytest - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import workflow +from activitysim.core.test import assert_frame_substantively_equal def _test_psrc(sharrow=False): def example_path(dirname): - resource = os.path.join("examples", "placeholder_psrc", dirname) - return pkg_resources.resource_filename("activitysim", resource) + file_manager = ExitStack() + atexit.register(file_manager.close) + ref = importlib.resources.files("activitysim").joinpath( + "examples", "placeholder_psrc", dirname + ) + return file_manager.enter_context(importlib.resources.as_file(ref)) def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) @@ -31,7 +36,7 @@ def regress(): # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + assert_frame_substantively_equal(final_trips_df, regress_trips_df) file_path = os.path.join(os.path.dirname(__file__), "simulation.py") @@ -67,6 +72,37 @@ def test_psrc_sharrow(): _test_psrc(sharrow=True) -if __name__ == "__main__": +@pytest.mark.parametrize("use_sharrow", [False, True]) +def test_psrc_no_taz_input(use_sharrow): + import activitysim.abm # noqa: F401 + + configs_dir = ("configs", "../configs") + if use_sharrow: + configs_dir = ("configs_sharrow",) + configs_dir + + state = workflow.State.make_default( + working_dir=__file__, + configs_dir=configs_dir, + data_dir=("../data",), + output_dir="output_no_taz", + ) + + # strip out land_use_taz from input_table_list + given = state.settings.input_table_list + state.settings.input_table_list = [ + i for i in given if i.tablename != "land_use_taz" + ] + state.run.all() + + def test_path(dirname): + return os.path.join(os.path.dirname(__file__), dirname) + + regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) + final_trips_df = pd.read_csv(test_path("output_no_taz/final_trips.csv")) + + assert_frame_substantively_equal(final_trips_df, regress_trips_df) + + +if __name__ == "__main__": test_psrc() diff --git a/activitysim/examples/placeholder_sandag/3_zone_change_log.txt b/activitysim/examples/placeholder_sandag/3_zone_change_log.txt index 39354c3ee7..4bc750e9d6 100644 --- a/activitysim/examples/placeholder_sandag/3_zone_change_log.txt +++ b/activitysim/examples/placeholder_sandag/3_zone_change_log.txt @@ -23,7 +23,7 @@ multiprocess_steps: # except: # - land_use # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True # global switch to turn on or off presampling of destination alternatives at TAZ level (multizone models only) @@ -87,7 +87,3 @@ maz_to_tap: # # not trimming because drive_maz_tap utility calculations take into account both drive and walk time and cost # # though some sort of trimming appears to have been done as there are not so many of these in marin data # #tap_line_distance_col: DDIST - - - - diff --git a/activitysim/examples/placeholder_sandag/configs_1_zone/logging.yaml b/activitysim/examples/placeholder_sandag/configs_1_zone/logging.yaml index 13e533abba..779a4cbab9 100644 --- a/activitysim/examples/placeholder_sandag/configs_1_zone/logging.yaml +++ 
b/activitysim/examples/placeholder_sandag/configs_1_zone/logging.yaml @@ -40,7 +40,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -48,7 +49,9 @@ logging: console: class: logging.StreamHandler stream: ext://sys.stdout - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatter: elapsedFormatter formatters: diff --git a/activitysim/examples/placeholder_sandag/configs_3_zone/logging.yaml b/activitysim/examples/placeholder_sandag/configs_3_zone/logging.yaml index 93cf6cea93..0a7df5d478 100644 --- a/activitysim/examples/placeholder_sandag/configs_3_zone/logging.yaml +++ b/activitysim/examples/placeholder_sandag/configs_3_zone/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/placeholder_sandag/test/configs_3_zone/settings.yaml b/activitysim/examples/placeholder_sandag/test/configs_3_zone/settings.yaml index 9f4e768b6a..86df7817d5 100644 --- a/activitysim/examples/placeholder_sandag/test/configs_3_zone/settings.yaml +++ b/activitysim/examples/placeholder_sandag/test/configs_3_zone/settings.yaml @@ -27,3 +27,5 @@ output_tables: tables: - trips - tours + +recode_pipeline_columns: False diff --git a/activitysim/examples/placeholder_sandag/test/output_1/.gitignore b/activitysim/examples/placeholder_sandag/test/output_1/.gitignore index bf5bf15e3e..5252eb1129 100644 --- a/activitysim/examples/placeholder_sandag/test/output_1/.gitignore +++ b/activitysim/examples/placeholder_sandag/test/output_1/.gitignore @@ -5,3 +5,4 @@ *.txt *.yaml *.omx +*.parquet diff --git a/activitysim/examples/placeholder_sandag/test/output_1/cache/.gitignore b/activitysim/examples/placeholder_sandag/test/output_1/cache/.gitignore index 3dd2e62f9e..46968a4f0c 100644 --- a/activitysim/examples/placeholder_sandag/test/output_1/cache/.gitignore +++ b/activitysim/examples/placeholder_sandag/test/output_1/cache/.gitignore @@ -1,2 +1,3 @@ *.mmap *.feather +** diff --git a/activitysim/examples/placeholder_sandag/test/output_3/.gitignore b/activitysim/examples/placeholder_sandag/test/output_3/.gitignore index bf5bf15e3e..5252eb1129 100644 --- a/activitysim/examples/placeholder_sandag/test/output_3/.gitignore +++ b/activitysim/examples/placeholder_sandag/test/output_3/.gitignore @@ -5,3 +5,4 @@ *.txt *.yaml *.omx +*.parquet diff --git a/activitysim/examples/placeholder_sandag/test/output_3/cache/.gitignore b/activitysim/examples/placeholder_sandag/test/output_3/cache/.gitignore index 3dd2e62f9e..46968a4f0c 100644 --- a/activitysim/examples/placeholder_sandag/test/output_3/cache/.gitignore +++ b/activitysim/examples/placeholder_sandag/test/output_3/cache/.gitignore @@ -1,2 +1,3 @@ *.mmap *.feather +** diff --git a/activitysim/examples/placeholder_sandag/test/placeholder_sandag_3_zone_reference_pipeline.zip b/activitysim/examples/placeholder_sandag/test/placeholder_sandag_3_zone_reference_pipeline.zip new file mode 100644 index 0000000000..4964267eb3 Binary files /dev/null and b/activitysim/examples/placeholder_sandag/test/placeholder_sandag_3_zone_reference_pipeline.zip differ diff --git 
a/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_tours.csv b/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_tours.csv index a27174eec9..f2404fb783 100644 --- a/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_tours.csv +++ b/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_tours.csv @@ -1,78 +1,79 @@ -tour_id,person_id,tour_type,tour_type_count,tour_type_num,tour_num,tour_count,tour_category,number_of_participants,destination,origin,household_id,tdd,start,end,duration,composition,destination_logsum,tour_mode,mode_choice_logsum,od_atap,od_btap,od_path_set,do_atap,do_btap,do_path_set,atwork_subtour_frequency,parent_tour_id,stop_frequency,primary_purpose -1359025,33146,work,1,1,1,1,mandatory,1,608.0,68.0,12593,12.0,5.0,17.0,12.0,,,DRIVEALONEFREE,1.0261372219238083,,,,,,,no_subtours,,2out_0in,work -1359066,33147,work,1,1,1,1,mandatory,1,578.0,68.0,12593,61.0,8.0,15.0,7.0,,,WALK_TRANSIT,2.96683804953191,1500.0,1558.0,fastest,1558.0,1500.0,fastest,no_subtours,,0out_1in,work -1494647,36454,shopping,2,1,1,2,non_mandatory,1,233.0,88.0,13797,128.0,13.0,17.0,4.0,,11.594151670549255,DRIVEALONEFREE,0.054207142123309696,,,,,,,,,0out_1in,shopping -1494648,36454,shopping,2,2,2,2,non_mandatory,1,237.0,88.0,13797,169.0,18.0,18.0,0.0,,11.643159811140553,DRIVEALONEFREE,0.28212638921439376,,,,,,,,,1out_0in,shopping -1494680,36455,othdiscr,1,1,1,1,non_mandatory,1,131.0,88.0,13797,159.0,16.0,21.0,5.0,,13.153011009083952,DRIVEALONEFREE,0.527847608692733,,,,,,,,,0out_2in,othdiscr -1494694,36455,work,1,1,1,1,mandatory,1,70.0,88.0,13797,58.0,8.0,12.0,4.0,,,WALK,0.9189166731374394,,,,,,,no_subtours,,0out_0in,work -1709911,41705,eatout,1,1,1,1,non_mandatory,1,578.0,153.0,15777,76.0,9.0,15.0,6.0,,12.194280696346672,WALK_TRANSIT,0.4629895150090414,1500.0,1611.0,fastest,1611.0,1500.0,fastest,,,0out_0in,eatout -1709950,41706,eat,1,1,1,1,atwork,1,497.0,362.0,15777,112.0,12.0,12.0,0.0,,18.60610008578578,WALK_TRANSIT,-0.8459966967797716,1666.0,1662.0,shortest,1662.0,1666.0,shortest,,1709985.0,0out_0in,atwork -1709985,41706,work,1,1,1,1,mandatory,1,362.0,153.0,15777,65.0,8.0,19.0,11.0,,,SHARED2FREE,0.6600577445845757,,,,,,,eat,,0out_1in,work -2051448,50035,eatout,1,1,1,1,joint,3,578.0,265.0,18261,156.0,16.0,18.0,2.0,mixed,11.830383451121103,SHARED3FREE,-5.832306485220664,,,,,,,,,1out_0in,eatout -2051466,50035,school,1,1,1,1,mandatory,1,427.0,265.0,18261,43.0,7.0,13.0,6.0,,,WALK_TRANSIT,1.2158966517539807,1618.0,1748.0,shortest,1748.0,1618.0,shortest,,,0out_0in,school -2051468,50035,shopping,1,1,1,1,non_mandatory,1,504.0,265.0,18261,175.0,19.0,19.0,0.0,,11.561744392977129,DRIVEALONEFREE,-0.18465816333379353,,,,,,,,,0out_0in,shopping -2051504,50036,othmaint,1,1,1,1,non_mandatory,1,385.0,265.0,18261,44.0,7.0,14.0,7.0,,12.388911839471218,SHARED2FREE,0.07492591448018238,,,,,,,,,0out_0in,othmaint -2051556,50037,work,1,1,1,1,mandatory,1,578.0,265.0,18261,46.0,7.0,16.0,9.0,,,WALK_TRANSIT,1.2858135302346076,1500.0,1651.0,shortest,1651.0,1500.0,fastest,no_subtours,,0out_0in,work -2268889,55338,school,1,1,1,1,mandatory,1,207.0,302.0,19758,40.0,7.0,10.0,3.0,,,WALK_TRANSIT,1.4625238798878268,1608.0,1652.0,cheapest,1652.0,1608.0,shortest,,,0out_0in,school -2268938,55339,work,1,1,1,1,mandatory,1,254.0,302.0,19758,11.0,5.0,16.0,11.0,,,DRIVEALONEFREE,0.899253309377072,,,,,,,no_subtours,,0out_0in,work 
-2373816,57897,work,1,1,1,1,mandatory,1,578.0,337.0,20552,50.0,7.0,20.0,13.0,,,WALK_TRANSIT,1.8152536207268481,1500.0,1754.0,fastest,1754.0,1500.0,fastest,no_subtours,,0out_0in,work -2373818,57898,business,1,1,1,1,atwork,1,456.0,456.0,20552,101.0,11.0,13.0,2.0,,10.851045587482702,WALK,0.8933720529791014,,,,,,,,2373857.0,1out_0in,atwork -2373857,57898,work,1,1,1,1,mandatory,1,456.0,337.0,20552,105.0,11.0,17.0,6.0,,,DRIVEALONEFREE,0.22661990599461826,,,,,,,business1,,0out_1in,work -2373898,57899,work,1,1,1,1,mandatory,1,195.0,337.0,20552,47.0,7.0,17.0,10.0,,,WALK,1.4775079963774336,,,,,,,no_subtours,,0out_0in,work -2373980,57901,work,2,1,1,2,mandatory,1,216.0,337.0,20552,30.0,6.0,17.0,11.0,,,DRIVEALONEFREE,0.5418945770189671,,,,,,,no_subtours,,0out_0in,work -2373981,57901,work,2,2,2,2,mandatory,1,216.0,337.0,20552,175.0,19.0,19.0,0.0,,,SHARED2FREE,0.7907156266134642,,,,,,,no_subtours,,1out_0in,work -2563802,62531,school,1,1,1,1,mandatory,1,442.0,408.0,21869,180.0,20.0,20.0,0.0,,,WALK_TRANSIT,0.9438904813360477,1682.0,1633.0,shortest,1633.0,1682.0,fastest,,,0out_0in,school -2563821,62532,escort,1,1,1,1,non_mandatory,1,155.0,408.0,21869,20.0,6.0,7.0,1.0,,12.523318372217064,SHARED2FREE,-1.2299200748899615,,,,,,,,,0out_0in,escort -2563862,62533,escort,3,1,1,4,non_mandatory,1,203.0,408.0,21869,1.0,5.0,6.0,1.0,,12.561374920193618,SHARED3FREE,-1.221458591960288,,,,,,,,,0out_3in,escort -2563863,62533,escort,3,2,2,4,non_mandatory,1,26.0,408.0,21869,99.0,11.0,11.0,0.0,,12.495641230564226,SHARED3FREE,-1.66067049305692,,,,,,,,,0out_0in,escort -2563864,62533,escort,3,3,3,4,non_mandatory,1,352.0,408.0,21869,135.0,14.0,14.0,0.0,,12.525318452376542,SHARED2FREE,-1.4032965315757244,,,,,,,,,0out_0in,escort -2563878,62533,othdiscr,1,1,4,4,non_mandatory,1,578.0,408.0,21869,100.0,11.0,12.0,1.0,,13.255455507967595,WALK_TRANSIT,1.0602578337312834,1500.0,73.0,fastest,73.0,1500.0,shortest,,,0out_0in,othdiscr -2563925,62534,school,1,1,1,1,mandatory,1,301.0,408.0,21869,55.0,8.0,9.0,1.0,,,WALK_TRANSIT,-0.605105363231441,1748.0,73.0,shortest,73.0,1748.0,fastest,,,0out_0in,school -2787968,67999,escort,1,1,1,2,non_mandatory,1,269.0,481.0,23619,124.0,13.0,13.0,0.0,,12.995994495540678,SHARED3FREE,-0.6687154899908945,,,,,,,,,0out_2in,escort -2787995,67999,social,1,1,2,2,non_mandatory,1,496.0,481.0,23619,165.0,17.0,20.0,3.0,,12.759866800729268,WALK,1.4266623253551904,,,,,,,,,0out_0in,social -2788039,68000,work,1,1,1,1,mandatory,1,578.0,481.0,23619,51.0,7.0,21.0,14.0,,,WALK_TRANSIT,1.4899770411640134,1500.0,1588.0,shortest,1588.0,1500.0,shortest,no_subtours,,1out_0in,work -3238088,78977,school,1,1,1,1,mandatory,1,492.0,589.0,26897,44.0,7.0,14.0,7.0,,,WALK_TRANSIT,0.8204920756912042,1618.0,1584.0,shortest,1584.0,1618.0,cheapest,,,0out_0in,school -3238143,78979,eat,1,1,1,1,atwork,1,578.0,405.0,26897,72.0,9.0,11.0,2.0,,18.599435534464497,DRIVEALONEFREE,-0.2744746714261923,,,,,,,,3238178.0,0out_0in,atwork -3238178,78979,work,1,1,1,1,mandatory,1,405.0,589.0,26897,48.0,7.0,18.0,11.0,,,DRIVEALONEFREE,0.5976010496480395,,,,,,,eat,,0out_0in,work -52627721,1283602,work,1,1,1,1,mandatory,1,586.0,29.0,435012,64.0,8.0,18.0,10.0,,,WALK_TRANSIT,5.710266024459359,1584.0,1238.0,shortest,1608.0,1584.0,fastest,no_subtours,,0out_0in,work -52638588,1283868,business,1,1,1,1,atwork,1,578.0,578.0,435278,112.0,12.0,12.0,0.0,,19.462265786986954,WALK,0.6243659427953904,,,,,,,,52638627.0,0out_0in,atwork 
-52638594,1283868,eatout,1,1,1,1,non_mandatory,1,578.0,45.0,435278,172.0,18.0,21.0,3.0,,11.758619181788253,WALK_TRANSIT,0.6406085296048486,1500.0,1558.0,fastest,1558.0,1500.0,shortest,,,0out_1in,eatout -52638627,1283868,work,1,1,1,1,mandatory,1,578.0,45.0,435278,79.0,9.0,18.0,9.0,,,WALK_TRANSIT,1.206681423660966,1500.0,1558.0,shortest,1558.0,1500.0,shortest,business1,,0out_0in,work -52641825,1283946,work,1,1,1,1,mandatory,1,578.0,31.0,435356,49.0,7.0,19.0,12.0,,,WALK_TRANSIT,1.1978831616543206,1500.0,1604.0,cheapest,1604.0,1500.0,shortest,no_subtours,,0out_0in,work -52668557,1284598,work,1,1,1,1,mandatory,1,61.0,70.0,436008,80.0,9.0,19.0,10.0,,,DRIVEALONEFREE,0.7713829364314543,,,,,,,no_subtours,,0out_0in,work -52734819,1286215,eat,1,1,1,1,atwork,1,585.0,114.0,437625,88.0,10.0,13.0,3.0,,17.98372758878566,WALK_TRANSIT,-0.8798916104770301,1500.0,1562.0,shortest,1562.0,1500.0,cheapest,,52734854.0,0out_0in,atwork -52734854,1286215,work,1,1,1,1,mandatory,1,114.0,164.0,437625,65.0,8.0,19.0,11.0,,,DRIVEALONEFREE,0.8500450570238016,,,,,,,eat,,0out_2in,work -52897544,1290184,business,1,1,1,1,atwork,1,240.0,485.0,441594,99.0,11.0,11.0,0.0,,18.649706295366613,WALK,0.7361914154560637,,,,,,,,52897583.0,0out_0in,atwork -52897550,1290184,eatout,1,1,4,4,non_mandatory,1,347.0,604.0,441594,184.0,21.0,21.0,0.0,,11.9769068256242,SHARED2FREE,-0.10785441309565102,,,,,,,,,0out_1in,eatout -52897569,1290184,othdiscr,3,1,1,4,non_mandatory,1,22.0,604.0,441594,2.0,5.0,7.0,2.0,,13.340592587029276,DRIVEALONEFREE,0.9729989937926249,,,,,,,,,0out_0in,othdiscr -52897570,1290184,othdiscr,3,2,2,4,non_mandatory,1,241.0,604.0,441594,185.0,21.0,22.0,1.0,,13.194495693775464,BIKE,0.24171550040319387,,,,,,,,,0out_0in,othdiscr -52897571,1290184,othdiscr,3,3,3,4,non_mandatory,1,161.0,604.0,441594,189.0,23.0,23.0,0.0,,13.296474125502314,WALK_TRANSIT,0.4347637705876943,1238.0,1516.0,fastest,1516.0,1238.0,fastest,,,0out_1in,othdiscr -52897583,1290184,work,1,1,1,1,mandatory,1,485.0,604.0,441594,82.0,9.0,21.0,12.0,,,DRIVEALONEFREE,-0.05454824120553145,,,,,,,business1,,0out_1in,work -52915705,1290626,work,1,1,1,1,mandatory,1,578.0,608.0,442036,64.0,8.0,18.0,10.0,,,WALK_TRANSIT,1.4241213520203073,1500.0,1584.0,cheapest,1584.0,1500.0,fastest,no_subtours,,1out_0in,work -76379130,1862905,othdiscr,1,1,1,1,non_mandatory,1,494.0,468.0,721960,151.0,15.0,21.0,6.0,,13.046748957111205,BIKE,0.6336063394695053,,,,,,,,,0out_0in,othdiscr -76379171,1862906,othdiscr,1,1,1,1,non_mandatory,1,500.0,468.0,721960,104.0,11.0,16.0,5.0,,13.068781711323547,WALK,0.6246154614956402,,,,,,,,,0out_0in,othdiscr -80946571,1974306,othdiscr,1,1,1,1,non_mandatory,1,72.0,17.0,760593,62.0,8.0,16.0,8.0,,13.609717599377104,WALK,1.2877790624306358,,,,,,,,,0out_0in,othdiscr -80946591,1974307,eat,1,1,1,1,atwork,1,324.0,204.0,760593,113.0,12.0,13.0,1.0,,18.711722700372903,SHARED3FREE,-1.1971419362787026,,,,,,,,80946626.0,0out_0in,atwork -80946626,1974307,work,1,1,1,1,mandatory,1,204.0,17.0,760593,79.0,9.0,18.0,9.0,,,WALK_TRANSIT,0.9146373054676561,1748.0,1603.0,shortest,1603.0,1748.0,shortest,eat,,1out_1in,work -80946637,1974308,escort,1,1,1,1,non_mandatory,1,224.0,17.0,760593,0.0,5.0,5.0,0.0,,12.668506375964688,SHARED3FREE,-0.8073126044826399,,,,,,,,,0out_0in,escort -81048440,1976791,escort,1,1,1,1,non_mandatory,1,483.0,121.0,761445,124.0,13.0,13.0,0.0,,12.714018122401603,SHARED2FREE,-0.7186631667509052,,,,,,,,,0out_0in,escort 
-81048508,1976792,social,1,1,1,1,non_mandatory,1,494.0,121.0,761445,140.0,14.0,19.0,5.0,,12.159236859223718,DRIVEALONEFREE,0.5721200907291696,,,,,,,,,0out_0in,social -81048511,1976792,work,1,1,1,1,mandatory,1,25.0,121.0,761445,7.0,5.0,12.0,7.0,,,WALK_TRANSIT,0.4973097110196049,1681.0,1238.0,fastest,1238.0,1681.0,fastest,no_subtours,,0out_0in,work -81130344,1978788,social,1,1,1,1,non_mandatory,1,271.0,469.0,762159,66.0,8.0,20.0,12.0,,12.430877422027173,SHARED2FREE,1.1233925360416968,,,,,,,,,0out_0in,social -81130399,1978790,escort,1,1,1,1,non_mandatory,1,500.0,469.0,762159,54.0,8.0,8.0,0.0,,12.604750255865364,SHARED2FREE,-0.7891363476365268,,,,,,,,,0out_0in,escort -81130429,1978790,work,1,1,1,1,mandatory,1,180.0,469.0,762159,63.0,8.0,17.0,9.0,,,DRIVEALONEFREE,0.7319548477569705,,,,,,,no_subtours,,0out_0in,work -81130470,1978791,work,1,1,1,1,mandatory,1,578.0,469.0,762159,47.0,7.0,17.0,10.0,,,WALK_TRANSIT,1.2232315365174313,1500.0,1762.0,shortest,1762.0,1500.0,shortest,no_subtours,,0out_0in,work -102419958,2498047,school,1,1,1,1,mandatory,1,432.0,238.0,922602,77.0,9.0,16.0,7.0,,,WALK_TRANSIT,4.547714059900562,1673.0,1608.0,shortest,1608.0,1673.0,fastest,,,0out_0in,school -102420007,2498048,work,1,1,1,1,mandatory,1,243.0,238.0,922602,46.0,7.0,16.0,9.0,,,WALK_TRANSIT,2.571846550195381,1592.0,1612.0,shortest,1612.0,1592.0,shortest,no_subtours,,0out_0in,work -102420048,2498049,work,1,1,1,1,mandatory,1,271.0,238.0,922602,29.0,6.0,16.0,10.0,,,WALK,2.3529244532782645,,,,,,,no_subtours,,0out_0in,work -107509903,2622192,school,1,1,1,1,mandatory,1,501.0,533.0,952720,44.0,7.0,14.0,7.0,,,WALK_TRANSIT,0.7689513133515259,1664.0,1666.0,fastest,1666.0,1664.0,shortest,,,0out_0in,school -107509922,2622193,escort,1,1,1,2,non_mandatory,1,281.0,533.0,952720,19.0,6.0,6.0,0.0,,13.120197714243572,WALK,-0.6633544461803778,,,,,,,,,0out_0in,escort -107509941,2622193,othmaint,1,1,2,2,non_mandatory,1,58.0,533.0,952720,61.0,8.0,15.0,7.0,,13.296284550786224,WALK_TRANSIT,0.6229120227429399,1238.0,1666.0,fastest,1666.0,1238.0,shortest,,,0out_0in,othmaint -107509987,2622194,shopping,1,1,1,1,non_mandatory,1,504.0,533.0,952720,74.0,9.0,13.0,4.0,,12.281026585137033,SHARED3FREE,0.8228058752894183,,,,,,,,,0out_0in,shopping -107510034,2622195,work,1,1,1,1,mandatory,1,578.0,533.0,952720,63.0,8.0,17.0,9.0,,,WALK_TRANSIT,1.4377271736671302,1500.0,1666.0,fastest,1666.0,1500.0,shortest,no_subtours,,0out_0in,work -116640406,2844887,work,1,1,1,1,mandatory,1,385.0,354.0,1028031,108.0,11.0,20.0,9.0,,,WALK_TRANSIT,0.6883028572332556,1754.0,1695.0,shortest,1748.0,1754.0,fastest,no_subtours,,0out_0in,work -120287676,2933845,school,1,1,1,1,mandatory,1,197.0,82.0,1048898,121.0,12.0,21.0,9.0,,,WALK_TRANSIT,-0.05133861963844917,1608.0,1559.0,shortest,1559.0,1608.0,fastest,,,0out_0in,school -120287717,2933846,school,1,1,1,1,mandatory,1,23.0,82.0,1048898,62.0,8.0,16.0,8.0,,,SHARED2FREE,-1.0238153653421556,,,,,,,,,0out_0in,school -120287752,2933847,othdiscr,1,1,1,1,non_mandatory,1,165.0,82.0,1048898,42.0,7.0,12.0,5.0,,12.971588539704879,WALK_TRANSIT,0.8435740874298338,1238.0,1559.0,shortest,1559.0,1238.0,fastest,,,0out_1in,othdiscr -120287807,2933848,work,1,1,1,1,mandatory,1,10.0,82.0,1048898,31.0,6.0,18.0,12.0,,,WALK_TRANSIT,0.3435861763727887,1333.0,1559.0,fastest,1559.0,1333.0,shortest,no_subtours,,0out_0in,work -131881533,3216622,school,1,1,1,1,mandatory,1,582.0,584.0,1148260,136.0,14.0,15.0,1.0,,,WALK_TRANSIT,10.730879630807463,1500.0,1516.0,fastest,1516.0,1500.0,shortest,,,0out_0in,univ 
+person_id,tour_type,tour_type_count,tour_type_num,tour_num,tour_count,tour_category,number_of_participants,destination,origin,household_id,tdd,start,end,duration,composition,destination_logsum,tour_mode,mode_choice_logsum,od_atap,od_btap,od_path_set,do_atap,do_btap,do_path_set,atwork_subtour_frequency,parent_tour_id,stop_frequency,primary_purpose,tour_id +33146,work,1,1,1,1,mandatory,1,1100,560,12593,12,5,17,12,,,DRIVEALONEFREE,1.0261372219238083,,,,,,,no_subtours,,2out_0in,work,1359025 +33147,work,1,1,1,1,mandatory,1,1070,560,12593,61,8,15,7,,,WALK_TRANSIT,2.96683804953191,1500.0,1558.0,fastest,1558.0,1500.0,fastest,no_subtours,,0out_1in,work,1359066 +36454,shopping,2,1,1,2,non_mandatory,1,725,580,13797,128,13,17,4,,11.688295162318989,DRIVEALONEFREE,0.185754815801926,,,,,,,,,0out_1in,shopping,1494647 +36454,shopping,2,2,2,2,non_mandatory,1,729,580,13797,169,18,18,0,,11.689029568940938,DRIVEALONEFREE,0.443644139698639,,,,,,,,,1out_0in,shopping,1494648 +36455,othdiscr,1,1,1,1,non_mandatory,1,623,580,13797,159,16,21,5,,13.239720693390336,SHARED2FREE,0.738377038878971,,,,,,,,,0out_2in,othdiscr,1494680 +36455,work,1,1,1,1,mandatory,1,562,580,13797,58,8,12,4,,,SHARED3FREE,0.8851264867711228,,,,,,,no_subtours,,0out_0in,work,1494694 +41705,eatout,1,1,1,1,non_mandatory,1,1070,645,15777,76,9,15,6,,12.070246786563017,WALK_TRANSIT,0.4629895150090414,1500.0,1611.0,fastest,1611.0,1500.0,fastest,,,0out_0in,eatout,1709911 +41706,eat,1,1,1,1,atwork,1,989,854,15777,112,12,12,0,,18.50954665100095,WALK_TRANSIT,-0.6180823333381719,1666.0,1662.0,shortest,1662.0,1666.0,shortest,,1709985.0,0out_0in,atwork,1709950 +41706,work,1,1,1,1,mandatory,1,854,645,15777,65,8,19,11,,,SHARED2FREE,0.613674174636675,,,,,,,eat,,0out_1in,work,1709985 +50035,eatout,1,1,1,1,joint,3,1070,757,18261,156,16,18,2,mixed,11.85828127629614,SHARED3FREE,-5.832306485220664,,,,,,,,,1out_0in,eatout,2051448 +50035,school,1,1,1,1,mandatory,1,919,757,18261,43,7,13,6,,,WALK_TRANSIT,1.2158966517539809,1618.0,1748.0,shortest,1748.0,1618.0,shortest,,,0out_0in,school,2051466 +50035,shopping,1,1,1,1,non_mandatory,1,996,757,18261,175,19,19,0,,11.577466521097683,DRIVEALONEFREE,-0.1846581633337935,,,,,,,,,0out_0in,shopping,2051468 +50036,othmaint,1,1,1,1,non_mandatory,1,877,757,18261,44,7,14,7,,12.412335687839308,SHARED2FREE,0.0818069109613803,,,,,,,,,0out_0in,othmaint,2051504 +50037,work,1,1,1,1,mandatory,1,1070,757,18261,46,7,16,9,,,WALK_TRANSIT,1.2858135302346076,1500.0,1651.0,shortest,1651.0,1500.0,fastest,no_subtours,,0out_0in,work,2051556 +55338,school,1,1,1,1,mandatory,1,699,794,19758,40,7,10,3,,,WALK_TRANSIT,1.426179634142238,1608.0,1652.0,cheapest,1652.0,1608.0,shortest,,,0out_0in,school,2268889 +55339,work,1,1,1,1,mandatory,1,762,794,19758,11,5,16,11,,,DRIVEALONEFREE,0.6319219718888117,,,,,,,no_subtours,,0out_0in,work,2268938 +57897,work,1,1,1,1,mandatory,1,1070,829,20552,50,7,20,13,,,WALK_TRANSIT,1.815253620726848,1500.0,1754.0,fastest,1754.0,1500.0,fastest,no_subtours,,0out_0in,work,2373816 +57898,business,1,1,1,1,atwork,1,948,948,20552,101,11,13,2,,10.856328670591866,WALK,1.0849001811607144,,,,,,,,2373857.0,1out_0in,atwork,2373818 +57898,work,1,1,1,1,mandatory,1,948,829,20552,105,11,17,6,,,DRIVEALONEFREE,0.2266199059946182,,,,,,,business1,,0out_1in,work,2373857 +57899,work,1,1,1,1,mandatory,1,687,829,20552,47,7,17,10,,,WALK,1.4925958516365831,,,,,,,no_subtours,,0out_0in,work,2373898 +57901,work,2,1,1,2,mandatory,1,708,829,20552,30,6,17,11,,,DRIVEALONEFREE,0.5418945770189671,,,,,,,no_subtours,,0out_0in,work,2373980 
+57901,work,2,2,2,2,mandatory,1,708,829,20552,175,19,19,0,,,SHARED2FREE,0.7907156266134642,,,,,,,no_subtours,,1out_0in,work,2373981 +62531,school,1,1,1,1,mandatory,1,938,900,21869,180,20,20,0,,,WALK_TRANSIT,1.0406873179045706,1673.0,1633.0,shortest,1633.0,1673.0,fastest,,,0out_0in,school,2563802 +62532,escort,1,1,1,1,non_mandatory,1,647,900,21869,20,6,7,1,,12.49976194812137,SHARED2FREE,-1.2299200748899617,,,,,,,,,0out_0in,escort,2563821 +62533,escort,3,1,1,4,non_mandatory,1,695,900,21869,1,5,6,1,,12.55798878217292,SHARED3FREE,-1.221458591960288,,,,,,,,,0out_3in,escort,2563862 +62533,escort,3,2,2,4,non_mandatory,1,518,900,21869,99,11,11,0,,12.502003827155296,SHARED3FREE,-1.66067049305692,,,,,,,,,0out_0in,escort,2563863 +62533,escort,3,3,3,4,non_mandatory,1,844,900,21869,135,14,14,0,,12.530459798779978,SHARED2FREE,-1.4032965315757244,,,,,,,,,0out_0in,escort,2563864 +62533,othdiscr,1,1,4,4,non_mandatory,1,1070,900,21869,100,11,12,1,,13.216136797297771,WALK_TRANSIT,1.0118533227067692,1500.0,73.0,fastest,73.0,1500.0,shortest,,,0out_0in,othdiscr,2563878 +62534,school,1,1,1,1,mandatory,1,793,900,21869,55,8,9,1,,,WALK_TRANSIT,-0.605105363231441,1748.0,73.0,shortest,73.0,1748.0,fastest,,,0out_0in,school,2563925 +67999,escort,1,1,1,2,non_mandatory,1,767,973,23619,124,13,13,0,,12.970512717862755,SHARED3FREE,-0.7145790466379958,,,,,,,,,0out_2in,escort,2787968 +67999,social,1,1,2,2,non_mandatory,1,988,973,23619,165,17,20,3,,12.777797143559908,WALK,1.394261833722182,,,,,,,,,0out_0in,social,2787995 +68000,work,1,1,1,1,mandatory,1,1070,973,23619,51,7,21,14,,,WALK_TRANSIT,1.4899770411640134,1500.0,1588.0,shortest,1588.0,1500.0,shortest,no_subtours,,1out_0in,work,2788039 +78977,school,1,1,1,1,mandatory,1,984,1081,26897,44,7,14,7,,,WALK,1.0391837359866254,,,,,,,,,0out_0in,school,3238088 +78979,eat,1,1,1,1,atwork,1,1070,897,26897,72,9,11,2,,18.50007970900582,DRIVEALONEFREE,-0.2864770605138423,,,,,,,,3238178.0,0out_0in,atwork,3238143 +78979,work,1,1,1,1,mandatory,1,897,1081,26897,48,7,18,11,,,DRIVEALONEFREE,0.5747485518145101,,,,,,,eat,,0out_0in,work,3238178 +1283602,work,1,1,1,1,mandatory,1,1078,521,435012,64,8,18,10,,,WALK_TRANSIT,5.710266024459359,1584.0,1238.0,shortest,1608.0,1584.0,fastest,no_subtours,,0out_0in,work,52627721 +1283868,business,1,1,1,1,atwork,1,1070,1070,435278,112,12,12,0,,19.38117027561944,WALK,0.6487888616012731,,,,,,,,52638627.0,0out_0in,atwork,52638588 +1283868,eatout,1,1,1,1,non_mandatory,1,1070,537,435278,172,18,21,3,,11.755720899317405,WALK_TRANSIT,0.6406085296048486,1500.0,1558.0,fastest,1558.0,1500.0,shortest,,,0out_1in,eatout,52638594 +1283868,work,1,1,1,1,mandatory,1,1070,537,435278,79,9,18,9,,,WALK_TRANSIT,1.206681423660966,1500.0,1558.0,shortest,1558.0,1500.0,shortest,business1,,0out_0in,work,52638627 +1283946,work,1,1,1,1,mandatory,1,1070,523,435356,49,7,19,12,,,WALK_TRANSIT,1.1978831616543206,1500.0,1604.0,cheapest,1604.0,1500.0,shortest,no_subtours,,0out_0in,work,52641825 +1284598,work,1,1,1,1,mandatory,1,553,562,436008,80,9,19,10,,,DRIVEALONEFREE,0.7312528815647517,,,,,,,no_subtours,,0out_0in,work,52668557 +1286215,eat,1,1,1,1,atwork,1,1077,606,437625,88,10,13,3,,18.149830935609163,WALK_TRANSIT,-0.8798916104770301,1500.0,1562.0,shortest,1562.0,1500.0,cheapest,,52734854.0,0out_0in,atwork,52734819 +1286215,work,1,1,1,1,mandatory,1,606,656,437625,65,8,19,11,,,DRIVEALONEFREE,0.8985332322364314,,,,,,,eat,,0out_2in,work,52734854 
+1290184,business,1,1,1,1,atwork,1,732,977,441594,99,11,11,0,,18.610370946991612,SHARED2FREE,0.4316303329603235,,,,,,,,52897583.0,0out_0in,atwork,52897544 +1290184,eatout,1,1,4,4,non_mandatory,1,949,1096,441594,184,21,21,0,,11.904706692046382,SHARED2FREE,-0.1665710953231831,,,,,,,,,0out_1in,eatout,52897550 +1290184,othdiscr,3,1,1,4,non_mandatory,1,684,1096,441594,2,5,7,2,,12.918529902113356,DRIVEALONEFREE,0.2317787046274241,,,,,,,,,0out_0in,othdiscr,52897569 +1290184,othdiscr,3,2,2,4,non_mandatory,1,762,1096,441594,185,21,22,1,,12.959861282775266,WALK_TRANSIT,0.2522845990878585,1748.0,1516.0,fastest,1516.0,1748.0,shortest,,,0out_0in,othdiscr,52897570 +1290184,othdiscr,3,3,3,4,non_mandatory,1,725,1096,441594,189,23,23,0,,13.001363665570665,WALK,0.3854413463230495,,,,,,,,,0out_0in,othdiscr,52897571 +1290184,work,1,1,1,1,mandatory,1,977,1096,441594,82,9,21,12,,,DRIVEALONEFREE,-0.0423820492992088,,,,,,,business1,,0out_1in,work,52897583 +1290626,work,1,1,1,1,mandatory,1,1070,1100,442036,64,8,18,10,,,WALK_TRANSIT,1.4095982213888998,1500.0,1584.0,cheapest,1584.0,1500.0,fastest,no_subtours,,1out_0in,work,52915705 +1862905,othdiscr,1,1,1,1,non_mandatory,1,986,960,721960,151,15,21,6,,12.957346522442643,WALK_TRANSIT,0.3212522660901984,1618.0,1762.0,fastest,1762.0,1618.0,cheapest,,,0out_0in,othdiscr,76379130 +1862906,othdiscr,1,1,1,1,non_mandatory,1,992,960,721960,104,11,16,5,,12.9976638956563,SHARED3FREE,0.3134675853717786,,,,,,,,,0out_1in,othdiscr,76379171 +1974306,othdiscr,1,1,1,1,non_mandatory,1,564,509,760593,62,8,16,8,,13.57415728090829,WALK,1.2942555315594169,,,,,,,,,0out_0in,othdiscr,80946571 +1974307,eat,1,1,1,1,atwork,1,913,739,760593,113,12,13,1,,18.622653002301583,SHARED3FREE,-0.7672328773497228,,,,,,,,80946626.0,0out_0in,atwork,80946591 +1974307,work,1,1,1,1,mandatory,1,739,509,760593,79,9,18,9,,,WALK_TRANSIT,0.6201259698287482,1748.0,1603.0,fastest,1603.0,1748.0,fastest,eat,,1out_1in,work,80946626 +1974308,escort,1,1,1,1,non_mandatory,1,716,509,760593,0,5,5,0,,12.671799977139084,SHARED3FREE,-0.8180421450990675,,,,,,,,,0out_0in,escort,80946637 +1976791,escort,1,1,1,1,non_mandatory,1,908,613,761445,124,13,13,0,,12.82099191373982,SHARED2FREE,-0.9528167859302504,,,,,,,,,0out_0in,escort,81048440 +1976792,eat,1,1,1,1,atwork,1,517,517,761445,70,9,9,0,,17.910037480926228,SHARED2FREE,0.7181216397530412,,,,,,,,81048511.0,0out_0in,atwork,81048476 +1976792,eatout,1,1,1,1,non_mandatory,1,648,613,761445,130,13,19,6,,11.955235691700665,DRIVEALONEFREE,1.3804088714164349,,,,,,,,,0out_0in,eatout,81048478 +1976792,work,1,1,1,1,mandatory,1,517,613,761445,7,5,12,7,,,WALK_TRANSIT,0.5051019359212883,1681.0,1238.0,fastest,1238.0,1681.0,fastest,eat,,0out_0in,work,81048511 +1978788,social,1,1,1,1,non_mandatory,1,763,961,762159,66,8,20,12,,12.393316604403664,SHARED2FREE,1.1233925360416968,,,,,,,,,0out_0in,social,81130344 +1978790,escort,1,1,1,1,non_mandatory,1,992,961,762159,54,8,8,0,,12.58022321361327,SHARED2FREE,-0.9361958649826466,,,,,,,,,0out_0in,escort,81130399 +1978790,work,1,1,1,1,mandatory,1,672,961,762159,63,8,17,9,,,DRIVEALONEFREE,0.7319548477569705,,,,,,,no_subtours,,0out_0in,work,81130429 +1978791,work,1,1,1,1,mandatory,1,1070,961,762159,47,7,17,10,,,WALK_TRANSIT,1.223231536517431,1500.0,1762.0,shortest,1762.0,1500.0,shortest,no_subtours,,0out_0in,work,81130470 +2498047,school,1,1,1,1,mandatory,1,865,730,922602,77,9,16,7,,,WALK_TRANSIT,4.515115660099912,1698.0,1608.0,shortest,1608.0,1698.0,fastest,,,0out_0in,school,102419958 
+2498048,work,1,1,1,1,mandatory,1,735,730,922602,46,7,16,9,,,WALK_TRANSIT,2.37613290143943,1592.0,1612.0,shortest,1612.0,1592.0,shortest,no_subtours,,0out_0in,work,102420007 +2498049,work,1,1,1,1,mandatory,1,763,730,922602,29,6,16,10,,,WALK,2.522839906360249,,,,,,,no_subtours,,0out_0in,work,102420048 +2622192,school,1,1,1,1,mandatory,1,995,1025,952720,44,7,14,7,,,WALK_TRANSIT,0.7240925397746193,1664.0,1666.0,fastest,1666.0,1664.0,shortest,,,0out_0in,school,107509903 +2622193,escort,1,1,1,2,non_mandatory,1,773,1025,952720,19,6,6,0,,12.840047018957565,SHARED3FREE,-0.7318852867656347,,,,,,,,,0out_0in,escort,107509922 +2622193,othmaint,1,1,2,2,non_mandatory,1,550,1025,952720,61,8,15,7,,12.86750275018236,WALK_TRANSIT,0.3002279969733954,1238.0,1666.0,fastest,1666.0,1238.0,shortest,,,0out_0in,othmaint,107509941 +2622194,shopping,1,1,1,1,non_mandatory,1,989,1025,952720,72,9,11,2,,11.974087902319576,SHARED3FREE,0.5678792550825605,,,,,,,,,0out_0in,shopping,107509987 +2622195,work,1,1,1,1,mandatory,1,1021,1025,952720,63,8,17,9,,,DRIVEALONEFREE,0.8640931880479102,,,,,,,no_subtours,,0out_0in,work,107510034 +2844887,work,1,1,1,1,mandatory,1,845,846,1028031,111,11,23,12,,,WALK_TRANSIT,0.9802398468480144,1730.0,1695.0,shortest,1695.0,1748.0,fastest,no_subtours,,0out_0in,work,116640406 +2933845,school,1,1,1,1,mandatory,1,666,574,1048898,121,12,21,9,,,WALK,-0.0074452524495141,,,,,,,,,0out_0in,school,120287676 +2933846,school,1,1,1,1,mandatory,1,515,574,1048898,62,8,16,8,,,SHARED2FREE,-1.016021399976598,,,,,,,,,0out_0in,school,120287717 +2933847,othdiscr,1,1,1,1,non_mandatory,1,657,574,1048898,42,7,12,5,,13.002449945790438,WALK_TRANSIT,0.8636594074657713,1238.0,1559.0,shortest,1559.0,1238.0,fastest,,,0out_1in,othdiscr,120287752 +2933848,work,1,1,1,1,mandatory,1,502,574,1048898,31,6,18,12,,,WALK_TRANSIT,0.3885773372794716,1333.0,1559.0,fastest,1559.0,1333.0,shortest,no_subtours,,0out_0in,work,120287807 +3216622,school,1,1,1,1,mandatory,1,1074,1076,1148260,136,14,15,1,,,WALK_TRANSIT,10.725339744357893,1500.0,1516.0,fastest,1516.0,1500.0,shortest,,,0out_0in,univ,131881533 diff --git a/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_trips.csv b/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_trips.csv index 8c93f8ec82..48165dcd5e 100644 --- a/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_trips.csv +++ b/activitysim/examples/placeholder_sandag/test/regress/final_3_zone_trips.csv @@ -1,183 +1,185 @@ -trip_id,person_id,household_id,primary_purpose,trip_num,outbound,trip_count,destination,origin,tour_id,purpose,destination_logsum,depart,trip_mode,mode_choice_logsum,atap,btap,path_set -10872201,33146,12593,work,1,True,3,350,68,1359025,escort,11.62710514749821,5,DRIVEALONEFREE,-0.3153159028910412,,, -10872202,33146,12593,work,2,True,3,491,350,1359025,escort,12.424454211009824,7,DRIVEALONEFREE,-0.1330752275781039,,, -10872203,33146,12593,work,3,True,3,608,491,1359025,work,,7,DRIVEALONEFREE,0.1534821760951137,,, -10872205,33146,12593,work,1,False,1,68,608,1359025,home,,17,WALK,-0.3102996602636911,,, -10872529,33147,12593,work,1,True,1,578,68,1359066,work,,8,WALK_TRANSIT,2.7167630046939766,1500.0,1558.0,shortest -10872533,33147,12593,work,1,False,2,578,578,1359066,escort,19.372183137979995,15,WALK,2.9941670645943352,,, -10872534,33147,12593,work,2,False,2,68,578,1359066,home,,15,WALK,1.93207085094047,,, -11957177,36454,13797,shopping,1,True,1,233,88,1494647,shopping,,13,DRIVEALONEFREE,0.1611535351527045,,, 
-11957181,36454,13797,shopping,1,False,2,423,233,1494647,shopping,12.025184457266546,17,TAXI,-0.0282876166283608,,, -11957182,36454,13797,shopping,2,False,2,88,423,1494647,home,,17,DRIVEALONEFREE,0.0612368483543265,,, -11957185,36454,13797,shopping,1,True,2,195,88,1494648,escort,13.40305488007496,18,WALK,0.3896474852723683,,, -11957186,36454,13797,shopping,2,True,2,237,195,1494648,shopping,,18,DRIVEALONEFREE,0.4918839198395139,,, -11957189,36454,13797,shopping,1,False,1,88,237,1494648,home,,18,TAXI,0.2042092214903569,,, -11957441,36455,13797,othdiscr,1,True,1,131,88,1494680,othdiscr,,16,DRIVEALONEFREE,0.1718894334261086,,, -11957445,36455,13797,othdiscr,1,False,3,578,131,1494680,othmaint,12.886372901931878,21,TAXI,-0.1803784427916115,,, -11957446,36455,13797,othdiscr,2,False,3,578,578,1494680,social,14.222827339077435,21,WALK,1.1995504881185008,,, -11957447,36455,13797,othdiscr,3,False,3,88,578,1494680,home,,21,TAXI,0.0855272375513731,,, -11957553,36455,13797,work,1,True,1,70,88,1494694,work,,8,WALK,0.2791473443211477,,, -11957557,36455,13797,work,1,False,1,88,70,1494694,home,,12,WALK,0.2791984027393967,,, -13679289,41705,15777,eatout,1,True,1,578,153,1709911,eatout,,9,WALK_TRANSIT,3.0405322129710664,1500.0,1611.0,fastest -13679293,41705,15777,eatout,1,False,1,153,578,1709911,home,,15,TAXI,1.1068614998173167,,, -13679601,41706,15777,atwork,1,True,1,497,362,1709950,atwork,,12,WALK_TRANSIT,1.7190241715037766,1618.0,1748.0,fastest -13679605,41706,15777,atwork,1,False,1,362,497,1709950,work,,12,SHARED2FREE,1.0527994338149784,,, -13679881,41706,15777,work,1,True,1,362,153,1709985,work,,8,SHARED2FREE,0.5758380783277821,,, -13679885,41706,15777,work,1,False,2,597,362,1709985,escort,14.841057893338451,18,SHARED2FREE,0.4353503862435287,,, -13679886,41706,15777,work,2,False,2,153,597,1709985,home,,19,SHARED2FREE,0.4996880766938729,,, -16411585,50035,18261,eatout,1,True,2,498,265,2051448,shopping,10.05333950335537,16,SHARED3FREE,-0.6138900950914048,,, -16411586,50035,18261,eatout,2,True,2,578,498,2051448,eatout,,17,WALK,-0.1273881390654513,,, -16411589,50035,18261,eatout,1,False,1,265,578,2051448,home,,18,SHARED3FREE,-1.5064572485047776,,, -16411729,50035,18261,school,1,True,1,427,265,2051466,school,,7,SHARED3FREE,0.7485359100139714,,, -16411733,50035,18261,school,1,False,1,265,427,2051466,home,,13,WALK_TRANSIT,0.8841733805272283,1748.0,1618.0,fastest -16411745,50035,18261,shopping,1,True,1,504,265,2051468,shopping,,19,DRIVEALONEFREE,-0.4672462178894699,,, -16411749,50035,18261,shopping,1,False,1,265,504,2051468,home,,19,DRIVEALONEFREE,-0.4597066008562396,,, -16412033,50036,18261,othmaint,1,True,1,385,265,2051504,othmaint,,7,SHARED2FREE,0.8019298221360518,,, -16412037,50036,18261,othmaint,1,False,1,265,385,2051504,home,,14,SHARED2FREE,0.7815669264302088,,, -16412449,50037,18261,work,1,True,1,578,265,2051556,work,,7,SHARED2FREE,2.569476979448462,,, -16412453,50037,18261,work,1,False,1,265,578,2051556,home,,16,WALK_TRANSIT,1.0586461337073116,1651.0,1500.0,fastest -18151113,55338,19758,school,1,True,1,207,302,2268889,school,,7,WALK_TRANSIT,1.3220566998920782,1608.0,1652.0,cheapest -18151117,55338,19758,school,1,False,1,302,207,2268889,home,,10,SHARED2FREE,1.0450588497025504,,, -18151505,55339,19758,work,1,True,1,254,302,2268938,work,,5,DRIVEALONEFREE,0.1112722333894017,,, -18151509,55339,19758,work,1,False,1,302,254,2268938,home,,16,DRIVEALONEFREE,0.1112770043068081,,, -18990529,57897,20552,work,1,True,1,578,337,2373816,work,,7,WALK_TRANSIT,2.167693637863771,1500.0,1754.0,fastest 
-18990533,57897,20552,work,1,False,1,337,578,2373816,home,,20,SHARED2FREE,0.8134213565684749,,, -18990545,57898,20552,atwork,1,True,2,456,456,2373818,othmaint,6.956302743667657,11,WALK,-0.5725554350319478,,, -18990546,57898,20552,atwork,2,True,2,456,456,2373818,atwork,,11,WALK,-0.5725554350319478,,, -18990549,57898,20552,atwork,1,False,1,456,456,2373818,work,,13,WALK,-0.5725554350319478,,, -18990857,57898,20552,work,1,True,1,456,337,2373857,work,,11,DRIVEALONEFREE,-0.4249793219573366,,, -18990861,57898,20552,work,1,False,2,350,456,2373857,escort,11.522885117323073,17,DRIVEALONEFREE,-0.4224341222895262,,, -18990862,57898,20552,work,2,False,2,337,350,2373857,home,,17,DRIVEALONEFREE,-0.0678724579075491,,, -18991185,57899,20552,work,1,True,1,195,337,2373898,work,,7,WALK,0.497663225159644,,, -18991189,57899,20552,work,1,False,1,337,195,2373898,home,,17,WALK,0.4231764292042089,,, -18991841,57901,20552,work,1,True,1,216,337,2373980,work,,6,DRIVEALONEFREE,-0.2156500818117738,,, -18991845,57901,20552,work,1,False,1,337,216,2373980,home,,17,DRIVEALONEFREE,-0.8860210329051176,,, -18991849,57901,20552,work,1,True,2,195,337,2373981,othmaint,13.258114756000287,19,WALK,1.189890260596198,,, -18991850,57901,20552,work,2,True,2,216,195,2373981,work,,19,SHARED2FREE,0.5495070616058875,,, -18991853,57901,20552,work,1,False,1,337,216,2373981,home,,19,DRIVEALONEFREE,0.2682235453386703,,, -20510417,62531,21869,school,1,True,1,442,408,2563802,school,,20,TAXI,0.9918167280007733,,, -20510421,62531,21869,school,1,False,1,408,442,2563802,home,,20,SHARED3FREE,0.9273027824432368,,, -20510569,62532,21869,escort,1,True,1,155,408,2563821,escort,,6,SHARED2FREE,0.335383128230131,,, -20510573,62532,21869,escort,1,False,1,408,155,2563821,home,,7,DRIVEALONEFREE,0.3214092822815929,,, -20510897,62533,21869,escort,1,True,1,203,408,2563862,escort,,5,SHARED3FREE,0.7146320921116115,,, -20510901,62533,21869,escort,1,False,4,504,203,2563862,shopping,13.905386682389064,6,SHARED3FREE,0.8499729610956421,,, -20510902,62533,21869,escort,2,False,4,73,504,2563862,eatout,14.52262097584395,6,SHARED2FREE,0.5819032445094066,,, -20510903,62533,21869,escort,3,False,4,607,73,2563862,escort,14.887504187131842,6,SHARED3FREE,0.5359149885298558,,, -20510904,62533,21869,escort,4,False,4,408,607,2563862,home,,6,SHARED2FREE,0.8747878537896981,,, -20510905,62533,21869,escort,1,True,1,26,408,2563863,escort,,11,SHARED2FREE,0.403734784317824,,, -20510909,62533,21869,escort,1,False,1,408,26,2563863,home,,11,SHARED3FREE,0.4094721213082503,,, -20510913,62533,21869,escort,1,True,1,352,408,2563864,escort,,14,SHARED2FREE,0.2145990810721203,,, -20510917,62533,21869,escort,1,False,1,408,352,2563864,home,,14,SHARED2FREE,0.2175421651558981,,, -20511025,62533,21869,othdiscr,1,True,1,578,408,2563878,othdiscr,,11,WALK_TRANSIT,3.124262835713407,1500.0,73.0,fastest -20511029,62533,21869,othdiscr,1,False,1,408,578,2563878,home,,12,WALK_TRANSIT,1.320541626578869,73.0,1500.0,fastest -20511401,62534,21869,school,1,True,1,301,408,2563925,school,,8,SHARED3FREE,0.6297085882553292,,, -20511405,62534,21869,school,1,False,1,408,301,2563925,home,,9,SHARED3FREE,0.5963697573192646,,, -22303745,67999,23619,escort,1,True,1,269,481,2787968,escort,,13,DRIVEALONEFREE,0.9471497272884896,,, -22303749,67999,23619,escort,1,False,3,514,269,2787968,eatout,15.292255065772112,13,DRIVEALONEFREE,1.0997559458267394,,, -22303750,67999,23619,escort,2,False,3,496,514,2787968,escort,16.470852149311927,13,DRIVEALONEFREE,0.9919349481465494,,, 
-22303751,67999,23619,escort,3,False,3,481,496,2787968,home,,13,SHARED2FREE,1.0470193699425838,,, -22303961,67999,23619,social,1,True,1,496,481,2787995,social,,17,WALK,-0.5194290001855648,,, -22303965,67999,23619,social,1,False,1,481,496,2787995,home,,20,WALK,-0.5194329974199648,,, -22304313,68000,23619,work,1,True,2,437,481,2788039,social,22.336188858055703,7,SHARED3FREE,1.245370428774317,,, -22304314,68000,23619,work,2,True,2,578,437,2788039,work,,8,WALK_TRANSIT,3.2620308996092504,1500.0,1717.0,fastest -22304317,68000,23619,work,1,False,1,481,578,2788039,home,,21,WALK_TRANSIT,1.5243469382674963,1588.0,1500.0,fastest -25904705,78977,26897,school,1,True,1,492,589,3238088,school,,7,WALK,1.1991023619246646,,, -25904709,78977,26897,school,1,False,1,589,492,3238088,home,,14,WALK_TRANSIT,1.1320593632124774,1584.0,1618.0,cheapest -25905145,78979,26897,atwork,1,True,1,578,405,3238143,atwork,,9,DRIVEALONEFREE,-0.0496509953086405,,, -25905149,78979,26897,atwork,1,False,1,405,578,3238143,work,,11,DRIVEALONEFREE,-0.0461032274958093,,, -25905425,78979,26897,work,1,True,1,405,589,3238178,work,,7,DRIVEALONEFREE,0.119565485964625,,, -25905429,78979,26897,work,1,False,1,589,405,3238178,home,,18,DRIVEALONEFREE,0.1179742410301269,,, -421021769,1283602,435012,work,1,True,1,586,29,52627721,work,,8,WALK_TRANSIT,1.8021163156603703,1584.0,1608.0,fastest -421021773,1283602,435012,work,1,False,1,29,586,52627721,home,,18,WALK_TRANSIT,0.8450040063014843,1238.0,1584.0,shortest -421108705,1283868,435278,atwork,1,True,1,578,578,52638588,atwork,,12,WALK,2.459897236353819,,, -421108709,1283868,435278,atwork,1,False,1,578,578,52638588,work,,12,WALK,2.459897236353819,,, -421108753,1283868,435278,eatout,1,True,1,578,45,52638594,eatout,,18,WALK_TRANSIT,3.4599905231494112,1500.0,1558.0,shortest -421108757,1283868,435278,eatout,1,False,2,531,578,52638594,shopping,16.971131751329413,21,WALK,2.4125169566233917,,, -421108758,1283868,435278,eatout,2,False,2,45,531,52638594,home,,21,SHARED2FREE,0.9951467720082716,,, -421109017,1283868,435278,work,1,True,1,578,45,52638627,work,,9,WALK_TRANSIT,2.556266300862197,1500.0,1558.0,cheapest -421109021,1283868,435278,work,1,False,1,45,578,52638627,home,,18,WALK,1.4769191108284825,,, -421134601,1283946,435356,work,1,True,1,578,31,52641825,work,,7,WALK_TRANSIT,2.564484302870523,1500.0,1604.0,shortest -421134605,1283946,435356,work,1,False,1,31,578,52641825,home,,19,TNC_SINGLE,1.2220841443471149,,, -421348457,1284598,436008,work,1,True,1,61,70,52668557,work,,9,WALK,0.38649989959854,,, -421348461,1284598,436008,work,1,False,1,70,61,52668557,home,,19,WALK,0.3859946435001655,,, -421878553,1286215,437625,atwork,1,True,1,585,114,52734819,atwork,,10,WALK_TRANSIT,1.081956268867972,1500.0,1562.0,cheapest -421878557,1286215,437625,atwork,1,False,1,114,585,52734819,work,,13,WALK_TRANSIT,1.6218413932594848,1562.0,1500.0,fastest -421878833,1286215,437625,work,1,True,1,114,164,52734854,work,,8,DRIVEALONEFREE,0.0108507049503425,,, -421878837,1286215,437625,work,1,False,3,505,114,52734854,othmaint,11.678285931399188,17,DRIVEALONEFREE,-0.2251500347921951,,, -421878838,1286215,437625,work,2,False,3,238,505,52734854,othmaint,11.653319784580876,19,DRIVEALONEFREE,-0.0918226143933905,,, -421878839,1286215,437625,work,3,False,3,164,238,52734854,home,,19,DRIVEALONEFREE,0.1142972915542419,,, -423180353,1290184,441594,atwork,1,True,1,240,485,52897544,atwork,,11,WALK,1.1616704858421698,,, -423180357,1290184,441594,atwork,1,False,1,485,240,52897544,work,,11,WALK,1.1616704548482053,,, 
-423180401,1290184,441594,eatout,1,True,1,347,604,52897550,eatout,,21,SHARED2FREE,0.1039931558703346,,, -423180405,1290184,441594,eatout,1,False,2,600,347,52897550,othmaint,13.35678663215775,21,DRIVEALONEFREE,0.1168155383854849,,, -423180406,1290184,441594,eatout,2,False,2,604,600,52897550,home,,21,WALK,0.9240213001163314,,, -423180553,1290184,441594,othdiscr,1,True,1,22,604,52897569,othdiscr,,5,WALK,-0.0748834380443437,,, -423180557,1290184,441594,othdiscr,1,False,1,604,22,52897569,home,,7,TAXI,-0.2702437909392063,,, -423180561,1290184,441594,othdiscr,1,True,1,241,604,52897570,othdiscr,,21,BIKE,-1.0261281159570592,,, -423180565,1290184,441594,othdiscr,1,False,1,604,241,52897570,home,,22,BIKE,-1.0364857021941367,,, -423180569,1290184,441594,othdiscr,1,True,1,161,604,52897571,othdiscr,,23,TAXI,0.93738966972432,,, -423180573,1290184,441594,othdiscr,1,False,2,232,161,52897571,eatout,15.99440404367919,23,SHARED2FREE,1.631333771028581,,, -423180574,1290184,441594,othdiscr,2,False,2,604,232,52897571,home,,23,WALK_TRANSIT,0.5700662414192166,1516.0,1608.0,shortest -423180665,1290184,441594,work,1,True,1,485,604,52897583,work,,9,DRIVEALONEFREE,-0.2074790120685403,,, -423180669,1290184,441594,work,1,False,2,504,485,52897583,shopping,10.648841698636746,20,WALK,0.2135574369121218,,, -423180670,1290184,441594,work,2,False,2,604,504,52897583,home,,21,DRIVEALONEFREE,-0.1688195503141319,,, -423325641,1290626,442036,work,1,True,2,578,608,52915705,work,22.793846679159277,8,WALK_TRANSIT,2.6947489444098363,1500.0,1584.0,cheapest -423325642,1290626,442036,work,2,True,2,578,578,52915705,work,,9,TAXI,2.9785405530718863,,, -423325645,1290626,442036,work,1,False,1,608,578,52915705,home,,18,WALK_TRANSIT,1.5204714046677252,1584.0,1500.0,shortest -611033041,1862905,721960,othdiscr,1,True,1,494,468,76379130,othdiscr,,15,BIKE,-0.0551141326390255,,, -611033045,1862905,721960,othdiscr,1,False,1,468,494,76379130,home,,21,BIKE,-0.0503449080980652,,, -611033369,1862906,721960,othdiscr,1,True,1,500,468,76379171,othdiscr,,11,WALK,-1.4043276286248658,,, -611033373,1862906,721960,othdiscr,1,False,1,468,500,76379171,home,,16,WALK,-1.3842194916923618,,, -647572569,1974306,760593,othdiscr,1,True,1,72,17,80946571,othdiscr,,8,WALK,0.0729983438076541,,, -647572573,1974306,760593,othdiscr,1,False,1,17,72,80946571,home,,16,WALK,0.0730661118420978,,, -647572729,1974307,760593,atwork,1,True,1,324,204,80946591,atwork,,12,SHARED3FREE,2.153561514861549,,, -647572733,1974307,760593,atwork,1,False,1,204,324,80946591,work,,13,SHARED3FREE,2.1604129937838445,,, -647573009,1974307,760593,work,1,True,2,244,17,80946626,shopping,16.850702372805465,9,WALK_TRANSIT,1.2419694217829829,1748.0,1603.0,shortest -647573010,1974307,760593,work,2,True,2,204,244,80946626,work,,10,WALK_TRANSIT,1.6771107500758995,1748.0,1660.0,shortest -647573013,1974307,760593,work,1,False,2,176,204,80946626,work,18.744021387869157,17,WALK_TRANSIT,1.7298767083660849,1333.0,1658.0,shortest -647573014,1974307,760593,work,2,False,2,17,176,80946626,home,,18,SHARED3FREE,1.5876337419644198,,, -647573097,1974308,760593,escort,1,True,1,224,17,80946637,escort,,5,SHARED3FREE,0.8922078793578596,,, -647573101,1974308,760593,escort,1,False,1,17,224,80946637,home,,5,SHARED3FREE,0.8942651098665495,,, -648387521,1976791,761445,escort,1,True,1,483,121,81048440,escort,,13,SHARED2FREE,0.5482474086094538,,, -648387525,1976791,761445,escort,1,False,1,121,483,81048440,home,,13,SHARED2FREE,0.5356122283077811,,, 
-648388065,1976792,761445,social,1,True,1,494,121,81048508,social,,14,DRIVEALONEFREE,0.4863963073134856,,, -648388069,1976792,761445,social,1,False,1,121,494,81048508,home,,19,TAXI,0.5958102526909501,,, -648388089,1976792,761445,work,1,True,1,25,121,81048511,work,,5,TAXI,1.5879637864872944,,, -648388093,1976792,761445,work,1,False,1,121,25,81048511,home,,12,SHARED2FREE,1.1814737849875496,,, -649042753,1978788,762159,social,1,True,1,271,469,81130344,social,,8,SHARED2FREE,0.3356357191681573,,, -649042757,1978788,762159,social,1,False,1,469,271,81130344,home,,20,SHARED2FREE,0.3505516593935055,,, -649043193,1978790,762159,escort,1,True,1,500,469,81130399,escort,,8,SHARED2FREE,0.5471590891799881,,, -649043197,1978790,762159,escort,1,False,1,469,500,81130399,home,,8,SHARED2FREE,0.5333524059928323,,, -649043433,1978790,762159,work,1,True,1,180,469,81130429,work,,8,DRIVEALONEFREE,-0.3341036694423639,,, -649043437,1978790,762159,work,1,False,1,469,180,81130429,home,,17,DRIVEALONEFREE,-0.3318966430958522,,, -649043761,1978791,762159,work,1,True,1,578,469,81130470,work,,7,WALK_TRANSIT,2.58470148067105,1500.0,1762.0,fastest -649043765,1978791,762159,work,1,False,1,469,578,81130470,home,,17,TAXI,1.0619367021498107,,, -819359665,2498047,922602,school,1,True,1,432,238,102419958,school,,9,TAXI,1.009935909154772,,, -819359669,2498047,922602,school,1,False,1,238,432,102419958,home,,16,SHARED2FREE,1.7812941984342985,,, -819360057,2498048,922602,work,1,True,1,243,238,102420007,work,,7,SHARED3FREE,1.630772738193186,,, -819360061,2498048,922602,work,1,False,1,238,243,102420007,home,,16,WALK_TRANSIT,1.9203484754329128,1612.0,1592.0,shortest -819360385,2498049,922602,work,1,True,1,271,238,102420048,work,,6,WALK,-1.149581787441591,,, -819360389,2498049,922602,work,1,False,1,238,271,102420048,home,,16,WALK,-1.1497709160537464,,, -860079225,2622192,952720,school,1,True,1,501,533,107509903,school,,7,TNC_SINGLE,1.6140702309701105,,, -860079229,2622192,952720,school,1,False,1,533,501,107509903,home,,14,SHARED3FREE,1.5475600453518767,,, -860079377,2622193,952720,escort,1,True,1,281,533,107509922,escort,,6,WALK,-0.5578556373070639,,, -860079381,2622193,952720,escort,1,False,1,533,281,107509922,home,,6,WALK,-0.5573194757596159,,, -860079529,2622193,952720,othmaint,1,True,1,58,533,107509941,othmaint,,8,TAXI,1.632612282318792,,, -860079533,2622193,952720,othmaint,1,False,1,533,58,107509941,home,,15,WALK_TRANSIT,1.504929950475204,1666.0,1238.0,shortest -860079897,2622194,952720,shopping,1,True,1,504,533,107509987,shopping,,9,SHARED3FREE,1.5569490328652331,,, -860079901,2622194,952720,shopping,1,False,1,533,504,107509987,home,,13,SHARED3FREE,1.5615906994764628,,, -860080273,2622195,952720,work,1,True,1,578,533,107510034,work,,8,WALK_TRANSIT,2.753496136291756,1500.0,1666.0,fastest -860080277,2622195,952720,work,1,False,1,533,578,107510034,home,,17,WALK,1.6442514073833965,,, -933123249,2844887,1028031,work,1,True,1,385,354,116640406,work,,11,SHARED3FREE,1.076717359011956,,, -933123253,2844887,1028031,work,1,False,1,354,385,116640406,home,,20,SHARED2FREE,1.076810290051006,,, -962301409,2933845,1048898,school,1,True,1,197,82,120287676,school,,12,WALK_TRANSIT,1.28199080402723,1608.0,1559.0,fastest -962301413,2933845,1048898,school,1,False,1,82,197,120287676,home,,21,WALK_TRANSIT,1.089904965843972,1559.0,1608.0,cheapest -962301737,2933846,1048898,school,1,True,1,23,82,120287717,school,,8,SHARED2FREE,-2.124600564910897,,, -962301741,2933846,1048898,school,1,False,1,82,23,120287717,home,,16,SHARED2FREE,-2.1298532205579024,,, 
-962302017,2933847,1048898,othdiscr,1,True,1,165,82,120287752,othdiscr,,7,WALK_TRANSIT,1.3340675832011153,1238.0,1559.0,cheapest -962302021,2933847,1048898,othdiscr,1,False,2,578,165,120287752,eatout,16.449525883467185,12,WALK_TRANSIT,3.482863279857927,3.0,1238.0,shortest -962302022,2933847,1048898,othdiscr,2,False,2,82,578,120287752,home,,12,WALK_TRANSIT,1.1970215357429652,1559.0,1500.0,fastest -962302457,2933848,1048898,work,1,True,1,10,82,120287807,work,,6,WALK_TRANSIT,1.075493937379867,1333.0,1559.0,fastest -962302461,2933848,1048898,work,1,False,1,82,10,120287807,home,,18,TAXI,1.0317861580544103,,, -1055052265,3216622,1148260,univ,1,True,1,582,584,131881533,univ,,14,TAXI,0.8904812765761871,,, -1055052269,3216622,1148260,univ,1,False,1,584,582,131881533,home,,15,WALK,0.8480171082358124,,, +person_id,household_id,primary_purpose,trip_num,outbound,trip_count,destination,origin,tour_id,purpose,destination_logsum,depart,trip_mode,mode_choice_logsum,atap,btap,path_set,trip_id +33146,12593,work,1,True,3,879,560,1359025,escort,11.70885284520791,5,DRIVEALONEFREE,-0.3548746577733072,,,,10872201 +33146,12593,work,2,True,3,989,879,1359025,escort,12.022433281644073,7,DRIVEALONEFREE,-0.361684345054043,,,,10872202 +33146,12593,work,3,True,3,1100,989,1359025,work,,7,DRIVEALONEFREE,0.1801428827020189,,,,10872203 +33146,12593,work,1,False,1,560,1100,1359025,home,,17,DRIVEALONEFREE,-0.368339088809198,,,,10872205 +33147,12593,work,1,True,1,1070,560,1359066,work,,8,WALK_TRANSIT,2.7167630046939766,1500.0,1558.0,shortest,10872529 +33147,12593,work,1,False,2,909,1070,1359066,escort,18.76869362022653,15,SHARED2FREE,1.3303021092897034,,,,10872533 +33147,12593,work,2,False,2,560,909,1359066,home,,15,SHARED2FREE,1.1209924956902448,,,,10872534 +36454,13797,shopping,1,True,1,725,580,1494647,shopping,,13,DRIVEALONEFREE,0.1717448995480592,,,,11957177 +36454,13797,shopping,1,False,2,803,725,1494647,shopping,12.016029658018422,17,TAXI,0.0352267728440656,,,,11957181 +36454,13797,shopping,2,False,2,580,803,1494647,home,,17,DRIVEALONEFREE,0.07359791064631,,,,11957182 +36454,13797,shopping,1,True,2,687,580,1494648,escort,13.593599471819292,18,WALK,0.7788942047635835,,,,11957185 +36454,13797,shopping,2,True,2,729,687,1494648,shopping,,18,DRIVEALONEFREE,0.489210171281661,,,,11957186 +36454,13797,shopping,1,False,1,580,729,1494648,home,,18,TAXI,0.2313076964301194,,,,11957189 +36455,13797,othdiscr,1,True,1,623,580,1494680,othdiscr,,16,DRIVEALONEFREE,0.7744647479734814,,,,11957441 +36455,13797,othdiscr,1,False,3,784,623,1494680,othmaint,15.224351954248831,21,SHARED2FREE,0.6723499661375587,,,,11957445 +36455,13797,othdiscr,2,False,3,687,784,1494680,social,14.665238994089853,21,SHARED2FREE,0.8916379345159234,,,,11957446 +36455,13797,othdiscr,3,False,3,580,687,1494680,home,,21,WALK,1.6583917720273251,,,,11957447 +36455,13797,work,1,True,1,562,580,1494694,work,,8,WALK,1.0769131864309256,,,,11957553 +36455,13797,work,1,False,1,580,562,1494694,home,,12,SHARED3FREE,1.1126409173456748,,,,11957557 +41705,15777,eatout,1,True,1,1070,645,1709911,eatout,,9,WALK_TRANSIT,3.0405322129710664,1500.0,1611.0,fastest,13679289 +41705,15777,eatout,1,False,1,645,1070,1709911,home,,15,TAXI,1.0589757414053866,,,,13679293 +41706,15777,atwork,1,True,1,989,854,1709950,atwork,,12,WALK_TRANSIT,1.744638602675838,1618.0,1748.0,fastest,13679601 +41706,15777,atwork,1,False,1,854,989,1709950,work,,12,SHARED2FREE,1.1051431249206285,,,,13679605 +41706,15777,work,1,True,1,854,645,1709985,work,,8,SHARED2FREE,0.484767983512682,,,,13679881 
+41706,15777,work,1,False,2,1080,854,1709985,escort,14.795756318103102,18,SHARED2FREE,0.4510582957043532,,,,13679885 +41706,15777,work,2,False,2,645,1080,1709985,home,,19,SHARED2FREE,0.4925057974553318,,,,13679886 +50035,18261,eatout,1,True,2,897,757,2051448,shopping,9.84722541528066,16,SHARED3FREE,-0.8214696655451806,,,,16411585 +50035,18261,eatout,2,True,2,1070,897,2051448,eatout,,17,WALK,0.1729919081975451,,,,16411586 +50035,18261,eatout,1,False,1,757,1070,2051448,home,,18,SHARED3FREE,-1.5064572485047776,,,,16411589 +50035,18261,school,1,True,1,919,757,2051466,school,,7,SHARED3FREE,0.7485359100139714,,,,16411729 +50035,18261,school,1,False,1,757,919,2051466,home,,13,WALK_TRANSIT,0.8841733805272283,1748.0,1618.0,fastest,16411733 +50035,18261,shopping,1,True,1,996,757,2051468,shopping,,19,DRIVEALONEFREE,-0.4672462178894699,,,,16411745 +50035,18261,shopping,1,False,1,757,996,2051468,home,,19,DRIVEALONEFREE,-0.4597066008562396,,,,16411749 +50036,18261,othmaint,1,True,1,877,757,2051504,othmaint,,7,SHARED2FREE,0.8103017184288349,,,,16412033 +50036,18261,othmaint,1,False,1,757,877,2051504,home,,14,SHARED2FREE,0.7939513107930013,,,,16412037 +50037,18261,work,1,True,1,1070,757,2051556,work,,7,SHARED2FREE,2.569476979448462,,,,16412449 +50037,18261,work,1,False,1,757,1070,2051556,home,,16,WALK_TRANSIT,1.0586461337073116,1651.0,1500.0,fastest,16412453 +55338,19758,school,1,True,1,699,794,2268889,school,,7,WALK_TRANSIT,1.2437700951912316,1608.0,1652.0,cheapest,18151113 +55338,19758,school,1,False,1,794,699,2268889,home,,10,SHARED2FREE,0.9734243621086656,,,,18151117 +55339,19758,work,1,True,1,762,794,2268938,work,,5,DRIVEALONEFREE,-0.0975185793139585,,,,18151505 +55339,19758,work,1,False,1,794,762,2268938,home,,16,DRIVEALONEFREE,-0.0075877722519422,,,,18151509 +57897,20552,work,1,True,1,1070,829,2373816,work,,7,WALK_TRANSIT,2.167693637863771,1500.0,1754.0,fastest,18990529 +57897,20552,work,1,False,1,829,1070,2373816,home,,20,SHARED2FREE,0.8134213565684749,,,,18990533 +57898,20552,atwork,1,True,2,948,948,2373818,othmaint,7.505984009752515,11,WALK,-0.4339456610259058,,,,18990545 +57898,20552,atwork,2,True,2,948,948,2373818,atwork,,11,WALK,-0.4339456610259058,,,,18990546 +57898,20552,atwork,1,False,1,948,948,2373818,work,,13,WALK,-0.4339456610259058,,,,18990549 +57898,20552,work,1,True,1,948,829,2373857,work,,11,DRIVEALONEFREE,-0.4249793219573366,,,,18990857 +57898,20552,work,1,False,2,739,948,2373857,escort,11.536722530937924,17,DRIVEALONEFREE,-0.4351552970051177,,,,18990861 +57898,20552,work,2,False,2,829,739,2373857,home,,17,DRIVEALONEFREE,-0.0991454272062134,,,,18990862 +57899,20552,work,1,True,1,687,829,2373898,work,,7,WALK,0.5220452718494769,,,,18991185 +57899,20552,work,1,False,1,829,687,2373898,home,,17,WALK,0.6195549538462863,,,,18991189 +57901,20552,work,1,True,1,708,829,2373980,work,,6,DRIVEALONEFREE,-0.2156500818117738,,,,18991841 +57901,20552,work,1,False,1,829,708,2373980,home,,17,DRIVEALONEFREE,-0.8860210329051176,,,,18991845 +57901,20552,work,1,True,2,687,829,2373981,othmaint,13.312507041737812,19,WALK,1.203803918419165,,,,18991849 +57901,20552,work,2,True,2,708,687,2373981,work,,19,SHARED2FREE,0.5575710262749435,,,,18991850 +57901,20552,work,1,False,1,829,708,2373981,home,,19,DRIVEALONEFREE,0.2682235453386703,,,,18991853 +62531,21869,school,1,True,1,938,900,2563802,school,,20,TAXI,0.9896882699261176,,,,20510417 +62531,21869,school,1,False,1,900,938,2563802,home,,20,SHARED3FREE,0.9332356383119612,,,,20510421 
+62532,21869,escort,1,True,1,647,900,2563821,escort,,6,SHARED2FREE,0.335383128230131,,,,20510569 +62532,21869,escort,1,False,1,900,647,2563821,home,,7,DRIVEALONEFREE,0.3214092822815929,,,,20510573 +62533,21869,escort,1,True,1,695,900,2563862,escort,,5,SHARED3FREE,0.7146320921116115,,,,20510897 +62533,21869,escort,1,False,4,996,695,2563862,shopping,13.9010789936427,6,SHARED3FREE,0.8524136187385734,,,,20510901 +62533,21869,escort,2,False,4,565,996,2563862,eatout,14.39637292108163,6,SHARED2FREE,0.5414028775934869,,,,20510902 +62533,21869,escort,3,False,4,1099,565,2563862,escort,14.873416692380914,6,SHARED3FREE,0.5359149885298558,,,,20510903 +62533,21869,escort,4,False,4,900,1099,2563862,home,,6,SHARED2FREE,0.8757485870243559,,,,20510904 +62533,21869,escort,1,True,1,518,900,2563863,escort,,11,SHARED2FREE,0.403734784317824,,,,20510905 +62533,21869,escort,1,False,1,900,518,2563863,home,,11,SHARED3FREE,0.4094721213082503,,,,20510909 +62533,21869,escort,1,True,1,844,900,2563864,escort,,14,SHARED2FREE,0.2145990810721203,,,,20510913 +62533,21869,escort,1,False,1,900,844,2563864,home,,14,SHARED2FREE,0.2175421651558981,,,,20510917 +62533,21869,othdiscr,1,True,1,1070,900,2563878,othdiscr,,11,WALK_TRANSIT,3.105130323985956,1500.0,73.0,fastest,20511025 +62533,21869,othdiscr,1,False,1,900,1070,2563878,home,,12,WALK_TRANSIT,1.157154797772182,73.0,1500.0,fastest,20511029 +62534,21869,school,1,True,1,793,900,2563925,school,,8,SHARED3FREE,0.6297085882553292,,,,20511401 +62534,21869,school,1,False,1,900,793,2563925,home,,9,SHARED3FREE,0.5963697573192646,,,,20511405 +67999,23619,escort,1,True,1,767,973,2787968,escort,,13,DRIVEALONEFREE,0.890462913926255,,,,22303745 +67999,23619,escort,1,False,3,1023,767,2787968,eatout,15.21145768299369,13,DRIVEALONEFREE,0.9380038576909148,,,,22303749 +67999,23619,escort,2,False,3,993,1023,2787968,escort,16.38782485475755,13,DRIVEALONEFREE,1.1641324346709314,,,,22303750 +67999,23619,escort,3,False,3,973,993,2787968,home,,13,SHARED3FREE,1.1940117377263124,,,,22303751 +67999,23619,social,1,True,1,988,973,2787995,social,,17,WALK,-0.5648561131494263,,,,22303961 +67999,23619,social,1,False,1,973,988,2787995,home,,20,WALK,-0.532826088016878,,,,22303965 +68000,23619,work,1,True,2,929,973,2788039,social,22.32425539722021,7,SHARED3FREE,1.2410412266329416,,,,22304313 +68000,23619,work,2,True,2,1070,929,2788039,work,,8,WALK_TRANSIT,3.2620308996092504,1500.0,1717.0,fastest,22304314 +68000,23619,work,1,False,1,973,1070,2788039,home,,21,WALK_TRANSIT,1.5243469382674963,1588.0,1500.0,fastest,22304317 +78977,26897,school,1,True,1,984,1081,3238088,school,,7,WALK,-0.4961040893656598,,,,25904705 +78977,26897,school,1,False,1,1081,984,3238088,home,,14,WALK,-0.4863570793073567,,,,25904709 +78979,26897,atwork,1,True,1,1070,897,3238143,atwork,,9,DRIVEALONEFREE,-0.0790579016887315,,,,25905145 +78979,26897,atwork,1,False,1,897,1070,3238143,work,,11,DRIVEALONEFREE,0.0896991164812165,,,,25905149 +78979,26897,work,1,True,1,897,1081,3238178,work,,7,DRIVEALONEFREE,0.0543705077473041,,,,25905425 +78979,26897,work,1,False,1,1081,897,3238178,home,,18,DRIVEALONEFREE,0.2237278343269858,,,,25905429 +1283602,435012,work,1,True,1,1078,521,52627721,work,,8,WALK_TRANSIT,1.8021163156603703,1584.0,1608.0,fastest,421021769 +1283602,435012,work,1,False,1,521,1078,52627721,home,,18,WALK_TRANSIT,0.6837263082555999,1238.0,1584.0,shortest,421021773 +1283868,435278,atwork,1,True,1,1070,1070,52638588,atwork,,12,WALK,2.4750620100011926,,,,421108705 
+1283868,435278,atwork,1,False,1,1070,1070,52638588,work,,12,WALK,2.4750620100011926,,,,421108709 +1283868,435278,eatout,1,True,1,1070,537,52638594,eatout,,18,WALK_TRANSIT,3.4599905231494112,1500.0,1558.0,shortest,421108753 +1283868,435278,eatout,1,False,2,1023,1070,52638594,shopping,16.211897411935006,21,WALK_TRANSIT,1.894666699296724,1664.0,1500.0,cheapest,421108757 +1283868,435278,eatout,2,False,2,537,1023,52638594,home,,21,SHARED2FREE,0.481405926458176,,,,421108758 +1283868,435278,work,1,True,1,1070,537,52638627,work,,9,WALK_TRANSIT,2.556266300862197,1500.0,1558.0,cheapest,421109017 +1283868,435278,work,1,False,1,537,1070,52638627,home,,18,SHARED3FREE,0.9987470907812958,,,,421109021 +1283946,435356,work,1,True,1,1070,523,52641825,work,,7,WALK_TRANSIT,2.564484302870523,1500.0,1604.0,shortest,421134601 +1283946,435356,work,1,False,1,523,1070,52641825,home,,19,TNC_SINGLE,1.0882426323388463,,,,421134605 +1284598,436008,work,1,True,1,553,562,52668557,work,,9,WALK,0.3477587678510122,,,,421348457 +1284598,436008,work,1,False,1,562,553,52668557,home,,19,WALK,0.3521744962789443,,,,421348461 +1286215,437625,atwork,1,True,1,1077,606,52734819,atwork,,10,WALK_TRANSIT,1.081956268867972,1500.0,1562.0,cheapest,421878553 +1286215,437625,atwork,1,False,1,606,1077,52734819,work,,13,WALK_TRANSIT,1.6200392862281745,1562.0,1500.0,fastest,421878557 +1286215,437625,work,1,True,1,606,656,52734854,work,,8,DRIVEALONEFREE,0.1102478757651215,,,,421878833 +1286215,437625,work,1,False,3,934,606,52734854,othmaint,11.66765616556947,17,DRIVEALONEFREE,-0.3317318054812608,,,,421878837 +1286215,437625,work,2,False,3,730,934,52734854,othmaint,11.283765093022993,19,DRIVEALONEFREE,-0.1643308499157778,,,,421878838 +1286215,437625,work,3,False,3,656,730,52734854,home,,19,DRIVEALONEFREE,0.0934059914274113,,,,421878839 +1290184,441594,atwork,1,True,1,732,977,52897544,atwork,,11,WALK,2.454959930326644,,,,423180353 +1290184,441594,atwork,1,False,1,977,732,52897544,work,,11,SHARED2FREE,2.474151530078552,,,,423180357 +1290184,441594,eatout,1,True,1,949,1096,52897550,eatout,,21,SHARED2FREE,0.0292772650529429,,,,423180401 +1290184,441594,eatout,1,False,2,1070,949,52897550,othmaint,13.725541816697769,21,DRIVEALONEFREE,-0.5109700761493449,,,,423180405 +1290184,441594,eatout,2,False,2,1096,1070,52897550,home,,21,WALK,1.478188695251212,,,,423180406 +1290184,441594,othdiscr,1,True,1,684,1096,52897569,othdiscr,,5,DRIVEALONEFREE,-0.0552559508415055,,,,423180553 +1290184,441594,othdiscr,1,False,1,1096,684,52897569,home,,7,TAXI,-0.0112939168809909,,,,423180557 +1290184,441594,othdiscr,1,True,1,762,1096,52897570,othdiscr,,21,WALK_TRANSIT,1.6714757713025346,1748.0,1516.0,fastest,423180561 +1290184,441594,othdiscr,1,False,1,1096,762,52897570,home,,22,SHARED2FREE,0.7128678537533746,,,,423180565 +1290184,441594,othdiscr,1,True,1,725,1096,52897571,othdiscr,,23,WALK,-3.2668547335399403,,,,423180569 +1290184,441594,othdiscr,1,False,1,1096,725,52897571,home,,23,WALK,-3.2660886555615467,,,,423180573 +1290184,441594,work,1,True,1,977,1096,52897583,work,,9,DRIVEALONEFREE,-0.1968641598702008,,,,423180665 +1290184,441594,work,1,False,2,996,977,52897583,shopping,10.57412967355008,20,WALK,0.2087211377734316,,,,423180669 +1290184,441594,work,2,False,2,1096,996,52897583,home,,21,DRIVEALONEFREE,-0.1682790295025423,,,,423180670 +1290626,442036,work,1,True,2,1070,1100,52915705,work,22.736072255128843,8,WALK_TRANSIT,2.680723362376414,1500.0,1584.0,cheapest,423325641 +1290626,442036,work,2,True,2,1070,1070,52915705,work,,9,TAXI,2.982966369402809,,,,423325642 
+1290626,442036,work,1,False,1,1100,1070,52915705,home,,18,WALK_TRANSIT,1.487658539961561,1584.0,1500.0,shortest,423325645 +1862905,721960,othdiscr,1,True,1,986,960,76379130,othdiscr,,15,WALK_TRANSIT,1.2392000497595896,1618.0,1762.0,shortest,611033041 +1862905,721960,othdiscr,1,False,1,960,986,76379130,home,,21,TAXI,1.030043298269935,,,,611033045 +1862906,721960,othdiscr,1,True,1,992,960,76379171,othdiscr,,11,SHARED3FREE,0.8858713264739554,,,,611033369 +1862906,721960,othdiscr,1,False,2,551,992,76379171,eatout,14.365988124910029,16,DRIVEALONEFREE,0.6551993708892542,,,,611033373 +1862906,721960,othdiscr,2,False,2,960,551,76379171,home,,16,SHARED2FREE,0.5117735347645973,,,,611033374 +1974306,760593,othdiscr,1,True,1,564,509,80946571,othdiscr,,8,WALK,0.1247752862685705,,,,647572569 +1974306,760593,othdiscr,1,False,1,509,564,80946571,home,,16,WALK,0.116845013634565,,,,647572573 +1974307,760593,atwork,1,True,1,913,739,80946591,atwork,,12,SHARED3FREE,2.326963701825421,,,,647572729 +1974307,760593,atwork,1,False,1,739,913,80946591,work,,13,SHARED3FREE,2.32846863119764,,,,647572733 +1974307,760593,work,1,True,2,746,509,80946626,shopping,17.72989652110887,9,WALK_TRANSIT,1.3491672588128374,1748.0,1603.0,shortest,647573009 +1974307,760593,work,2,True,2,739,746,80946626,work,,10,WALK_TRANSIT,1.938843261360648,1662.0,1748.0,shortest,647573010 +1974307,760593,work,1,False,2,560,739,80946626,work,18.21874805037896,17,WALK_TRANSIT,1.6522805849156224,1558.0,1748.0,shortest,647573013 +1974307,760593,work,2,False,2,509,560,80946626,home,,18,SHARED3FREE,1.442812477343081,,,,647573014 +1974308,760593,escort,1,True,1,716,509,80946637,escort,,5,SHARED3FREE,0.8908384736539703,,,,647573097 +1974308,760593,escort,1,False,1,509,716,80946637,home,,5,SHARED3FREE,0.8921116412562625,,,,647573101 +1976791,761445,escort,1,True,1,908,613,81048440,escort,,13,SHARED2FREE,0.431874605975145,,,,648387521 +1976791,761445,escort,1,False,1,613,908,81048440,home,,13,SHARED2FREE,0.4276938639754877,,,,648387525 +1976792,761445,atwork,1,True,1,517,517,81048476,atwork,,9,WALK,3.011532825794776,,,,648387809 +1976792,761445,atwork,1,False,1,517,517,81048476,work,,9,WALK,3.011532825794776,,,,648387813 +1976792,761445,eatout,1,True,1,648,613,81048478,eatout,,13,DRIVEALONEFREE,0.4724646811902892,,,,648387825 +1976792,761445,eatout,1,False,1,613,648,81048478,home,,19,TAXI,0.5571365851477499,,,,648387829 +1976792,761445,work,1,True,1,517,613,81048511,work,,5,TAXI,1.5969608894805951,,,,648388089 +1976792,761445,work,1,False,1,613,517,81048511,home,,12,SHARED2FREE,1.19217144041821,,,,648388093 +1978788,762159,social,1,True,1,763,961,81130344,social,,8,SHARED2FREE,0.3356357191681573,,,,649042753 +1978788,762159,social,1,False,1,961,763,81130344,home,,20,SHARED2FREE,0.3505516593935055,,,,649042757 +1978790,762159,escort,1,True,1,992,961,81130399,escort,,8,SHARED2FREE,0.5305214939922914,,,,649043193 +1978790,762159,escort,1,False,1,961,992,81130399,home,,8,SHARED2FREE,0.5188517472186277,,,,649043197 +1978790,762159,work,1,True,1,672,961,81130429,work,,8,DRIVEALONEFREE,-0.3341036694423639,,,,649043433 +1978790,762159,work,1,False,1,961,672,81130429,home,,17,DRIVEALONEFREE,-0.3318966430958522,,,,649043437 +1978791,762159,work,1,True,1,1070,961,81130470,work,,7,WALK_TRANSIT,2.58470148067105,1500.0,1762.0,fastest,649043761 +1978791,762159,work,1,False,1,961,1070,81130470,home,,17,TAXI,1.0619367021498107,,,,649043765 +2498047,922602,school,1,True,1,865,730,102419958,school,,9,TAXI,1.1163475417962474,,,,819359665 
+2498047,922602,school,1,False,1,730,865,102419958,home,,16,SHARED2FREE,1.8016199836098328,,,,819359669 +2498048,922602,work,1,True,1,735,730,102420007,work,,7,SHARED3FREE,1.5809126110190068,,,,819360057 +2498048,922602,work,1,False,1,730,735,102420007,home,,16,WALK_TRANSIT,1.880736952455109,1612.0,1592.0,shortest,819360061 +2498049,922602,work,1,True,1,763,730,102420048,work,,6,WALK,-0.7841986127882443,,,,819360385 +2498049,922602,work,1,False,1,730,763,102420048,home,,16,WALK,-0.8403247829423018,,,,819360389 +2622192,952720,school,1,True,1,995,1025,107509903,school,,7,TNC_SINGLE,1.6419964172015382,,,,860079225 +2622192,952720,school,1,False,1,1025,995,107509903,home,,14,SHARED3FREE,1.5389530621520078,,,,860079229 +2622193,952720,escort,1,True,1,773,1025,107509922,escort,,6,SHARED3FREE,0.9453815876415104,,,,860079377 +2622193,952720,escort,1,False,1,1025,773,107509922,home,,6,SHARED2FREE,0.9690872179986346,,,,860079381 +2622193,952720,othmaint,1,True,1,550,1025,107509941,othmaint,,8,TAXI,1.500097483373683,,,,860079529 +2622193,952720,othmaint,1,False,1,1025,550,107509941,home,,15,WALK_TRANSIT,1.504929950475204,1666.0,1238.0,shortest,860079533 +2622194,952720,shopping,1,True,1,989,1025,107509987,shopping,,9,WALK,1.6280881165461685,,,,860079897 +2622194,952720,shopping,1,False,1,1025,989,107509987,home,,11,SHARED3FREE,1.5894514263370938,,,,860079901 +2622195,952720,work,1,True,1,1021,1025,107510034,work,,8,DRIVEALONEFREE,0.1307608725915786,,,,860080273 +2622195,952720,work,1,False,1,1025,1021,107510034,home,,17,DRIVEALONEFREE,0.1511979688414838,,,,860080277 +2844887,1028031,work,1,True,1,845,846,116640406,work,,11,WALK,1.405897637665538,,,,933123249 +2844887,1028031,work,1,False,1,846,845,116640406,home,,23,SHARED2FREE,1.3490304072755472,,,,933123253 +2933845,1048898,school,1,True,1,666,574,120287676,school,,12,WALK,-1.1062441750768537,,,,962301409 +2933845,1048898,school,1,False,1,574,666,120287676,home,,21,WALK,-1.227339677708164,,,,962301413 +2933846,1048898,school,1,True,1,515,574,120287717,school,,8,SHARED2FREE,-2.095040038174043,,,,962301737 +2933846,1048898,school,1,False,1,574,515,120287717,home,,16,SHARED2FREE,-2.102822435414125,,,,962301741 +2933847,1048898,othdiscr,1,True,1,657,574,120287752,othdiscr,,7,WALK_TRANSIT,1.341656840196758,1238.0,1559.0,cheapest,962302017 +2933847,1048898,othdiscr,1,False,2,1070,657,120287752,eatout,16.215971752056717,12,WALK_TRANSIT,3.482863279857927,3.0,1238.0,shortest,962302021 +2933847,1048898,othdiscr,2,False,2,574,1070,120287752,home,,12,WALK_TRANSIT,0.9522919082686396,1559.0,1500.0,fastest,962302022 +2933848,1048898,work,1,True,1,502,574,120287807,work,,6,WALK_TRANSIT,1.0919479644277714,1333.0,1559.0,fastest,962302457 +2933848,1048898,work,1,False,1,574,502,120287807,home,,18,WALK_TRANSIT,1.0565178398416113,1559.0,1333.0,shortest,962302461 +3216622,1148260,univ,1,True,1,1074,1076,131881533,univ,,14,TAXI,0.8326272536348395,,,,1055052265 +3216622,1148260,univ,1,False,1,1076,1074,131881533,home,,15,WALK,0.7538832012716944,,,,1055052269 diff --git a/activitysim/examples/placeholder_sandag/test/test_sandag.py b/activitysim/examples/placeholder_sandag/test/test_sandag.py index 8619b8a54b..d354ca1913 100644 --- a/activitysim/examples/placeholder_sandag/test/test_sandag.py +++ b/activitysim/examples/placeholder_sandag/test/test_sandag.py @@ -1,21 +1,18 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
import os import shutil import subprocess import sys +from pathlib import Path import pandas as pd -import pandas.testing as pdt import pkg_resources import pytest -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import configuration, test, workflow def example_path(dirname): @@ -67,7 +64,7 @@ def regress(zone): test_path(f"regress/final_{zone}_zone_tours_last_run.csv"), index=False ) print("regress tours") - pdt.assert_frame_equal( + test.assert_frame_substantively_equal( tours_df, regress_tours_df, rtol=1e-03, check_dtype=False ) @@ -87,7 +84,7 @@ def regress(zone): test_path(f"regress/final_{zone}_zone_trips_last_run.csv"), index=False ) print("regress trips") - pdt.assert_frame_equal( + test.assert_frame_substantively_equal( trips_df, regress_trips_df, rtol=1e-03, check_dtype=False ) @@ -110,7 +107,11 @@ def regress(zone): if not c.startswith("_original_") ] ] - pdt.assert_frame_equal(final_accessibility_df, regress_accessibility_df) + test.assert_frame_substantively_equal( + final_accessibility_df, + regress_accessibility_df, + check_dtype=False, + ) # run test file_path = os.path.join(os.path.dirname(__file__), "simulation.py") @@ -219,6 +220,94 @@ def test_3_zone_sharrow(data): run_test(zone="3", multiprocess=True, sharrow=True) +EXPECTED_MODELS_3_ZONE = [ + "initialize_landuse", + "initialize_households", + "compute_accessibility", + "initialize_los", + "initialize_tvpb", + "school_location", + "workplace_location", + "auto_ownership_simulate", + "free_parking", + "cdap_simulate", + "mandatory_tour_frequency", + "mandatory_tour_scheduling", + "joint_tour_frequency", + "joint_tour_composition", + "joint_tour_participation", + "joint_tour_destination", + "joint_tour_scheduling", + "non_mandatory_tour_frequency", + "non_mandatory_tour_destination", + "non_mandatory_tour_scheduling", + "tour_mode_choice_simulate", + "atwork_subtour_frequency", + "atwork_subtour_destination", + "atwork_subtour_scheduling", + "atwork_subtour_mode_choice", + "stop_frequency", + "trip_purpose", + "trip_destination", + "trip_purpose_and_destination", + "trip_scheduling", + "trip_mode_choice", + "write_data_dictionary", + "track_skim_usage", + "write_trip_matrices", + "write_tables", +] + + +@test.run_if_exists("placeholder_sandag_3_zone_reference_pipeline.zip") +def test_3_zone_progressive(): + import activitysim.abm # register components + + state = workflow.create_example( + "placeholder_sandag_3_zone", directory="/tmp/placeholder_sandag_3_zone" + ) + + assert state.settings.models == EXPECTED_MODELS_3_ZONE + assert state.settings.chunk_size == 0 + assert state.settings.sharrow == False + + state.settings.recode_pipeline_columns = True + state.settings.treat_warnings_as_errors = False + state.settings.households_sample_size = 30 + state.settings.use_shadow_pricing = False + state.settings.want_dest_choice_sample_tables = False + state.settings.want_dest_choice_presampling = True + state.settings.cleanup_pipeline_after_run = True + state.settings.output_tables = configuration.OutputTables( + h5_store=False, + action="include", + prefix="final_3_zone_", + sort=True, + tables=["trips", "tours"], + ) + from activitysim.abm.tables.skims import network_los_preload + + state.get(network_los_preload) + state.network_settings.read_skim_cache = False + state.network_settings.write_skim_cache = False + state.network_settings.rebuild_tvpb_cache = False + + for step_name in EXPECTED_MODELS_3_ZONE: + 
state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent.joinpath( + "placeholder_sandag_3_zone_reference_pipeline.zip" + ), + checkpoint_name=step_name, + ) + except Exception: + print(f"> placeholder_sandag_3_zone {step_name}: ERROR") + raise + else: + print(f"> placeholder_sandag_3_zone {step_name}: ok") + + if __name__ == "__main__": # call each test explicitly so we get a pass/fail for each diff --git a/activitysim/examples/production_semcog/configs/logging.yaml b/activitysim/examples/production_semcog/configs/logging.yaml index 7b4f407957..3af1d619ff 100644 --- a/activitysim/examples/production_semcog/configs/logging.yaml +++ b/activitysim/examples/production_semcog/configs/logging.yaml @@ -1,53 +1,71 @@ -# Config for logging -# ------------------ -# See http://docs.python.org/2.7/library/logging.config.html#configuration-dictionary-schema - -logging: - version: 1 - disable_existing_loggers: true - - - # Configuring the default (root) logger is highly recommended - root: - level: NOTSET - handlers: [console] - - loggers: - - activitysim: - level: INFO - handlers: [console, logfile] - propagate: false - - orca: - level: WARN - handlers: [console, logfile] - propagate: false - - handlers: - - logfile: - class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] - mode: w - formatter: fileFormatter - level: NOTSET - - console: - class: logging.StreamHandler - stream: ext://sys.stdout - formatter: simpleFormatter - level: NOTSET - - formatters: - - simpleFormatter: - class: logging.Formatter - # format: '%(levelname)s - %(name)s - %(message)s' - format: '%(levelname)s - %(message)s' - datefmt: '%d/%m/%Y %H:%M:%S' - - fileFormatter: - class: logging.Formatter - format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' - datefmt: '%d/%m/%Y %H:%M:%S' +# Config for logging +# ------------------ +# See http://docs.python.org/2.7/library/logging.config.html#configuration-dictionary-schema + +logging: + version: 1 + disable_existing_loggers: true + + + # Configuring the default (root) logger is highly recommended + root: + level: NOTSET + handlers: [console] + + loggers: + + activitysim: + level: DEBUG + handlers: [console, logfile] + propagate: false + + orca: + level: WARN + handlers: [console, logfile] + propagate: false + + filelock: + level: WARN + + sharrow: + level: INFO + + blib2to3: + level: WARN + + black: + level: WARN + + handlers: + + logfile: + class: logging.FileHandler + filename: + get_log_file_path: 'activitysim.log' + mode: w + formatter: fileFormatter + level: NOTSET + + console: + class: logging.StreamHandler + stream: ext://sys.stdout + formatter: elapsedFormatter + level: NOTSET + + formatters: + + simpleFormatter: + class: logging.Formatter + # format: '%(levelname)s - %(name)s - %(message)s' + format: '%(levelname)s - %(message)s' + datefmt: '%d/%m/%Y %H:%M:%S' + + fileFormatter: + class: logging.Formatter + format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' + datefmt: '%d/%m/%Y %H:%M:%S' + + elapsedFormatter: + (): activitysim.core.tracing.ElapsedTimeFormatter + format: '[{elapsedTime}] {levelname:s}: {message:s}' + style: '{' diff --git a/activitysim/examples/production_semcog/configs_mp/logging.yaml b/activitysim/examples/production_semcog/configs_mp/logging.yaml index afaea1bec1..d6bc764418 100644 --- a/activitysim/examples/production_semcog/configs_mp/logging.yaml +++ b/activitysim/examples/production_semcog/configs_mp/logging.yaml @@ -28,7 +28,8 @@ 
logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -37,21 +38,20 @@ logging: class: logging.StreamHandler stream: ext://sys.stdout formatter: simpleFormatter - #level: NOTSET - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatters: simpleFormatter: class: logging.Formatter - #format: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' - format: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [ - '%(processName)-10s %(levelname)s - %(name)s - %(message)s', - '%(levelname)s - %(name)s - %(message)s'] + format: + if_sub_task: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' + if_not_sub_task: '%(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' fileFormatter: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/examples/production_semcog/configs_mp/settings.yaml b/activitysim/examples/production_semcog/configs_mp/settings.yaml index 596474835e..b29d3d7145 100644 --- a/activitysim/examples/production_semcog/configs_mp/settings.yaml +++ b/activitysim/examples/production_semcog/configs_mp/settings.yaml @@ -1,4 +1,4 @@ -# Configs File with Sample Rate set by Model Runner +# Configs File with Sample Rate set by Model Runner inherit_settings: True # raise error if any sub-process fails without waiting for others to complete fail_fast: True @@ -68,7 +68,7 @@ models: - write_trip_matrices -multiprocess_steps: +multiprocess_steps: - name: mp_initialize begin: initialize_landuse - name: mp_accessibility @@ -77,7 +77,7 @@ multiprocess_steps: slice: tables: - accessibility - except: True + exclude: True - name: mp_households begin: school_location slice: diff --git a/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py b/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py index 3f39d14a93..ce71108c90 100644 --- a/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py +++ b/activitysim/examples/production_semcog/extensions/parking_location_choice_at_university.py @@ -1,15 +1,13 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
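Note on the logging changes above: the production_semcog logging configs drop the unsafe !!python/object/apply YAML tags in favor of plain keys (get_log_file_path, if_sub_task / if_not_sub_task) that are resolved after a safe load. The sketch below only illustrates that idea; the resolver name and signature are hypothetical and are not ActivitySim's actual loader.

import os

def resolve_logging_keys(node, log_dir, is_sub_task):
    """Hypothetical post-processing pass over a safe-loaded logging config."""
    if isinstance(node, dict):
        if set(node) == {"get_log_file_path"}:
            # declarative replacement for the old log_file_path python/object tag
            return os.path.join(log_dir, node["get_log_file_path"])
        if set(node) == {"if_sub_task", "if_not_sub_task"}:
            # declarative replacement for the old mp_tasks.if_sub_task tag
            return node["if_sub_task"] if is_sub_task else node["if_not_sub_task"]
        return {k: resolve_logging_keys(v, log_dir, is_sub_task) for k, v in node.items()}
    if isinstance(node, list):
        return [resolve_logging_keys(v, log_dir, is_sub_task) for v in node]
    return node

# usage sketch (assumed flow, not ActivitySim internals):
#   cfg = yaml.safe_load(open("logging.yaml"))["logging"]
#   logging.config.dictConfig(resolve_logging_keys(cfg, "output", is_sub_task=False))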
import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import logit +from activitysim.core import logit, los, tracing, workflow # from .util import estimation @@ -58,9 +56,13 @@ def closest_parking_zone_xwalk(univ_zones, parking_zones, network_los): return closest_parking_df -@inject.step() +@workflow.step def parking_location_choice_at_university( - trips, tours, land_use, network_los, chunk_size, trace_hh_id + state: workflow.State, + trips: pd.DataFrame, + tours: pd.DataFrame, + land_use: pd.DataFrame, + network_los: los.Network_LOS, ): """ This model selects a parking location for groups of trips that are on university campuses where @@ -72,7 +74,7 @@ def parking_location_choice_at_university( trace_label = "parking_location_choice_at_university" model_settings_file_name = "parking_location_choice_at_university.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) univ_codes_col = model_settings["LANDUSE_UNIV_CODE_COL_NAME"] univ_codes = model_settings["UNIV_CODES_THAT_REQUIRE_PARKING"] @@ -83,9 +85,7 @@ def parking_location_choice_at_university( parking_tour_modes = model_settings["TOUR_MODES_THAT_REQUIRE_PARKING"] nearest_lot_tour_purposes = model_settings["TOUR_PURPOSES_TO_NEAREST_LOT"] - trips = trips.to_frame() - tours = tours.to_frame() - land_use_df = land_use.to_frame() + land_use_df = land_use # initialize univ parking columns trips["parked_at_university"] = False @@ -149,7 +149,7 @@ def parking_location_choice_at_university( probs.set_index(tour_choosers[parking_tours].index, inplace=True) # making stable choices using ActivitySim's random number generator - choices, rands = logit.make_choices(probs) + choices, rands = logit.make_choices(state, probs) choices = choices.map(pd.Series(probs.columns)) tour_choosers.loc[parking_tours, "univ_parking_zone_id"] = choices @@ -179,8 +179,8 @@ def parking_location_choice_at_university( tours.index.isin(tour_choosers.index), "univ_parking_zone_id" ] = tour_choosers["univ_parking_zone_id"] - pipeline.replace_table("trips", trips) - pipeline.replace_table("tours", tours) + state.add_table("trips", trips) + state.add_table("tours", tours) tracing.print_summary( "parking_location_choice_at_university zones", @@ -188,5 +188,5 @@ def parking_location_choice_at_university( value_counts=True, ) - if trace_hh_id: - tracing.trace_df(tours, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(tours, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py b/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py index 3d51d60b84..1d6dcda6d3 100644 --- a/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py +++ b/activitysim/examples/production_semcog/extensions/stop_frequency_university_parking.py @@ -1,24 +1,23 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
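The parking extension above shows the general inject-to-workflow migration pattern in this diff: steps are declared with @workflow.step, receive an explicit state plus plain DataFrames (no .to_frame() calls), read settings through state.filesystem, and write results back with state.add_table. A condensed template of that pattern, with a placeholder step name, settings file, and column key:

from __future__ import annotations

import pandas as pd

from activitysim.core import workflow


@workflow.step
def example_semcog_step(state: workflow.State, trips: pd.DataFrame) -> None:
    # placeholder settings file name; real steps read their own yaml
    model_settings = state.filesystem.read_model_settings("example_step.yaml")

    # tables arrive as DataFrames, so they can be edited directly
    flag_col = model_settings.get("FLAG_COLUMN", "parked_at_university")  # placeholder key
    trips[flag_col] = False

    # write the edited table back through the state (replaces pipeline.replace_table)
    state.add_table("trips", trips)

    if state.settings.trace_hh_id:
        state.tracing.trace_df(trips, label="example_semcog_step", warn_if_empty=True)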
import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import tracing, workflow # from .util import estimation logger = logging.getLogger(__name__) -@inject.step() -def stop_frequency_university_parking(trips, tours, chunk_size, trace_hh_id): +@workflow.step +def stop_frequency_university_parking( + state: workflow.State, trips: pd.DataFrame, tours: pd.DataFrame +): """ This model inserts parking trips on drive tours that include university parking as determined in the parking_location_choice_at_university model. Parking trips are added to the trip table before @@ -31,19 +30,15 @@ def stop_frequency_university_parking(trips, tours, chunk_size, trace_hh_id): trace_label = "stop_frequency_university_parking" model_settings_file_name = "stop_frequency_university_parking.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) parking_name = model_settings["PARKING_TRIP_NAME"] - trips = trips.to_frame() - tours = tours.to_frame() - tours_with_parking = tours[tours["univ_parking_zone_id"].notna()] trip_choosers = trips[trips.tour_id.isin(tours_with_parking.index)] trips_without_parking = trips[~trips.tour_id.isin(tours_with_parking.index)] if len(trip_choosers) > 0: - trip_choosers = pd.merge( trip_choosers.reset_index(), tours_with_parking["univ_parking_zone_id"].reset_index(), @@ -192,11 +187,11 @@ def stop_frequency_university_parking(trips, tours, chunk_size, trace_hh_id): trips.set_index("trip_id", inplace=True, verify_integrity=True) - pipeline.replace_table("trips", trips) + state.add_table("trips", trips) # since new trips were added inbetween other trips on the tour, the trip_id's changed # resetting random number generator for trips... does this have unintended consequences? - pipeline.get_rn_generator().drop_channel("trips") - pipeline.get_rn_generator().add_channel("trips", trips) + state.get_rn_generator().drop_channel("trips") + state.get_rn_generator().add_channel("trips", trips) tracing.print_summary( "stop_frequency_university_parking trip purposes", @@ -204,5 +199,5 @@ def stop_frequency_university_parking(trips, tours, chunk_size, trace_hh_id): value_counts=True, ) - if trace_hh_id: - tracing.trace_df(trips, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(trips, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/production_semcog/extensions/university_location_zone_override.py b/activitysim/examples/production_semcog/extensions/university_location_zone_override.py index 1d22110cee..cc4354c418 100644 --- a/activitysim/examples/production_semcog/extensions/university_location_zone_override.py +++ b/activitysim/examples/production_semcog/extensions/university_location_zone_override.py @@ -1,15 +1,13 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
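When stop_frequency_university_parking above inserts parking trips, the trip index is rebuilt, so the random-number channel keyed on trip_id has to be re-registered before later models draw from it. A minimal sketch of that idiom, lifted from the step above (the function name and table contents are placeholders):

import pandas as pd

from activitysim.core import workflow


def publish_rebuilt_trips(state: workflow.State, trips: pd.DataFrame) -> None:
    # persist the rebuilt table, then re-key the "trips" random-number channel
    # so downstream models draw reproducible numbers for the new trip_ids
    state.add_table("trips", trips)
    rng = state.get_rn_generator()
    rng.drop_channel("trips")
    rng.add_channel("trips", trips)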
import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import logit +from activitysim.core import logit, tracing, workflow # from .util import estimation @@ -17,7 +15,11 @@ def resample_school_zones( - choosers, land_use, model_settings, col_to_override="school_zone_id" + state: workflow.State, + choosers: pd.DataFrame, + land_use: pd.DataFrame, + model_settings: dict, + col_to_override: str = "school_zone_id", ): """ Re-samples the university school zone based only on enrollment. Can apply to the original school @@ -86,16 +88,19 @@ def resample_school_zones( probs.set_index(choosers[choosers_to_override].index, inplace=True) # making stable choices using ActivitySim's random number generator - choices, rands = logit.make_choices(probs) + choices, rands = logit.make_choices(state, probs) choices = choices.map(pd.Series(probs.columns)) choosers.loc[choosers_to_override, "univ_parking_zone_id"] = choices return choosers -@inject.step() +@workflow.step def university_location_zone_override( - persons_merged, persons, land_use, chunk_size, trace_hh_id + state: workflow.State, + persons_merged: pd.DataFrame, + persons: pd.DataFrame, + land_use: pd.DataFrame, ): """ This model overrides the school taz for students attending large universities. New school tazs @@ -109,10 +114,10 @@ def university_location_zone_override( trace_label = "university_location_zone_override" model_settings_file_name = "university_location_zone_override.yaml" - choosers = persons.to_frame() - land_use_df = land_use.to_frame() + choosers = persons + land_use_df = land_use - univ_school_seg = config.read_model_settings("constants.yaml")[ + univ_school_seg = state.filesystem.read_model_settings("constants.yaml")[ "SCHOOL_SEGMENT_UNIV" ] choosers = choosers[ @@ -121,14 +126,13 @@ def university_location_zone_override( logger.info("Running %s for %d university students", trace_label, len(choosers)) - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) choosers = resample_school_zones( - choosers, land_use_df, model_settings, col_to_override="school_zone_id" + state, choosers, land_use_df, model_settings, col_to_override="school_zone_id" ) # Overriding school_zone_id in persons table - persons = persons.to_frame() persons.loc[persons.index.isin(choosers.index), "school_zone_id"] = choosers[ "school_zone_id" ].astype(int) @@ -140,7 +144,7 @@ def university_location_zone_override( persons.index.isin(choosers.index), original_zone_col_name ] = choosers[original_zone_col_name] - pipeline.replace_table("persons", persons) + state.add_table("persons", persons) tracing.print_summary( "university_location_zone_override choices", @@ -148,13 +152,16 @@ def university_location_zone_override( value_counts=True, ) - if trace_hh_id: - tracing.trace_df(persons, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(persons, label=trace_label, warn_if_empty=True) -@inject.step() +@workflow.step def trip_destination_univ_zone_override( - trips, tours, land_use, chunk_size, trace_hh_id + state: workflow.State, + trips: pd.DataFrame, + tours: pd.DataFrame, + land_use: pd.DataFrame, ): """ This model overrides the university trip destination zone for students attending large universities. 
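resample_school_zones above draws its zone choices through ActivitySim's own random-number stream via logit.make_choices(state, probs). A condensed illustration of that idiom; the probability table is assumed to already be row-normalized, one row per chooser and one column per candidate zone, as constructed in the helper above:

import pandas as pd

from activitysim.core import logit, workflow


def stable_zone_choices(state: workflow.State, probs: pd.DataFrame) -> pd.Series:
    # rands are unused here, mirroring the extensions above
    choices, rands = logit.make_choices(state, probs)
    # make_choices returns positional indices; map them back to the zone labels
    return choices.map(pd.Series(probs.columns))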
@@ -169,13 +176,13 @@ def trip_destination_univ_zone_override( trace_label = "trip_destination_univ_zone_override" model_settings_file_name = "university_location_zone_override.yaml" - model_settings = config.read_model_settings(model_settings_file_name) + model_settings = state.filesystem.read_model_settings(model_settings_file_name) univ_purpose = model_settings["TRIP_UNIVERSITY_PURPOSE"] tour_mode_override_dict = model_settings["TOUR_MODE_OVERRIDE_DICT"] - choosers = trips.to_frame() - land_use_df = land_use.to_frame() - tours = tours.to_frame() + choosers = trips.copy() # will edit choosers below adding temp "is_primary_trip" + # TODO do we really want a copy here? probably not + land_use_df = land_use # primary trips are outbound trips where the next trip is not outbound choosers["is_primary_trip"] = np.where( @@ -207,11 +214,10 @@ def trip_destination_univ_zone_override( logger.info("Running %s for %d university students", trace_label, len(choosers)) choosers = resample_school_zones( - choosers, land_use_df, model_settings, col_to_override="destination" + state, choosers, land_use_df, model_settings, col_to_override="destination" ) # Overriding school_zone_id in persons table - trips = trips.to_frame() trips.loc[trips.index.isin(choosers.index), "destination"] = choosers[ "destination" ].astype(int) @@ -235,8 +241,8 @@ def trip_destination_univ_zone_override( original_zone_col_name ] - pipeline.replace_table("trips", trips) - pipeline.replace_table("tours", tours) + state.add_table("trips", trips) + state.add_table("tours", tours) tracing.print_summary( "trip_destination_univ_zone_override for zones", @@ -244,5 +250,5 @@ def trip_destination_univ_zone_override( value_counts=True, ) - if trace_hh_id: - tracing.trace_df(trips, label=trace_label, warn_if_empty=True) + if state.settings.trace_hh_id: + state.tracing.trace_df(trips, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/production_semcog/test/test_semcog.py b/activitysim/examples/production_semcog/test/test_semcog.py index caeefcddbb..57e0bf124e 100644 --- a/activitysim/examples/production_semcog/test/test_semcog.py +++ b/activitysim/examples/production_semcog/test/test_semcog.py @@ -1,18 +1,14 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
import os import subprocess import pandas as pd -import pandas.testing as pdt import pkg_resources -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core.test._tools import assert_frame_substantively_equal def run_test_semcog(multiprocess=False): @@ -24,9 +20,13 @@ def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(): - regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) - final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + regress_trips_df = pd.read_csv( + test_path("regress/final_trips.csv"), dtype={"depart": int} + ) + final_trips_df = pd.read_csv( + test_path("output/final_trips.csv"), dtype={"depart": int} + ) + assert_frame_substantively_equal(final_trips_df, regress_trips_df) file_path = os.path.join(os.path.dirname(__file__), "../simulation.py") diff --git a/activitysim/examples/prototype_arc/configs/logging.yaml b/activitysim/examples/prototype_arc/configs/logging.yaml index 1db43e9982..8ebd99114f 100644 --- a/activitysim/examples/prototype_arc/configs/logging.yaml +++ b/activitysim/examples/prototype_arc/configs/logging.yaml @@ -40,7 +40,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/prototype_arc/configs/settings.yaml b/activitysim/examples/prototype_arc/configs/settings.yaml index aa1c297ac3..6ff25b94e0 100644 --- a/activitysim/examples/prototype_arc/configs/settings.yaml +++ b/activitysim/examples/prototype_arc/configs/settings.yaml @@ -150,7 +150,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_households begin: school_location num_processes: 5 diff --git a/activitysim/examples/prototype_arc/configs/settings_mp.yaml b/activitysim/examples/prototype_arc/configs/settings_mp.yaml index 5701b40290..40dc990efd 100644 --- a/activitysim/examples/prototype_arc/configs/settings_mp.yaml +++ b/activitysim/examples/prototype_arc/configs/settings_mp.yaml @@ -157,7 +157,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_households begin: school_location slice: diff --git a/activitysim/examples/prototype_arc/test/test_arc.py b/activitysim/examples/prototype_arc/test/test_arc.py index 5b35132496..aeb6a56155 100644 --- a/activitysim/examples/prototype_arc/test/test_arc.py +++ b/activitysim/examples/prototype_arc/test/test_arc.py @@ -1,3 +1,5 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
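test_semcog above tightens the regression comparison by forcing the depart column to int on both sides and switching from pandas' strict assert_frame_equal to the substantive-equality helper. The same shape, with placeholder file paths:

import pandas as pd

from activitysim.core.test import assert_frame_substantively_equal


def regress_trips(regress_csv: str, output_csv: str) -> None:
    # read both files with the same dtype so a float-vs-int depart column
    # does not produce a spurious failure
    regress_trips_df = pd.read_csv(regress_csv, dtype={"depart": int})
    final_trips_df = pd.read_csv(output_csv, dtype={"depart": int})
    assert_frame_substantively_equal(final_trips_df, regress_trips_df)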
import os @@ -8,12 +10,7 @@ import pandas.testing as pdt import pkg_resources -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core.test import assert_frame_substantively_equal def _test_arc(recode=False, sharrow=False): @@ -35,7 +32,7 @@ def regress(): # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + assert_frame_substantively_equal(final_trips_df, regress_trips_df) file_path = os.path.join(os.path.dirname(__file__), "simulation.py") diff --git a/activitysim/examples/prototype_marin/configs/logging.yaml b/activitysim/examples/prototype_marin/configs/logging.yaml index df20cf0c7e..9addd30706 100755 --- a/activitysim/examples/prototype_marin/configs/logging.yaml +++ b/activitysim/examples/prototype_marin/configs/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -51,4 +52,3 @@ logging: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/examples/prototype_marin/test/reference_pipeline.zip b/activitysim/examples/prototype_marin/test/reference_pipeline.zip new file mode 100644 index 0000000000..9342673dd2 Binary files /dev/null and b/activitysim/examples/prototype_marin/test/reference_pipeline.zip differ diff --git a/activitysim/examples/prototype_marin/test/reference_trace.tar.gz b/activitysim/examples/prototype_marin/test/reference_trace.tar.gz new file mode 100644 index 0000000000..70c8adc462 Binary files /dev/null and b/activitysim/examples/prototype_marin/test/reference_trace.tar.gz differ diff --git a/activitysim/examples/prototype_marin/test/test_marin.py b/activitysim/examples/prototype_marin/test/test_marin.py index 2f78de8868..4de5028f19 100644 --- a/activitysim/examples/prototype_marin/test/test_marin.py +++ b/activitysim/examples/prototype_marin/test/test_marin.py @@ -1,36 +1,38 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
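The new reference_pipeline.zip and reference_trace.tar.gz artifacts above feed the progressive tests: each model step runs one at a time, the pipeline is checked against the committed reference after every step, and the whole test is skipped when the archive is absent. A stripped-down sketch of that pattern, using only calls that appear in the tests in this diff; the step list, config paths, and archive name are placeholders:

from pathlib import Path

from activitysim.core import test, workflow

EXPECTED_MODELS = ["initialize_landuse", "initialize_households"]  # placeholder list


@test.run_if_exists("reference_pipeline.zip")
def test_example_progressive():
    import activitysim.abm  # noqa: F401 -- registers model components

    state = workflow.State.make_default(
        configs_dir=("configs",),  # placeholder paths
        data_dir=("data",),
        output_dir="output",
    )
    for step_name in EXPECTED_MODELS:
        state.run.by_name(step_name)
        state.checkpoint.check_against(
            Path(__file__).parent / "reference_pipeline.zip",
            checkpoint_name=step_name,
        )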
import os import subprocess +from pathlib import Path import pandas as pd import pandas.testing as pdt import pkg_resources -from activitysim.core import inject +from activitysim.core import test, workflow -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +def example_path(dirname): + resource = os.path.join("examples", "prototype_marin", dirname) + return pkg_resources.resource_filename("activitysim", resource) -def test_marin(): - def example_path(dirname): - resource = os.path.join("examples", "prototype_marin", dirname) - return pkg_resources.resource_filename("activitysim", resource) +def _test_path(dirname): + return os.path.join(os.path.dirname(__file__), dirname) - def test_path(dirname): - return os.path.join(os.path.dirname(__file__), dirname) +def test_marin(): def regress(): - regress_trips_df = pd.read_csv(test_path("regress/final_tours.csv")) - final_trips_df = pd.read_csv(test_path("output/final_tours.csv")) + regress_trips_df = pd.read_csv(_test_path("regress/final_tours.csv")) + final_trips_df = pd.read_csv(_test_path("output/final_tours.csv")) # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + test.assert_frame_substantively_equal( + final_trips_df, regress_trips_df, check_dtype=False + ) file_path = os.path.join(os.path.dirname(__file__), "simulation.py") @@ -41,13 +43,13 @@ def regress(): "-a", file_path, "-c", - test_path("configs"), + _test_path("configs"), "-c", example_path("configs"), "-d", example_path("data"), "-o", - test_path("output"), + _test_path("output"), ], check=True, ) @@ -55,6 +57,55 @@ def regress(): regress() -if __name__ == "__main__": +EXPECTED_MODELS = [ + "initialize_landuse", + "initialize_households", + "initialize_tours", + "initialize_los", + "initialize_tvpb", + "tour_mode_choice_simulate", + "write_data_dictionary", + "track_skim_usage", + "write_tables", +] + + +@test.run_if_exists("reference_pipeline.zip") +def test_marin_progressive(): + import activitysim.abm # register components + + state = workflow.State.make_default( + configs_dir=( + _test_path("configs"), + example_path("configs"), + ), + data_dir=(example_path("data"),), + output_dir=_test_path("output"), + ) + + assert state.settings.models == EXPECTED_MODELS + assert state.settings.chunk_size == 0 + assert state.settings.sharrow == False + + state.settings.trace_hh_id = 8268 + trace_validation_directory = Path(__file__).parent / "reference_trace.tar.gz" + if trace_validation_directory.exists(): + state.tracing.validation_directory = trace_validation_directory + for step_name in EXPECTED_MODELS: + state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent / "reference_pipeline.zip", + checkpoint_name=step_name, + ) + except Exception: + print(f"> MARIN {step_name}: ERROR") + raise + else: + print(f"> MARIN {step_name}: ok") + + +if __name__ == "__main__": test_marin() + test_marin_progressive() diff --git a/activitysim/examples/prototype_mtc/configs/logging.yaml b/activitysim/examples/prototype_mtc/configs/logging.yaml index 3b2851ddd8..8bf54a0ec7 100644 --- a/activitysim/examples/prototype_mtc/configs/logging.yaml +++ b/activitysim/examples/prototype_mtc/configs/logging.yaml @@ -40,7 +40,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path 
['activitysim.log'] + filename: + get_log_file_path: activitysim.log mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/prototype_mtc/configs/settings.yaml b/activitysim/examples/prototype_mtc/configs/settings.yaml index 6577b94b95..adaebdce5e 100644 --- a/activitysim/examples/prototype_mtc/configs/settings.yaml +++ b/activitysim/examples/prototype_mtc/configs/settings.yaml @@ -90,6 +90,7 @@ input_table_list: # convert input CSVs to HDF5 format and save to outputs directory # create_input_store: True +recode_pipeline_columns: True #input_store: ../output/input_data.h5 @@ -262,4 +263,3 @@ household_median_value_of_time: 2: 8.81 3: 10.44 4: 12.86 - diff --git a/activitysim/examples/prototype_mtc/configs_chunktrain/logging.yaml b/activitysim/examples/prototype_mtc/configs_chunktrain/logging.yaml index 74fc31defa..fb8385a041 100644 --- a/activitysim/examples/prototype_mtc/configs_chunktrain/logging.yaml +++ b/activitysim/examples/prototype_mtc/configs_chunktrain/logging.yaml @@ -44,7 +44,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -53,16 +54,17 @@ logging: class: logging.StreamHandler stream: ext://sys.stdout formatter: elapsedFormatter - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatters: simpleFormatter: class: logging.Formatter - #format: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' - format: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [ - '%(processName)-10s %(levelname)s - %(name)s - %(message)s', - '%(levelname)s - %(name)s - %(message)s'] + format: + if_sub_task: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' + if_not_sub_task: '%(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' fileFormatter: diff --git a/activitysim/examples/prototype_mtc/configs_chunktrain/settings.yaml b/activitysim/examples/prototype_mtc/configs_chunktrain/settings.yaml index ecd18ec9ea..aaa6fc139a 100644 --- a/activitysim/examples/prototype_mtc/configs_chunktrain/settings.yaml +++ b/activitysim/examples/prototype_mtc/configs_chunktrain/settings.yaml @@ -73,7 +73,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_households begin: school_location slice: @@ -82,4 +82,3 @@ multiprocess_steps: - persons - name: mp_summarize begin: write_data_dictionary - diff --git a/activitysim/examples/prototype_mtc/configs_mp/logging.yaml b/activitysim/examples/prototype_mtc/configs_mp/logging.yaml index 13e533abba..779a4cbab9 100644 --- a/activitysim/examples/prototype_mtc/configs_mp/logging.yaml +++ b/activitysim/examples/prototype_mtc/configs_mp/logging.yaml @@ -40,7 +40,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -48,7 +49,9 @@ logging: console: class: logging.StreamHandler stream: ext://sys.stdout - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatter: elapsedFormatter formatters: diff --git 
a/activitysim/examples/prototype_mtc/configs_mp/settings.yaml b/activitysim/examples/prototype_mtc/configs_mp/settings.yaml index 7246c5cf36..768be2fc3d 100644 --- a/activitysim/examples/prototype_mtc/configs_mp/settings.yaml +++ b/activitysim/examples/prototype_mtc/configs_mp/settings.yaml @@ -36,7 +36,7 @@ want_dest_choice_sample_tables: False # - tracing #trace_hh_id: -trace_od: +trace_od: [2,2] # to resume after last successful checkpoint, specify resume_after: _ #resume_after: trip_purpose_and_destination @@ -89,7 +89,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_households begin: school_location slice: @@ -98,4 +98,3 @@ multiprocess_steps: - persons - name: mp_summarize begin: write_data_dictionary - diff --git a/activitysim/examples/prototype_mtc/configs_production/logging.yaml b/activitysim/examples/prototype_mtc/configs_production/logging.yaml index 8939607c8f..bfc2ab4b5c 100644 --- a/activitysim/examples/prototype_mtc/configs_production/logging.yaml +++ b/activitysim/examples/prototype_mtc/configs_production/logging.yaml @@ -38,7 +38,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -47,16 +48,17 @@ logging: class: logging.StreamHandler stream: ext://sys.stdout formatter: elapsedFormatter - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatters: simpleFormatter: class: logging.Formatter - #format: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' - format: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [ - '%(processName)-10s %(levelname)s - %(name)s - %(message)s', - '%(levelname)s - %(name)s - %(message)s'] + format: + if_sub_task: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' + if_not_sub_task: '%(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' fileFormatter: @@ -68,4 +70,3 @@ logging: (): activitysim.core.tracing.ElapsedTimeFormatter format: '[{elapsedTime}] {levelname:s}: {message:s}' style: '{' - diff --git a/activitysim/examples/prototype_mtc/configs_production/settings.yaml b/activitysim/examples/prototype_mtc/configs_production/settings.yaml index 8b3f59cb57..0eb9def196 100644 --- a/activitysim/examples/prototype_mtc/configs_production/settings.yaml +++ b/activitysim/examples/prototype_mtc/configs_production/settings.yaml @@ -73,7 +73,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True + exclude: True - name: mp_households begin: school_location slice: @@ -82,4 +82,3 @@ multiprocess_steps: - persons - name: mp_summarize begin: write_data_dictionary - diff --git a/activitysim/examples/prototype_mtc/test/.gitignore b/activitysim/examples/prototype_mtc/test/.gitignore new file mode 100644 index 0000000000..c4c4ffc6aa --- /dev/null +++ b/activitysim/examples/prototype_mtc/test/.gitignore @@ -0,0 +1 @@ +*.zip diff --git a/activitysim/examples/prototype_mtc/test/test_mtc.py b/activitysim/examples/prototype_mtc/test/test_mtc.py index b79266f8fd..b72ac5c495 100644 --- a/activitysim/examples/prototype_mtc/test/test_mtc.py +++ b/activitysim/examples/prototype_mtc/test/test_mtc.py @@ -1,19 +1,17 @@ +from __future__ import annotations + # ActivitySim 
# See full license in LICENSE.txt. import os import subprocess import sys +from pathlib import Path import pandas as pd import pandas.testing as pdt import pkg_resources -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import test, workflow def run_test_mtc(multiprocess=False, chunkless=False, recode=False, sharrow=False): @@ -27,11 +25,7 @@ def test_path(dirname): def regress(): regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) - - # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, - # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum - # compare_cols = [] - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + test.assert_frame_substantively_equal(final_trips_df, regress_trips_df) file_path = os.path.join(os.path.dirname(__file__), "simulation.py") @@ -121,8 +115,69 @@ def test_mtc_sharrow(): run_test_mtc(sharrow=True) -if __name__ == "__main__": +EXPECTED_MODELS = [ + "initialize_landuse", + "initialize_households", + "compute_accessibility", + "school_location", + "workplace_location", + "auto_ownership_simulate", + "free_parking", + "cdap_simulate", + "mandatory_tour_frequency", + "mandatory_tour_scheduling", + "joint_tour_frequency", + "joint_tour_composition", + "joint_tour_participation", + "joint_tour_destination", + "joint_tour_scheduling", + "non_mandatory_tour_frequency", + "non_mandatory_tour_destination", + "non_mandatory_tour_scheduling", + "tour_mode_choice_simulate", + "atwork_subtour_frequency", + "atwork_subtour_destination", + "atwork_subtour_scheduling", + "atwork_subtour_mode_choice", + "stop_frequency", + "trip_purpose", + "trip_destination", + "trip_purpose_and_destination", + "trip_scheduling", + "trip_mode_choice", + "write_data_dictionary", + "track_skim_usage", + "write_trip_matrices", + "write_tables", + "summarize", +] + + +@test.run_if_exists("prototype_mtc_reference_pipeline.zip") +def test_mtc_progressive(): + import activitysim.abm # register components + + state = workflow.create_example("prototype_mtc", temp=True) + + assert state.settings.models == EXPECTED_MODELS + assert state.settings.chunk_size == 0 + assert state.settings.sharrow == False + + for step_name in EXPECTED_MODELS: + state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent.joinpath("prototype_mtc_reference_pipeline.zip"), + checkpoint_name=step_name, + ) + except Exception: + print(f"> prototype_mtc {step_name}: ERROR") + raise + else: + print(f"> prototype_mtc {step_name}: ok") + +if __name__ == "__main__": run_test_mtc(multiprocess=False) run_test_mtc(multiprocess=True) run_test_mtc(multiprocess=False, chunkless=True) diff --git a/activitysim/examples/prototype_mtc_extended/configs_mp/logging.yaml b/activitysim/examples/prototype_mtc_extended/configs_mp/logging.yaml index e932009c5d..64333d1414 100644 --- a/activitysim/examples/prototype_mtc_extended/configs_mp/logging.yaml +++ b/activitysim/examples/prototype_mtc_extended/configs_mp/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -37,21 +38,20 @@ logging: class: logging.StreamHandler stream: ext://sys.stdout formatter: 
simpleFormatter - #level: NOTSET - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatters: simpleFormatter: class: logging.Formatter - #format: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' - format: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [ - '%(processName)-10s %(levelname)s - %(name)s - %(message)s', - '%(levelname)s - %(name)s - %(message)s'] + format: + if_sub_task: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' + if_not_sub_task: '%(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' fileFormatter: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/examples/prototype_mtc_extended/configs_mp/settings.yaml b/activitysim/examples/prototype_mtc_extended/configs_mp/settings.yaml index 27e13575f8..b6e75efb4b 100644 --- a/activitysim/examples/prototype_mtc_extended/configs_mp/settings.yaml +++ b/activitysim/examples/prototype_mtc_extended/configs_mp/settings.yaml @@ -104,7 +104,7 @@ multiprocess_steps: tables: - accessibility # don't slice any tables not explicitly listed above in slice.tables - except: True # This is needed after disaggregate accessibilities, otherwise it will return empty logsums tables + exclude: True # This is needed after disaggregate accessibilities, otherwise it will return empty logsums tables - name: mp_simulate begin: school_location slice: diff --git a/activitysim/examples/prototype_mtc_extended/sampling_scenarios.py b/activitysim/examples/prototype_mtc_extended/sampling_scenarios.py index 84e72beddc..a182a3bf3b 100644 --- a/activitysim/examples/prototype_mtc_extended/sampling_scenarios.py +++ b/activitysim/examples/prototype_mtc_extended/sampling_scenarios.py @@ -1,9 +1,11 @@ import argparse import os +import shutil + +import pandas as pd import pkg_resources import yaml -import pandas as pd -import shutil + from activitysim.cli.run import add_run_args, run from activitysim.core.util import named_product @@ -171,8 +173,9 @@ def run_scenarios(): run_model() # Copy results to named folder copy_output(scene_name, model_settings) - except: + except Exception: print(f"Failed on scene {scene_name}") + raise if __name__ == "__main__": diff --git a/activitysim/examples/prototype_mtc_extended/test/configs_mp/logging.yaml b/activitysim/examples/prototype_mtc_extended/test/configs_mp/logging.yaml index e932009c5d..64333d1414 100644 --- a/activitysim/examples/prototype_mtc_extended/test/configs_mp/logging.yaml +++ b/activitysim/examples/prototype_mtc_extended/test/configs_mp/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -37,21 +38,20 @@ logging: class: logging.StreamHandler stream: ext://sys.stdout formatter: simpleFormatter - #level: NOTSET - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatters: simpleFormatter: class: logging.Formatter - #format: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' - format: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [ - '%(processName)-10s %(levelname)s - %(name)s - %(message)s', - '%(levelname)s - %(name)s - %(message)s'] + format: + if_sub_task: 
'%(processName)-10s %(levelname)s - %(name)s - %(message)s' + if_not_sub_task: '%(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' fileFormatter: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/examples/prototype_mtc_extended/test/prototype_mtc_extended_reference_pipeline.zip b/activitysim/examples/prototype_mtc_extended/test/prototype_mtc_extended_reference_pipeline.zip new file mode 100644 index 0000000000..50c9cdba76 Binary files /dev/null and b/activitysim/examples/prototype_mtc_extended/test/prototype_mtc_extended_reference_pipeline.zip differ diff --git a/activitysim/examples/prototype_mtc_extended/test/test_mtc_extended.py b/activitysim/examples/prototype_mtc_extended/test/test_mtc_extended.py index 758a291d50..c6a791c15c 100644 --- a/activitysim/examples/prototype_mtc_extended/test/test_mtc_extended.py +++ b/activitysim/examples/prototype_mtc_extended/test/test_mtc_extended.py @@ -1,19 +1,18 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. import os import subprocess import sys +from pathlib import Path import pandas as pd import pandas.testing as pdt import pkg_resources +import pytest -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import configuration, test, workflow def _test_prototype_mtc_extended( @@ -37,7 +36,7 @@ def regress(): if sharrow: regress_suffix += "-sharrow" - output_dir = "output" + output_dir = f"output_{int(sharrow)}{int(shadow_pricing)}" regress_trips_df = pd.read_csv( test_path(f"regress/final_trips{regress_suffix}.csv") ) @@ -63,12 +62,19 @@ def regress(): if i not in regress_accessibility_df.columns ] ) - pdt.assert_frame_equal( - final_accessibiliy_df, regress_accessibility_df, rtol=1.0e-4 + test.assert_frame_substantively_equal( + final_accessibiliy_df, + regress_accessibility_df, + rtol=1.0e-4, + check_dtype=False, ) - pdt.assert_frame_equal(final_trips_df, regress_trips_df, rtol=1.0e-4) - pdt.assert_frame_equal(final_vehicles_df, regress_vehicles_df, rtol=1.0e-4) + test.assert_frame_substantively_equal( + final_trips_df, regress_trips_df, rtol=1.0e-4 + ) + test.assert_frame_substantively_equal( + final_vehicles_df, regress_vehicles_df, rtol=1.0e-4 + ) file_path = os.path.join(os.path.dirname(__file__), "simulation.py") shadowprice_configs = ( @@ -107,7 +113,7 @@ def regress(): "-d", example_mtc_path("data"), "-o", - test_path("output"), + test_path(f"output_{int(sharrow)}{int(shadow_pricing)}"), ] ) if os.environ.get("GITHUB_ACTIONS") == "true": @@ -146,8 +152,153 @@ def test_prototype_mtc_extended_mp_shadow_pricing(): _test_prototype_mtc_extended(multiprocess=True, sharrow=False, shadow_pricing=True) -if __name__ == "__main__": +EXPECTED_MODELS = [ + "initialize_proto_population", + "compute_disaggregate_accessibility", + "initialize_landuse", + "initialize_households", + "compute_accessibility", + "school_location", + "workplace_location", + "auto_ownership_simulate", + "vehicle_type_choice", + "free_parking", + "cdap_simulate", + "mandatory_tour_frequency", + "mandatory_tour_scheduling", + "school_escorting", + "joint_tour_frequency", + "joint_tour_composition", + "joint_tour_participation", + "joint_tour_destination", + "joint_tour_scheduling", + "non_mandatory_tour_frequency", + "non_mandatory_tour_destination", + "non_mandatory_tour_scheduling", + "vehicle_allocation", + 
"tour_mode_choice_simulate", + "atwork_subtour_frequency", + "atwork_subtour_destination", + "atwork_subtour_scheduling", + "atwork_subtour_mode_choice", + "stop_frequency", + "trip_purpose", + "trip_destination", + "trip_purpose_and_destination", + "trip_scheduling", + "trip_mode_choice", + "write_data_dictionary", + "track_skim_usage", + "write_trip_matrices", + "write_tables", +] + + +@test.run_if_exists("prototype_mtc_extended_reference_pipeline.zip") +def test_prototype_mtc_extended_progressive(): + import activitysim.abm # register components + + state = workflow.create_example("prototype_mtc_extended", temp=True) + + state.settings.households_sample_size = 10 + state.settings.use_shadow_pricing = False + state.settings.want_dest_choice_sample_tables = False + state.settings.want_dest_choice_presampling = True + state.settings.recode_pipeline_columns = False + state.settings.output_tables = configuration.OutputTables( + h5_store=False, + action="include", + prefix="final_", + sort=True, + tables=[ + configuration.OutputTable( + tablename="trips", + decode_columns=dict( + origin="land_use.zone_id", destination="land_use.zone_id" + ), + ), + "vehicles", + "proto_disaggregate_accessibility", + ], + ) + + assert state.settings.models == EXPECTED_MODELS + assert state.settings.chunk_size == 0 + assert state.settings.sharrow == False + for step_name in EXPECTED_MODELS: + state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent.joinpath( + "prototype_mtc_extended_reference_pipeline.zip" + ), + checkpoint_name=step_name, + ) + except Exception: + print(f"> prototype_mtc_extended {step_name}: ERROR") + raise + else: + print(f"> prototype_mtc_extended {step_name}: ok") + + +@pytest.mark.parametrize( + "chunksize", + [ + 100_000_000, # will sometimes trigger chunking + 999_999_999_999, # will never actually trigger chunking + ], +) +def test_prototype_mtc_extended_with_chunking(chunksize): + import activitysim.abm # register components + + state = workflow.create_example("prototype_mtc_extended", temp=True) + + state.settings.households_sample_size = 10 + state.settings.use_shadow_pricing = False + state.settings.want_dest_choice_sample_tables = False + state.settings.want_dest_choice_presampling = True + state.settings.recode_pipeline_columns = False + state.settings.output_tables = configuration.OutputTables( + h5_store=False, + action="include", + prefix="final_", + sort=True, + tables=[ + configuration.OutputTable( + tablename="trips", + decode_columns=dict( + origin="land_use.zone_id", destination="land_use.zone_id" + ), + ), + "vehicles", + "proto_disaggregate_accessibility", + ], + ) + state.settings.chunk_size = chunksize + state.settings.chunk_training_mode = "training" + + assert state.settings.models == EXPECTED_MODELS + assert state.settings.sharrow == False + assert state.settings.chunk_size == chunksize + + for step_name in EXPECTED_MODELS: + state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent.joinpath( + "prototype_mtc_extended_reference_pipeline.zip" + ), + checkpoint_name=step_name, + ) + except Exception: + print(f"> prototype_mtc_extended {step_name}: ERROR") + raise + else: + print(f"> prototype_mtc_extended {step_name}: ok") + + +if __name__ == "__main__": test_prototype_mtc_extended() test_prototype_mtc_extended_sharrow() test_prototype_mtc_extended_mp() diff --git a/activitysim/examples/prototype_mwcog/configs/logging.yaml b/activitysim/examples/prototype_mwcog/configs/logging.yaml index 
5fc7798a3a..c9cd6a1000 100644 --- a/activitysim/examples/prototype_mwcog/configs/logging.yaml +++ b/activitysim/examples/prototype_mwcog/configs/logging.yaml @@ -42,7 +42,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/prototype_mwcog/configs_mp/logging.yaml b/activitysim/examples/prototype_mwcog/configs_mp/logging.yaml index e932009c5d..64333d1414 100644 --- a/activitysim/examples/prototype_mwcog/configs_mp/logging.yaml +++ b/activitysim/examples/prototype_mwcog/configs_mp/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET @@ -37,21 +38,20 @@ logging: class: logging.StreamHandler stream: ext://sys.stdout formatter: simpleFormatter - #level: NOTSET - level: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [WARNING, NOTSET] + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET formatters: simpleFormatter: class: logging.Formatter - #format: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' - format: !!python/object/apply:activitysim.core.mp_tasks.if_sub_task [ - '%(processName)-10s %(levelname)s - %(name)s - %(message)s', - '%(levelname)s - %(name)s - %(message)s'] + format: + if_sub_task: '%(processName)-10s %(levelname)s - %(name)s - %(message)s' + if_not_sub_task: '%(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' fileFormatter: class: logging.Formatter format: '%(asctime)s - %(levelname)s - %(name)s - %(message)s' datefmt: '%d/%m/%Y %H:%M:%S' - diff --git a/activitysim/examples/prototype_mwcog/test/test_mwcog.py b/activitysim/examples/prototype_mwcog/test/test_mwcog.py index 98652504f6..27d56a2a2b 100644 --- a/activitysim/examples/prototype_mwcog/test/test_mwcog.py +++ b/activitysim/examples/prototype_mwcog/test/test_mwcog.py @@ -1,3 +1,5 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. import os @@ -8,12 +10,7 @@ import pandas.testing as pdt import pkg_resources -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import test def _test_mwcog(sharrow=False): @@ -31,7 +28,7 @@ def regress(): # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + test.assert_frame_substantively_equal(final_trips_df, regress_trips_df) file_path = os.path.join(os.path.dirname(__file__), ".." 
+ os.sep + "simulation.py") diff --git a/activitysim/examples/prototype_sandag_xborder/configs/legacy-1.1.3/logging.yaml b/activitysim/examples/prototype_sandag_xborder/configs/legacy-1.1.3/logging.yaml index 1e92b76a32..9addd30706 100755 --- a/activitysim/examples/prototype_sandag_xborder/configs/legacy-1.1.3/logging.yaml +++ b/activitysim/examples/prototype_sandag_xborder/configs/legacy-1.1.3/logging.yaml @@ -28,7 +28,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/prototype_sandag_xborder/configs/logging.yaml b/activitysim/examples/prototype_sandag_xborder/configs/logging.yaml index 33072a571b..641739cc7a 100755 --- a/activitysim/examples/prototype_sandag_xborder/configs/logging.yaml +++ b/activitysim/examples/prototype_sandag_xborder/configs/logging.yaml @@ -40,7 +40,8 @@ logging: logfile: class: logging.FileHandler - filename: !!python/object/apply:activitysim.core.config.log_file_path ['activitysim.log'] + filename: + get_log_file_path: 'activitysim.log' mode: w formatter: fileFormatter level: NOTSET diff --git a/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py b/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py index 74e9098562..cc52818324 100644 --- a/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py +++ b/activitysim/examples/prototype_sandag_xborder/extensions/reassign_tour_purpose.py @@ -1,17 +1,19 @@ # ActivitySim # See full license in LICENSE.txt. +from __future__ import annotations + import logging import numpy as np import pandas as pd -from activitysim.core import config, inject, pipeline +from activitysim.core import config, workflow logger = logging.getLogger(__name__) -@inject.step() -def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): +@workflow.step +def reassign_tour_purpose_by_poe(state: workflow.State, tours: pd.DataFrame) -> None: """ Simulates tour purpose choices after tour origin has been assigned. 
This @@ -20,13 +22,15 @@ def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): """ trace_label = "reassign_tour_purpose_by_poe" - probs_df = pd.read_csv(config.config_file_path("tour_purpose_probs_by_poe.csv")) + probs_df = pd.read_csv( + state.filesystem.get_config_file_path("tour_purpose_probs_by_poe.csv") + ) probs_df.columns = [ col if col in ["Purpose", "Description"] else int(col) for col in probs_df.columns ] - tours_df = tours.to_frame(columns=["tour_type", "poe_id"]) + tours_df = tours[["tour_type", "poe_id"]] tour_types = probs_df[["Purpose", "Description"]].set_index("Purpose")[ "Description" ] @@ -36,18 +40,17 @@ def reassign_tour_purpose_by_poe(tours, chunk_size, trace_hh_id): num_tours = len(group) purpose_probs = probs_df[poe] purpose_cum_probs = purpose_probs.values.cumsum() - rands = pipeline.get_rn_generator().random_for_df(group) + rands = state.get_rn_generator().random_for_df(group) purpose_scaled_probs = np.subtract(purpose_cum_probs, rands) purpose = np.argmax((purpose_scaled_probs + 1.0).astype("i4"), axis=1) tours_df.loc[group.index, "purpose_id"] = purpose tours_df["new_tour_type"] = tours_df["purpose_id"].map(tour_types) - tours = tours.to_frame() tours["tour_type"] = tours_df["new_tour_type"].reindex(tours.index) tours["purpose_id"] = tours_df["purpose_id"].reindex(tours.index) tours["tour_category"] = "non_mandatory" tours.loc[tours["tour_type"].isin(["home", "work"]), "tour_category"] = "mandatory" - pipeline.replace_table("tours", tours) + state.add_table("tours", tours) return diff --git a/activitysim/examples/prototype_sandag_xborder/test/prototype_sandag_xborder_reference_pipeline.zip b/activitysim/examples/prototype_sandag_xborder/test/prototype_sandag_xborder_reference_pipeline.zip new file mode 100644 index 0000000000..72f109894b Binary files /dev/null and b/activitysim/examples/prototype_sandag_xborder/test/prototype_sandag_xborder_reference_pipeline.zip differ diff --git a/activitysim/examples/prototype_sandag_xborder/test/test_sandag_xborder.py b/activitysim/examples/prototype_sandag_xborder/test/test_sandag_xborder.py index 261190248e..e9d538cde8 100644 --- a/activitysim/examples/prototype_sandag_xborder/test/test_sandag_xborder.py +++ b/activitysim/examples/prototype_sandag_xborder/test/test_sandag_xborder.py @@ -1,19 +1,18 @@ +from __future__ import annotations + # ActivitySim # See full license in LICENSE.txt. 
import os import subprocess import sys +from pathlib import Path import pandas as pd import pandas.testing as pdt import pkg_resources -from activitysim.core import inject - - -def teardown_function(func): - inject.clear_cache() - inject.reinject_decorated_tables() +from activitysim.core import workflow +from activitysim.core.test import run_if_exists def _test_sandag_xborder(sharrow=False, mp=True): @@ -32,7 +31,12 @@ def regress(): test_path("regress/final_trips_1_process.csv") ) final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) - pdt.assert_frame_equal(final_trips_df, regress_trips_df) + # column ordering not important + assert sorted(regress_trips_df.columns) == sorted(final_trips_df.columns) + pdt.assert_frame_equal( + final_trips_df[sorted(regress_trips_df.columns)], + regress_trips_df[sorted(regress_trips_df.columns)], + ) file_path = os.path.join(os.path.dirname(__file__), "../simulation.py") @@ -72,6 +76,56 @@ def test_sandag_xborder_sharrow(): _test_sandag_xborder(sharrow=True, mp=False) -if __name__ == "__main__": +EXPECTED_MODELS = [ + "initialize_landuse", + "initialize_households", + "initialize_tours", + "initialize_los", + "initialize_tvpb", + "tour_scheduling_probabilistic", + "tour_od_choice", + "reassign_tour_purpose_by_poe", + "tour_mode_choice_simulate", + "stop_frequency", + "trip_purpose", + "trip_scheduling", + "trip_destination", + "trip_mode_choice", + "write_trip_matrices", + "write_tables", +] + + +@run_if_exists("prototype_sandag_xborder_reference_pipeline.zip") +def test_sandag_xborder_progressive(): + import activitysim.abm # register components # noqa: F401 + + state = workflow.create_example("prototype_sandag_xborder", temp=True) + state.settings.multiprocess = False + state.settings.num_processes = 1 + state.settings.households_sample_size = 10 + state.settings.chunk_size = 0 + state.settings.recode_pipeline_columns = False + state.import_extensions("extensions") + + assert state.settings.models == EXPECTED_MODELS + assert state.settings.sharrow == False + + for step_name in EXPECTED_MODELS: + state.run.by_name(step_name) + try: + state.checkpoint.check_against( + Path(__file__).parent.joinpath( + "prototype_sandag_xborder_reference_pipeline.zip" + ), + checkpoint_name=step_name, + ) + except Exception: + print(f"> prototype_sandag_xborder {step_name}: ERROR") + raise + else: + print(f"> prototype_sandag_xborder {step_name}: ok") + +if __name__ == "__main__": test_sandag_xborder() diff --git a/activitysim/examples/test/__init__.py b/activitysim/examples/test/__init__.py new file mode 100644 index 0000000000..d9431399da --- /dev/null +++ b/activitysim/examples/test/__init__.py @@ -0,0 +1 @@ +# tests for downloading external examples diff --git a/activitysim/examples/test/test_external_examples.py b/activitysim/examples/test/test_external_examples.py new file mode 100644 index 0000000000..50d797bab3 --- /dev/null +++ b/activitysim/examples/test/test_external_examples.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import tempfile + +from activitysim.cli.create import sha256_checksum +from activitysim.examples.external import registered_external_example + + +def test_external_download_unpack(): + """Test the external download mechanism, including unpacking assets""" + t = tempfile.TemporaryDirectory() + p = registered_external_example("estimation_example", t.name) + assert p.joinpath("configs/settings.yaml").is_file() + assert p.joinpath("data_sf/survey_data").is_dir() + assert ( + 
sha256_checksum(p.joinpath("data_sf/households.csv")) + == "7e782bb59c05a79110503a5f8173e3470b3969a40451af0271795d0d23909069" + ) + assert ( + sha256_checksum(p.joinpath("data_sf/skims.omx")) + == "579d6007266db3b055d0f9e4814004f4d5ccfae27a36e40f4881e3662bc3d3f1" + ) + assert ( + sha256_checksum(p.joinpath("data_sf/land_use.csv")) + == "83e1051fffa23ad1d6ec339fcb675532f0782c94ddf76d54020631d73bfca12f" + ) + assert ( + sha256_checksum(p.joinpath("data_sf/persons.csv")) + == "e24db9ac6c20592e672cd9fc4e8160528fe38a7c16cc54fe4920c516a29d732c" + ) + assert ( + sha256_checksum(p.joinpath("data_sf/survey_data/survey_tours.csv")) + == "633f734d964dcf25a20a4032a859982d861e1d327443d4f1bac64af9ef69cc7a" + ) + + +def test_external_download_basic(): + """Test the external download mechanism, including unpacking assets""" + t = tempfile.TemporaryDirectory() + p = registered_external_example("prototype_mtc", t.name) + assert p.joinpath("configs/settings.yaml").is_file() + assert p.joinpath("test/prototype_mtc_reference_pipeline.zip").is_file() + assert ( + sha256_checksum(p.joinpath("test/prototype_mtc_reference_pipeline.zip")) + == "394e5b403d4c61d5214493cefe161432db840ba4967c23c999d914178d43a1f0" + ) diff --git a/activitysim/workflows/steps/run.py b/activitysim/workflows/steps/run.py index bce4d1d303..8e8fdcdaec 100644 --- a/activitysim/workflows/steps/run.py +++ b/activitysim/workflows/steps/run.py @@ -2,9 +2,9 @@ from pypyr.errors import KeyNotInContextError -from ...standalone.utils import chdir -from .progression import reset_progress_step -from .wrapping import workstep +from activitysim.standalone.utils import chdir +from activitysim.workflows.steps.progression import reset_progress_step +from activitysim.workflows.steps.wrapping import workstep def _get_formatted(context, key, default): @@ -60,18 +60,8 @@ def run_activitysim( reset_progress_step(description=f"{label}", prefix="[bold green]") - # Clear all saved state from ORCA - import orca - - orca.clear_cache() - orca.clear_all() - - # Re-inject everything from ActivitySim - from ...core.inject import reinject_decorated_tables - - reinject_decorated_tables(steps=True) - # Call the run program inside this process + import activitysim.abm # noqa: F401 from activitysim.cli.main import prog with chdir(cwd): diff --git a/conda-environments/activitysim-dev-base.yml b/conda-environments/activitysim-dev-base.yml index 1e63b621db..e7c9d43340 100644 --- a/conda-environments/activitysim-dev-base.yml +++ b/conda-environments/activitysim-dev-base.yml @@ -13,7 +13,7 @@ channels: - conda-forge - nodefaults dependencies: -- python=3.9 +- python=3.10 - pip - asv # for benchmarking - black >= 22.0,<23 @@ -35,6 +35,7 @@ dependencies: - myst-parser # allows markdown in sphinx - nbconvert - nbformat +- nbmake - numba = 0.56.* - numexpr - numpy = 1.23.* @@ -59,6 +60,7 @@ dependencies: - requests = 2.28.* - rich = 13.3.* - ruby # required for benchmarking pre-commit hooks +- ruff - setuptools_scm - scikit-learn = 1.2.* - simwrapper > 1.7 @@ -69,6 +71,7 @@ dependencies: - xarray = 2023.2.* - xmle - zarr = 2.14.* +- zstandard - pip: - autodoc_pydantic diff --git a/conda-environments/activitysim-dev.yml b/conda-environments/activitysim-dev.yml index cac073df65..a34fcd2715 100644 --- a/conda-environments/activitysim-dev.yml +++ b/conda-environments/activitysim-dev.yml @@ -9,7 +9,7 @@ channels: - conda-forge - nodefaults dependencies: -- python=3.9 +- python=3.10 - pip - asv # for benchmarking - black >= 22.0,<23 @@ -55,9 +55,10 @@ dependencies: - requests = 2.28.* - 
rich = 13.3.* - ruby # required for benchmarking pre-commit hooks +- ruff - setuptools_scm - scikit-learn = 1.2.* -- sharrow >= 2.5.2 +- sharrow >= 2.6.0 - simwrapper > 1.7 - snakeviz # for profiling - sphinx = 6.1.* @@ -66,6 +67,7 @@ dependencies: - xarray = 2023.2.* - xmle - zarr = 2.14.* +- zstandard - pip: - autodoc_pydantic diff --git a/conda-environments/docbuild.yml b/conda-environments/docbuild.yml index 879338d1c7..c77fff75f2 100644 --- a/conda-environments/docbuild.yml +++ b/conda-environments/docbuild.yml @@ -10,8 +10,9 @@ name: docbuild channels: - conda-forge dependencies: -- python=3.9 +- python=3.10 - pip +- altair - black >= 22.0,<23 - bump2version - coveralls @@ -24,6 +25,7 @@ dependencies: - jupyterlab - larch >=5.5.3 - matplotlib +- myst-nb - myst-parser - numba >= 0.56.4 - numpy >= 1.16.1 @@ -31,6 +33,7 @@ dependencies: - openmatrix >= 0.3.4.1 - orca >= 1.6 - pandas >= 1.1.0 +- platformdirs - psutil >= 4.1 - pyarrow >= 2.0 - pydantic @@ -42,9 +45,12 @@ dependencies: - pyyaml >= 5.1 - requests >= 2.7 - scikit-learn >= 1.1 -- sharrow >= 2.5.2 +- sharrow >= 2.6.0 - simwrapper > 1.7 - sphinx-argparse +- sphinx-autosummary-accessors +- sphinx-copybutton +- sphinx-remove-toctrees - sphinx_rtd_theme - xarray >= 0.21 - zarr diff --git a/conda-environments/github-actions-tests.yml b/conda-environments/github-actions-tests.yml index 28d6c51f8e..e4d44e8d74 100644 --- a/conda-environments/github-actions-tests.yml +++ b/conda-environments/github-actions-tests.yml @@ -28,8 +28,10 @@ dependencies: - pytest-regressions - pyyaml = 6.* - requests = 2.28.* +- ruff - scikit-learn = 1.2.* -- sharrow >= 2.5.2 +- sharrow >= 2.6.0 - simwrapper > 1.7 - xarray = 2023.2.* - zarr = 2.14.* +- zstandard diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css index a78755ebcf..231ec5701a 100644 --- a/docs/_static/theme_overrides.css +++ b/docs/_static/theme_overrides.css @@ -1,6 +1,7 @@ a.navbar-brand { font-size: 20pt; font-weight: bold; + padding-right: 30px; } div.sd-card-header { diff --git a/docs/_templates/autosummary/accessor.rst b/docs/_templates/autosummary/accessor.rst new file mode 100644 index 0000000000..fb962b53d4 --- /dev/null +++ b/docs/_templates/autosummary/accessor.rst @@ -0,0 +1,12 @@ +{{ objname }} +{{ underline }} + +{% if module.startswith('activitysim.core.workflow') %} +.. currentmodule:: {{ module.split('.')[:3] | join('.') }} + +.. autoaccessor:: {{ (module.split('.')[3:] + [objname]) | join('.') }} +{% else %} +.. currentmodule:: {{ module.split('.')[0] }} + +.. autoaccessor:: {{ (module.split('.')[1:] + [objname]) | join('.') }} +{% endif %} diff --git a/docs/_templates/autosummary/accessor_attribute.rst b/docs/_templates/autosummary/accessor_attribute.rst new file mode 100644 index 0000000000..bc9972469c --- /dev/null +++ b/docs/_templates/autosummary/accessor_attribute.rst @@ -0,0 +1,12 @@ +{{ objname }} +{{ underline }} + +{% if module.startswith('activitysim.core.workflow') %} +.. currentmodule:: {{ module.split('.')[:3] | join('.') }} + +.. autoaccessorattribute:: {{ (module.split('.')[3:] + [objname]) | join('.') }} +{% else %} +.. currentmodule:: {{ module.split('.')[0] }} + +.. 
autoaccessorattribute:: {{ (module.split('.')[1:] + [objname]) | join('.') }} +{% endif %} diff --git a/docs/_templates/autosummary/accessor_callable.rst b/docs/_templates/autosummary/accessor_callable.rst new file mode 100644 index 0000000000..39502b2dbb --- /dev/null +++ b/docs/_templates/autosummary/accessor_callable.rst @@ -0,0 +1,12 @@ +{{ objname }} +{{ underline }} + +{% if module.startswith('activitysim.core.workflow') %} +.. currentmodule:: {{ module.split('.')[:3] | join('.') }} + +.. autoaccessorcallable:: {{ (module.split('.')[3:] + [objname]) | join('.') }}.__call__ +{% else %} +.. currentmodule:: {{ module.split('.')[0] }} + +.. autoaccessorcallable:: {{ (module.split('.')[1:] + [objname]) | join('.') }}.__call__ +{% endif %} diff --git a/docs/_templates/autosummary/accessor_method.rst b/docs/_templates/autosummary/accessor_method.rst new file mode 100644 index 0000000000..46d1b7c28c --- /dev/null +++ b/docs/_templates/autosummary/accessor_method.rst @@ -0,0 +1,12 @@ +{{ objname }} +{{ underline }} + +{% if module.startswith('activitysim.core.workflow') %} +.. currentmodule:: {{ module.split('.')[:3] | join('.') }} + +.. autoaccessormethod:: {{ (module.split('.')[3:] + [objname]) | join('.') }} +{% else %} +.. currentmodule:: {{ module.split('.')[0] }} + +.. autoaccessormethod:: {{ (module.split('.')[1:] + [objname]) | join('.') }} +{% endif %} diff --git a/docs/_templates/autosummary/accessor_property.rst b/docs/_templates/autosummary/accessor_property.rst new file mode 100644 index 0000000000..46d1b7c28c --- /dev/null +++ b/docs/_templates/autosummary/accessor_property.rst @@ -0,0 +1,12 @@ +{{ objname }} +{{ underline }} + +{% if module.startswith('activitysim.core.workflow') %} +.. currentmodule:: {{ module.split('.')[:3] | join('.') }} + +.. autoaccessormethod:: {{ (module.split('.')[3:] + [objname]) | join('.') }} +{% else %} +.. currentmodule:: {{ module.split('.')[0] }} + +.. autoaccessormethod:: {{ (module.split('.')[1:] + [objname]) | join('.') }} +{% endif %} diff --git a/docs/_templates/autosummary/base.rst b/docs/_templates/autosummary/base.rst new file mode 100644 index 0000000000..e03319b8d9 --- /dev/null +++ b/docs/_templates/autosummary/base.rst @@ -0,0 +1,5 @@ +{{ objname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. auto{{ objtype }}:: {{ objname }} diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst new file mode 100644 index 0000000000..4de83f6163 --- /dev/null +++ b/docs/_templates/autosummary/class.rst @@ -0,0 +1,29 @@ +{{ objname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + .. autosummary:: + :toctree: + {% for item in methods %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + .. autosummary:: + :toctree: + {% for item in attributes %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/docs/_templates/autosummary/class_decorator.rst b/docs/_templates/autosummary/class_decorator.rst new file mode 100644 index 0000000000..e9011ba79a --- /dev/null +++ b/docs/_templates/autosummary/class_decorator.rst @@ -0,0 +1,12 @@ +{{ objname | escape | underline}} + +{% if module.startswith('activitysim.core.workflow') %} +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :show-inheritance: +{% else %} +.. 
currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} +{% endif %} diff --git a/docs/_templates/autosummary/class_no_init.rst b/docs/_templates/autosummary/class_no_init.rst new file mode 100644 index 0000000000..5e9f1b8f8e --- /dev/null +++ b/docs/_templates/autosummary/class_no_init.rst @@ -0,0 +1,32 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }}() + + {% block methods %} + {% if methods %} + .. rubric:: {{ _('Methods') }} + + .. autosummary:: + :toctree: generated/ + {% for item in methods %} + {% if item != "__init__" %} + ~{{ [module, name] | join('.') }}.{{ item }} + {% endif %} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} + + .. autosummary:: + :toctree: generated/ + {% for item in attributes %} + ~{{ [module, name] | join('.') }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/docs/_templates/autosummary/module.rst b/docs/_templates/autosummary/module.rst new file mode 100644 index 0000000000..be3cf9874a --- /dev/null +++ b/docs/_templates/autosummary/module.rst @@ -0,0 +1,66 @@ +{{ fullname | escape | underline}} + +.. automodule:: {{ fullname }} + + {% block attributes %} + {% if attributes %} + .. rubric:: Module Attributes + + .. autosummary:: + :toctree: + {% for item in attributes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block functions %} + {% if functions %} + .. rubric:: {{ _('Functions') }} + + .. autosummary:: + :toctree: + {% for item in functions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block classes %} + {% if classes %} + .. rubric:: {{ _('Classes') }} + + .. autosummary:: + :toctree: + :template: autosummary/class.rst + {% for item in classes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block exceptions %} + {% if exceptions %} + .. rubric:: {{ _('Exceptions') }} + + .. autosummary:: + :toctree: + {% for item in exceptions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + +{% block modules %} +{% if modules %} +.. rubric:: Modules + +.. autosummary:: + :toctree: + :template: autosummary/module.rst + :recursive: +{% for item in modules %} + {{ item }} +{%- endfor %} +{% endif %} +{% endblock %} diff --git a/docs/conf.py b/docs/conf.py index 9149b0e37b..0b6af5034a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,6 +12,8 @@ # All configuration values have a default; values that are commented out # serve to show the default. +from __future__ import annotations + import os import sys @@ -36,26 +38,41 @@ "sphinx.ext.mathjax", "numpydoc", "sphinx.ext.autosummary", - "myst_parser", + "myst_nb", + # "myst_parser", "sphinx_design", "sphinxarg.ext", + "sphinx.ext.intersphinx", "sphinxcontrib.autodoc_pydantic", + "sphinx_autosummary_accessors", + "sphinx_remove_toctrees", + "sphinx_copybutton", ] +remove_from_toctrees = [ + "dev-guide/_generated/*", + "dev-guide/_generated2/*", + "users-guide/_generated/*", +] myst_enable_extensions = ["colon_fence"] +myst_heading_anchors = 3 +nb_merge_streams = True numpydoc_show_class_members = False autosummary_generate = True autodoc_pydantic_model_signature_prefix = "settings" autodoc_pydantic_model_show_json = False +autodoc_typehints_format = "short" +python_use_unqualified_type_names = True + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix of source filenames. 
source_suffix = { ".rst": "restructuredtext", - ".txt": "markdown", - ".md": "markdown", + # ".txt": "markdown", + # ".md": "markdown", } # The encoding of source files. @@ -134,7 +151,7 @@ print("github repo owner: " + GITHUB_REPOSITORY_OWNER) html_theme_options = { - "footer_items": ["version-date", "sphinx-version"], + "footer_start": ["version-date", "sphinx-version"], "switcher": { "json_url": f"https://{GITHUB_REPOSITORY_OWNER}.github.io/activitysim/switcher.json", "version_match": version, @@ -147,7 +164,7 @@ # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -html_title = f"ActivitySim {release}" +html_title = "ActivitySim" # A shorter title for the navigation bar. Default is the same as html_title. # html_short_title = None @@ -215,6 +232,14 @@ # Output file base name for HTML help builder. htmlhelp_basename = "ActivitySimdoc" +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "pandas": ("http://pandas.pydata.org/pandas-docs/stable", None), + "xarray": ("https://docs.xarray.dev/en/stable", None), + "pyarrow": ("https://arrow.apache.org/docs", None), + "numba": ("https://numba.pydata.org/numba-doc/latest", None), + "psutil": ("https://psutil.readthedocs.io/en/latest", None), +} # -- Options for LaTeX output --------------------------------------------- diff --git a/docs/core.rst b/docs/core.rst index c8438e3fe5..082db7bb9a 100644 --- a/docs/core.rst +++ b/docs/core.rst @@ -64,20 +64,6 @@ API .. automodule:: activitysim.core.skim_dictionary :members: -.. _pipeline_in_detail: - -Pipeline -~~~~~~~~ - -Data pipeline manager, which manages the list of model steps, runs them, reads -and writes data tables from/to the pipeline datastore, and supports restarting of the pipeline -at any model step. - -API -^^^ - -.. automodule:: activitysim.core.pipeline - :members: .. _random_in_detail: @@ -719,19 +705,6 @@ API .. automodule:: activitysim.core.config :members: -.. _inject: - -Inject -~~~~~~ - -Model orchestration and data pipeline interaction. - -API -^^^ - -.. automodule:: activitysim.core.inject - :members: - Mem ~~~ diff --git a/docs/dev-guide/checkpointing.md b/docs/dev-guide/checkpointing.md new file mode 100644 index 0000000000..0698a7b63a --- /dev/null +++ b/docs/dev-guide/checkpointing.md @@ -0,0 +1,45 @@ +# Checkpointing + +```{eval-rst} +.. currentmodule:: activitysim.core.workflow.checkpoint +``` + +ActivitySim provides a checkpointing mechanism, whereby the content of data tables +can be stored to disk in an intermediate state. This intermediate state can +subsequently be restored from disk, setting up the data tables to resume +simulation from that point forward. + +There are currently two data file formats available for checkpointing: + +- [HDF5](https://www.hdfgroup.org/solutions/hdf5/), the longstanding default + format for ActivitySim checkpointing, and +- [Apache Parquet](https://parquet.apache.org/), added as an option as of + ActivitySim version 1.3. + +## Usage + +The operation of automatic checkpointing during an ActivitySim run is controlled +via a few values in the top-level settings: + +- [`checkpoint_format`](activitysim.core.configuration.Settings.checkpoint_format) + controls which checkpoint data file format is used. +- [`checkpoints`](activitysim.core.configuration.Settings.checkpoints) + controls how frequently checkpoints are written (after every component, after + only certain components, or not at all). 
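For orientation, a minimal sketch of how these two settings might appear together in a model's top-level `settings.yaml` is shown below. The option spellings (`parquet` for the Parquet store, and `True` meaning "checkpoint after every component") are assumptions here and should be confirmed against the linked `Settings` documentation.

```yaml
# hypothetical excerpt from a model's settings.yaml
checkpoint_format: parquet   # assumed spelling for the Apache Parquet option
checkpoints: True            # assumed: write a checkpoint after every component
```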
+ +For code developers wanting to integrate some aspect of checkpointing into +a manual workflow or a new component, the +[`State.checkpoint`](activitysim.core.workflow.State.checkpoint) +accessor has most of the relevant methods. + +## API + +```{eval-rst} +.. autosummary:: + :toctree: _generated + :recursive: + + GenericCheckpointStore + HdfStore + ParquetStore +``` diff --git a/docs/dev-guide/core-workflow-api.md b/docs/dev-guide/core-workflow-api.md new file mode 100644 index 0000000000..71cb339c15 --- /dev/null +++ b/docs/dev-guide/core-workflow-api.md @@ -0,0 +1,290 @@ +# State API + +```{eval-rst} +.. currentmodule:: activitysim.core.workflow + +.. autosummary:: + :toctree: _generated + :recursive: + + State +``` + +## Constructors + +```{eval-rst} +.. autosummary:: + :toctree: _generated + :recursive: + + State.__init__ + State.make_default + State.make_temp + create_example +``` + +## Model Setup + +```{eval-rst} +.. autosummary:: + + State.init_state + State.import_extensions + State.initialize_filesystem + State.default_settings + State.load_settings + State.settings + State.filesystem + State.network_settings +``` + + + +## Basic Context Management + +The most basic function of the `State` object is to serve as a defined +namespace for storing model-relevant variables. This includes the top-level +model settings, data tables, skims, and any other Python variables +that represent the current state of a particular modeling system (or when +multiprocessing, sub-system). Below are the basic methods to get and set values +in this context in their "raw" form, with minimal additional processing. + +```{eval-rst} +.. autosummary:: + + State.get + State.set + State.drop + State.access + State.get_injectable + State.add_injectable +``` + + +## Data Access and Manipulation + +In addition to "raw" access to context variable, several methods are provided +to simplify different kinds access to the "tables" that represent the +simulation inputs and outputs of ActivitySim. We say "tables" here in the +abstract sense -- historically these tables have been stored internally by +ORCA as `pandas.DataFrame`s, but the exact internal storage format is abstracted +away here in favor of providing access to the data in several specific formats. + +```{eval-rst} + +.. rubric:: Methods + +.. autosummary:: + + State.get_dataset + State.get_dataframe + State.get_dataarray + State.get_dataframe_index_name + State.get_pyarrow + State.add_table + State.is_table + State.registered_tables + State.get_table + +.. rubric:: Accessor + +.. autosummary:: + :toctree: _generated2 + :template: autosummary/accessor.rst + + State.dataset +``` + + +## Run + +Executing model components is handled by methods in the `run` accessor. + +```{eval-rst} + +.. rubric:: Accessor + +.. autosummary:: + :toctree: _generated2 + :template: autosummary/accessor_callable.rst + + State.run + + + +.. rubric:: Attributes + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_attribute.rst + + State.run.heading_level + + + +.. rubric:: Methods + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_method.rst + + State.run.by_name + State.run.all +``` + +(state-checkpoint)= +## Checkpoints + +The `State` object provides access to [checkpointing](checkpointing.md) functions +within the `checkpoint` accessor. + +```{eval-rst} + +.. rubric:: Accessor + +.. autosummary:: + :toctree: _generated2 + :template: autosummary/accessor.rst + + State.checkpoint + + +.. rubric:: Attributes + +.. 
autosummary:: + :toctree: _generated + :template: autosummary/accessor_attribute.rst + + State.checkpoint.last_checkpoint + State.checkpoint.checkpoints + State.checkpoint.store + + +.. rubric:: Methods + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_method.rst + + State.checkpoint.store_is_open + State.checkpoint.open_store + State.checkpoint.close_store + State.checkpoint.add + State.checkpoint.list_tables + State.checkpoint.load + State.checkpoint.get_inventory + State.checkpoint.restore + State.checkpoint.restore_from + State.checkpoint.check_against + State.checkpoint.cleanup + State.checkpoint.load_dataframe + State.checkpoint.last_checkpoint_name + State.checkpoint.is_readonly + State.checkpoint.default_pipeline_file_path + +``` + + +## Tracing + +```{eval-rst} + +.. rubric:: Attributes + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_attribute.rst + + State.tracing.traceable_tables + State.tracing.traceable_table_ids + State.tracing.traceable_table_indexes + State.tracing.run_id + State.tracing.validation_directory + + + +.. rubric:: Methods + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_method.rst + + State.tracing.initialize + State.tracing.register_traceable_table + State.tracing.deregister_traceable_table + State.tracing.write_csv + State.tracing.trace_df + State.tracing.trace_interaction_eval_results + State.tracing.get_trace_target + State.tracing.trace_targets + State.tracing.has_trace_targets + State.tracing.dump_df + State.tracing.delete_output_files + State.tracing.delete_trace_files +``` + + +## Logging + +```{eval-rst} + +.. rubric:: Methods + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_method.rst + + State.logging.config_logger + State.logging.rotate_log_directory +``` + + +## Reporting + +```{eval-rst} + +.. rubric:: Accessor + +.. autosummary:: + :toctree: _generated2 + :template: autosummary/accessor.rst + + State.report + + +.. rubric:: Methods + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_method.rst + + State.report.nominal_distribution + State.report.ordinal_distribution + State.report.histogram +``` + + +## Extending + +Methods to extend ActivitySim's functionality are available under the `extend` +accessor. + +```{eval-rst} + +.. rubric:: Accessor + +.. autosummary:: + :toctree: _generated2 + :template: autosummary/accessor.rst + + State.extend + + +.. rubric:: Methods + +.. autosummary:: + :toctree: _generated + :template: autosummary/accessor_method.rst + + State.extend.declare_table +``` diff --git a/docs/dev-guide/core-workflow-steps.md b/docs/dev-guide/core-workflow-steps.md new file mode 100644 index 0000000000..c3d12541e3 --- /dev/null +++ b/docs/dev-guide/core-workflow-steps.md @@ -0,0 +1,76 @@ +(workflow-steps)= +# Workflow Steps + +An ActivitySim component is written as a Python function with a `@workflow.step` +decorator: + +```python +import pandas as pd +from activitysim.core import workflow + +@workflow.step +def component_name( + state: workflow.State, + named_temp_table: pd.DataFrame, + named_table: pd.DataFrame, + cached_object: bool = False, +) -> None: + ... # do something +``` + +Similar to a typical Python class method, the first argument to a +workflow step must be a reference to a [`State`](core-workflow-api.md) +object named `state`. 
Unlike a typical Python class method, this is +rigorously enforced -- if you decorate a function as a `workflow.step` +and the first argument is not named `state` a `TypeError` will be raised. + +Similar to the legacy ORCA-based implementation of ActivitySim, when called +by the automated processes that orchestrate component execution, the names +of all subsequent arguments should generally match objects that are expected +to be already stored as keys in the `state` context, or have decorated +constructors declared elsewhere in the imported codebase. However, if an +argument is provided with a default value, then the default value is used +unless it is explicitly overloaded in the function call; i.e. the default +value in the function signature takes precedence over any value stored in the +state's context. + +Unlike typical Python functions, the type annotations for the decorated +function's arguments are *not* totally ignored, at least when the function is +called via the [`State.run`](activitysim.core.workflow.State.run) mechanisms. +When asking for a data table to be provided, the type annotation is respected +if it is `pandas.DataFrame` or `xarray.Dataset`; the caller will receive the +table in the indicated format. + +The decorator will spin off a reference of the decorated function in the +`_RUNNABLE_STEPS` class attribute for `State`, facilitating the automatic +discovery and/or execution of this function via the +[`State.run`](activitysim.core.workflow.State.run) mechanisms. +The original function also remains available to import and use without +changes. + +The decorated function may mutate the `state` argument by adding or removing +things from the state's context. Most existing workflow steps operate in this +manner. The return type annotation can be given as "None" to flag that +this mutation behavior is indeed baked in to the decorated function -- indeed, +by implication it must be as there is no other pathway to output a result, +although that is not otherwise checked. + +Alternatively, the wrapped function can return a `Mapping[str, Any]` that +will be used to update the state's context. This happens automatically when +the step is called via the `State.run` accessor, or can (must) be handled +separately by the caller if the function is executed directly. (Future work +may migrate ActivitySim to favor or require this "pure" function behavior.) + + +## API + +```{eval-rst} +.. currentmodule:: activitysim.core.workflow + +.. autosummary:: + :toctree: _generated + :template: autosummary/class_decorator.rst + :recursive: + + step +``` diff --git a/docs/dev-guide/core-workflow-table.md b/docs/dev-guide/core-workflow-table.md new file mode 100644 index 0000000000..16ce5b7882 --- /dev/null +++ b/docs/dev-guide/core-workflow-table.md @@ -0,0 +1,117 @@ +# Workflow Tables + +## Standard Tables + +An ActivitySim table definition is written as a Python function with a +`workflow.table` decorator: + +```python +import pandas as pd +from activitysim.core import workflow + +@workflow.table +def households(state: workflow.State) -> pd.DataFrame: + df = pd.DataFrame(...) + # do something to set up table here + return df +``` + +Similar to a typical Python class method, the first argument to a workflow table +function is always a reference to a [`State`](core-workflow-api.md) +object named `state`. Unlike a typical Python class method, this is rigorously +enforced -- if you decorate a function as a `workflow.table` and the first +argument is not named `state` a `TypeError` will be raised. 
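As a small illustration of that enforcement (a sketch, assuming the signature check happens at decoration time and using a hypothetical table name; the exact error message is not specified here):

```python
import pandas as pd
from activitysim.core import workflow

try:
    @workflow.table
    def my_example_table(st: workflow.State) -> pd.DataFrame:
        # hypothetical table whose first argument is not named `state`
        return pd.DataFrame()
except TypeError as err:
    # per the text above, the decorator rejects this signature
    print(f"workflow.table refused the function: {err}")
```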
+ +For most tables, the initialization of the table will be defined by values in +the {py:class}`Settings <activitysim.core.configuration.Settings>`, and there will +be no other function arguments. + +If table initialization does require access to other tables (e.g. the *vehicles* +table needs access to the *households* table to be initialized) then other tables +can be provided as matching-named arguments, in the same way as +[`workflow.step`](core-workflow-steps) functions. + +The `workflow.table` function should return a `pandas.DataFrame` or +`xarray.Dataset` representation of the table. When this function is called +automatically by the processes that orchestrate execution, this object will be +stored in the state's context as the name of the table. + + +## Temporary Tables + +In addition to the main `workflow.table` decorator, there is also a similar +`workflow.temp_table` decorator for temporary tables. + +```python +import pandas as pd +from activitysim.core import workflow + +@workflow.temp_table +def households_merged( + state: workflow.State, + households: pd.DataFrame, + land_use: pd.DataFrame, + accessibility: pd.DataFrame, +) -> pd.DataFrame: + df = pd.DataFrame(...) + # do something to set up table here + return df +``` + +There are two main differences between regular tables and temporary tables: + +1. Temporary tables are never checkpointed. + + The supposition for temporary tables is that they are generally large, and + easy to re-create on the fly, so storing them to disk is wasteful. Most + temporary tables in ActivitySim are simply merges of other existing tables, + although that is not formally a requirement of temporary tables. + +2. Temporary tables are dropped when any predicate argument is changed in the same `State`. + + The *predicates* are all the named arguments of the `workflow.temp_table` + wrapped function after the `state`. If another ActivitySim instruction + triggers an update to *any* of these predicate arguments, the temporary + table is dropped from the state's context. It can (presumably) be recreated + easily from the (now different) predicate values if/when needed for later steps. + + +(core-workflow-cached-objects)= +## Other Cached Objects + +Other arbitrary Python objects can also be generated by functions that are +handled by the same automatic system as tables, using the `workflow.cached_object` +decorator. + +```python +from activitysim.core import workflow + +@workflow.cached_object +def name_of_object( + state: workflow.State, + other_thing: bool = False, +): + obj = [1,2,3] if other_thing else [7,8,9] # or any python object + return obj +``` + +Similar to temporary tables, these objects are not stored in checkpoint files. +Unlike temporary tables, they are not formally predicated on their arguments, so +for example in the `cached_object` above, a change in the value of `other_thing` +will not cause `name_of_object` to be regenerated if it already exists in the state's +context. + +## API + +```{eval-rst} +.. currentmodule:: activitysim.core.workflow + +.. autosummary:: + :toctree: _generated + :template: autosummary/class_decorator.rst + :recursive: + + table + temp_table + cached_object +``` diff --git a/docs/dev-guide/core-workflow.md b/docs/dev-guide/core-workflow.md new file mode 100644 index 0000000000..d20280902a --- /dev/null +++ b/docs/dev-guide/core-workflow.md @@ -0,0 +1,42 @@ +# Workflow State + +The general framework of each ActivitySim model is defined within an encapsulated +[`State`](core-workflow-api) object.
This object maintains references to data and +model structures in a well-defined context, and allows the user to pass that context +around to the various functions and methods that progressively build up the simulated +activity patterns. + +The [`State`](core-workflow-api) object replaces the ORCA framework, and allows data +from multiple models, or multiple versions of the same model, to co-exist in a single +Python instance simultaneously. The state contains references to overall model +settings and network level-of-service features, as well as the state of the +simulated households, persons, etc. Extensive documentation on the +[API](core-workflow-api.md) for working with the state is available. + +The [`State`](core-workflow-api) class for ActivitySim also offers hooks for a +few fundamental elements: + +- [**Steps**](core-workflow-steps), also referred to as "model components", + which represent the fundamental mathematical building blocks of an ActivitySim + model. Each component contains instructions for incrementally augmenting the + state of the model, generally by adding columns or rows to an existing table, + although components are not limited to that and can potentially do other things + as well. +- [**Data Tables**](core-workflow-table), sometimes referred to in older + documentation sections as "pipeline tables". These tables include households, + persons, trips, tours, and potentially other tables that represent aspects of + the simulated agents. +- [**Other Cached Objects**](core-workflow-table.md#other-cached-objects), which can + be any arbitrary Python object that can be created programmatically and stored + in the state's context dictionary. + + +```{eval-rst} +.. toctree:: + :maxdepth: 1 + :hidden: + + core-workflow-api + core-workflow-steps + core-workflow-table +``` diff --git a/docs/dev-guide/index.rst b/docs/dev-guide/index.rst index 2ec96f83ec..6661429f42 100644 --- a/docs/dev-guide/index.rst +++ b/docs/dev-guide/index.rst @@ -23,9 +23,12 @@ Contents :maxdepth: 3 install + core-workflow using-sharrow skim-dataset + checkpointing workflows + logging ../development ../models components/index diff --git a/docs/dev-guide/logging.md b/docs/dev-guide/logging.md new file mode 100644 index 0000000000..f2a5a3db7c --- /dev/null +++ b/docs/dev-guide/logging.md @@ -0,0 +1,71 @@ +# Logging + +ActivitySim uses the usual Python [logging](https://docs.python.org/3/library/logging.html) +infrastructure, with just a few additional features. + +Generally, logging configuration is done via the +[dictConfig](https://docs.python.org/3/library/logging.config.html#logging.config.dictConfig) +interface, with keys and values as documented +[here](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details). +The dictionary fed to this configurator is loaded from the `logging.yaml` +file(s) located in your model's configuration directory(s), following the +usual pattern for finding and loading config files. + +```{versionadded} 1.3 +ActivitySim no longer permits the use of `!!python/object/apply` directives inside +yaml input files. These directives allow arbitrary code +execution, and we would like to move away from that. +``` + +Instead of allowing arbitrary code to be loaded into and modify the logging configuration, +only a few specific ActivitySim functions are exposed.
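+
+For orientation, the following minimal sketch shows the plain-Python equivalent of
+what this configuration step amounts to. It is *not* ActivitySim's actual loader,
+and the file path shown is hypothetical, but `dictConfig` is the standard-library
+interface referenced above.
+
+```python
+import logging
+import logging.config
+from pathlib import Path
+
+import yaml  # PyYAML, assumed available; ActivitySim configs are YAML files
+
+# load a dictConfig-style mapping from a logging.yaml file (hypothetical path)
+config = yaml.safe_load(Path("configs/logging.yaml").read_text())
+logging.config.dictConfig(config)
+
+logging.getLogger("activitysim").info("logging configured")
+```
+
+In an actual model run, ActivitySim assembles this dictionary itself from the
+`logging.yaml` file(s) and pre-processes the few special keys described in the
+sections below before handing it to `dictConfig`.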
+ +## Log file locations + +As noted above, the logging configuration implementation relies heavily on the +standard Python logging library, which by default knows nothing about ActivitySim +or its typical layout of output files, including placement of logs in a designated +output directory. Therefore, if you set the filename of a logging `FileHandler` to +just a string like this: + +```yaml +logfile: + class: logging.FileHandler + filename: just-a-file-name.log +``` + +then that file will be created in the Python current working directory (typically +wherever you invoked the script) and not in your designated output directory. +To fix this and write the log into your designated output directory, you can use +`get_log_file_path` as an intervening key in the configuration between the +`filename` key and the desired value, like this: + +```yaml +logfile: + class: logging.FileHandler + filename: + get_log_file_path: my-file-name.log +``` + +This special formatting will be pre-processed by ActivitySim before configuring +the logging, so that the file will be created in your designated output directory. +This also works when subprocesses are running, in which case the log file will +then be created in (or relative to) the process's log file directory, +not in (or relative to) the main output directory. + +## Identifying Subprocesses + +You may want to have different settings for subprocess workers and the main +ActivitySim process. For example, you may have the main process log everything it writes +to both the console and a log file, while the subprocesses log mostly to files, and +only write higher-priority messages (warnings and errors) to the console. Any +logging configuration value can be set to bifurcate like this between the main process and +subtasks by setting `if_sub_task` and `if_not_sub_task` keys: + +```yaml +handlers: + console: + level: + if_sub_task: WARNING + if_not_sub_task: NOTSET +``` diff --git a/docs/dev-guide/using-sharrow.md b/docs/dev-guide/using-sharrow.md index 4e7cb90838..1feb8178ab 100644 --- a/docs/dev-guide/using-sharrow.md +++ b/docs/dev-guide/using-sharrow.md @@ -216,6 +216,28 @@ For models with utility expressions that include a lot of string comparisons, been updated) sharrow can be disabled by setting `sharrow_skip: true` in the component's configuration yaml file. +### Multiprocessing Performance + +Sharrow leverages a number of performance-enhancing techniques, including +multi-threaded parallelization of various computations. This multi-threading can provide significant +benefits within a single process, but if enabled alongside ActivitySim's multiprocessing +paradigm, the multi-threading does more harm than good, as too many threads will +compete for limited computational resources. To avoid this, the user should completely +disable multi-threading and rely exclusively on multiprocessing to generate parallelism.
+This can be done by setting a number of thread-limiting environment variables before +running Python, or immediately at the start of a Python script before ActivitySim +is loaded: + +```python +import os +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["NUMBA_NUM_THREADS"] = "1" +os.environ["VECLIB_MAXIMUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" +``` + ### Limited Tracing and Estimation Mode Capabilities When running sharrow-optimized code, large parts of certain calculations are routed @@ -234,6 +256,44 @@ Similar constraints apply to estimation mode, as complete estimation mode output capabilities are not yet integrated with the sharrow engine. Estimation mode remains fully available when running with sharrow disabled. + +### Arithmetic on Logical Values + +In expressions written in specification files, boolean values must be treated with +care. When an expression is evaluated in the legacy implementation, the addition +of two boolean values will be processed according to numpy logic, such that: + +```python +np.array([True]) + np.array([True]) == np.array([True]) +np.array([True]) + np.array([False]) == np.array([True]) +np.array([False]) + np.array([True]) == np.array([True]) +np.array([False]) + np.array([False]) == np.array([False]) +``` + +When the same expression is evaluated using sharrow, it is processed +using standard Python rules, such that logical values are implicitly upcast to integers, +giving: + +```python +True + True == 2 +True + False == 1 +False + True == 1 +False + False == 0 +``` + +If this value is later upcast to a number and used in a mathematical calculation +(e.g. multiplied by a float-valued coefficient), the results will differ, +as in the former case the result is never other than 1 or 0, but in the latter case +the result can also be 2. This mismatch can be readily avoided by wrapping the +term in an extra logic gate, which will evaluate the same in both environments: + +```python +((True + True) > 0) == True +((True + False) > 0) == True +((False + True) > 0) == True +((False + False) > 0) == False +``` + (digital-encoding)= ## Digital Encoding diff --git a/docs/dev-guide/workflows.md b/docs/dev-guide/workflows.md index d9bc2e2d30..133ad64350 100644 --- a/docs/dev-guide/workflows.md +++ b/docs/dev-guide/workflows.md @@ -15,7 +15,7 @@ a number of pre-packaged workflows that are included with ActivitySim. A collection of workflows used to compare the new *sharrow* code against legacy implementations can be found in the -[sharrow-contrast](https://github.com/camsys/activitysim/tree/sharrow-black/activitysim/workflows/sharrow-contrast) +[sharrow-contrast](https://github.com/ActivitySim/activitysim/tree/main/activitysim/workflows/sharrow-contrast) workflow subdirectory. Each of these first runs the relevant example model in test mode to compile the relevant functions, and then runs in production mode to measure runtime and memory usage. This is followed by another run in diff --git a/docs/examples.rst b/docs/examples.rst index f146b1a58b..a70583c0b6 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -4,8 +4,8 @@ .. _example : .. _examples : -Examples -======== +Built-in Examples +================= This page describes the example models included with ActivitySim. There are three basic types of example model: @@ -1531,15 +1531,15 @@ university students attending the University of Michigan (UofM). First off, Uof weighted by enrollment.
This happens after both school location choice in the university_location_zone_override model and after university trip destination choice in the trip_destination_univ_zone_override model. Next, parking trips are handled explicitly by first choosing a parking location if the trip destination is in a UofM zone. Parking locations are selected -proportionally to the parking lot size as part of the parking_location_choice_at_university. Finally explicit trips are +proportionally to the parking lot size as part of the parking_location_choice_at_university. Finally explicit trips are inserted into the trips table to and from the parking lot locations in the stop_frequency_university_parking model. While a persons vehicle is parked, trip mode choice treats the tour mode as walk-transit to determine trip mode availability until the person returns back to their car. For more information, please see SEMCOG's final model documentation and the SEMCOG model user quide. These submodels were added to example_semcog as extensions, which is a way for users to add -submodels within their model setup as opposed to formally adding them to the activitysim package. Extension submodels are run through -the `models` settings. However, the model must be run with the `simulation.py` script instead of the command line interface +submodels within their model setup as opposed to formally adding them to the activitysim package. Extension submodels are run through +the `models` settings. However, the model must be run with the `simulation.py` script instead of the command line interface in order to load the extensions folder. diff --git a/docs/gettingstarted.rst b/docs/gettingstarted.rst index 69ed9392d6..6f7a6413d8 100644 --- a/docs/gettingstarted.rst +++ b/docs/gettingstarted.rst @@ -24,7 +24,7 @@ installs a variety of things on your system, and it is quite likely to be flagge Windows, anti-virus, or institutional IT policies as "unusual" software, which may require special treatment to actually install and use. -Download the installer from GitHub `here `__. +Download the installer from GitHub `here `_. It is strongly recommended to choose the option to install "for me only", as this should not require administrator privileges on your machine. Pay attention to the *complete path* of the installation location. You will need to know diff --git a/docs/howitworks.rst b/docs/howitworks.rst index 91046053ce..65e7127296 100644 --- a/docs/howitworks.rst +++ b/docs/howitworks.rst @@ -2,7 +2,7 @@ How the System Works ==================== -This page describes how the software works, how multiprocessing works, and the primary example model data schema. The code snippets below may not exactly match the latest version of the software, but they are close enough to illustrate how the system works. +This page describes how the software works, how multiprocessing works, and the primary example model data schema. The code snippets below may not exactly match the latest version of the software, but they are close enough to illustrate how the system works. .. _how_the_system_works: @@ -209,7 +209,7 @@ as well. The various calls also setup logging, tracing, stable random number ma if trace_hh_id: tracing.register_traceable_table('persons', df) - tracing.trace_df(df, "raw.persons", warn_if_empty=True) + whale.trace_df(df, "raw.persons", warn_if_empty=True) return df @@ -1355,7 +1355,7 @@ Skims are named ___