diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml
index 2542ffd54..481128278 100644
--- a/.github/workflows/core_tests.yml
+++ b/.github/workflows/core_tests.yml
@@ -36,7 +36,7 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version-file: ".python-version"
-
+
       - name: Install activitysim
         run: |
           uv sync --locked --only-group github-action
@@ -128,6 +128,8 @@ jobs:
         run: |
           uv run pytest --pyargs activitysim.cli
 
+      - run: uv run pytest test/test_skim_name_conflicts.py
+      - run: uv run pytest test/random_seed/test_random_seed.py
 
   builtin_regional_models:
     needs: foundation
@@ -166,7 +168,7 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version-file: ".python-version"
-
+
       - name: Install activitysim
         run: |
           uv sync --locked --only-group github-action
@@ -235,7 +237,7 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version-file: ".python-version"
-
+
       - name: Install activitysim
         run: |
           uv sync --locked --only-group github-action
@@ -277,7 +279,7 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version-file: ".python-version"
-
+
      - name: Install activitysim
         run: |
           uv sync --locked --only-group github-action
@@ -310,7 +312,7 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version-file: ".python-version"
-
+
       - name: Install activitysim
         run: |
           uv sync --locked --only-group github-action
diff --git a/activitysim/core/skim_dataset.py b/activitysim/core/skim_dataset.py
index 1ed871ec0..1e359af3d 100644
--- a/activitysim/core/skim_dataset.py
+++ b/activitysim/core/skim_dataset.py
@@ -3,6 +3,7 @@
 import glob
 import logging
 import os
+import re
 import time
 from functools import partial
 from pathlib import Path
@@ -499,8 +500,6 @@ def _apply_digital_encoding(dataset, digital_encodings):
         As modified
     """
     if digital_encodings:
-        import re
-
         # apply once, before saving to zarr, will stick around in cache
         for encoding in digital_encodings:
             logger.info(f"applying zarr digital-encoding: {encoding}")
@@ -753,6 +752,68 @@ def load_skim_dataset_to_shared_memory(state, skim_tag="taz") -> xr.Dataset:
         d = None  # skims are not stored in shared memory, so we need to load them
         do_not_save_zarr = False
 
+    potential_conflicts = network_los_preload.skims_info.get(skim_tag).skim_conflicts
+    if potential_conflicts:
+        # There are some conflicts in the skims, where both time-dependent
+        # and time-agnostic skims with the same name are present. We need
+        # to check that we have sufficient ignore rules in place to correct
+        # this condition.
+
+        def _should_ignore(ignore, x):
+            if isinstance(ignore, str):
+                ignore = [ignore]
+            if ignore is not None:
+                for i in ignore:
+                    if re.match(i, x):
+                        return True
+            return False
+
+        ignore = state.settings.omx_ignore_patterns
+        problems = []
+        for time_agnostic, time_dependent in potential_conflicts.items():
+            # option 1, ignore all the time-dependent skims
+            # if this is fulfilled, we are ok and can proceed
+            if all(_should_ignore(ignore, i) for i in time_dependent):
+                continue
+            # option 2, ignore the time-agnostic skim
+            # if this is fulfilled, we are ok and can proceed
+            if _should_ignore(ignore, time_agnostic):
+                continue
+            # otherwise, we have a problem. collect all the problems
+            # and raise an error at the end listing all of them
+            problems.append(time_agnostic)
+        if problems:
+            solution_1 = "__.+'\n  - '^".join(problems)
+            solution_2 = "$'\n  - '^".join(problems)
+            # we have a problem, raise an error
+            error_message = (
+                f"skims {problems} are present in both time-dependent and time-agnostic formats.\n"
+                "Please add ignore rules to the omx_ignore_patterns setting to resolve this issue.\n"
+                "To ignore the time-dependent skims, add the following to your settings file:\n"
+                "\n"
+                "omx_ignore_patterns:\n"
+                f"  - '^{solution_1}__.+'\n"
+                "\n"
+                "To ignore the time-agnostic skims, add the following to your settings file:\n"
+                "\n"
+                "omx_ignore_patterns:\n"
+                f"  - '^{solution_2}$'\n"
+                "\n"
+                "You can also do some variation or combination of the two, as long as you resolve\n"
+                "the conflict(s). In addition, note that minor edits to model spec files may be\n"
+                "needed to accommodate these changes in how skim data is represented (e.g. changing\n"
+                "`odt_skims` to `od_skims`, or similar modifications wherever the offending variable\n"
+                "names are used). Alternatively, you can modify the skim data in the source files to\n"
+                "remove the naming conflicts, which is typically done upstream of ActivitySim in\n"
+                "whatever tool you are using to create the skims in the first place.\n"
+                "\n"
+                "See [https://activitysim.github.io/?q=skims] for more information.\n"
+            )
+            # write the error message to the log
+            logger.error(error_message)
+            # raise an error to stop the run, and include the entire error message there as well
+            raise ValueError(error_message)
+
     if d is None:
         time_periods = _dedupe_time_periods(network_los_preload)
         if zarr_file:
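For reference (not part of this diff), the ignore rules that the error message above asks users to supply are plain Python regular expressions tested with `re.match`, as in the `_should_ignore` helper. A minimal standalone sketch of how the two suggested settings resolve a conflict, using the `BIKE_TIME` example from the documentation below (the `should_ignore` name is illustrative only):

```python
import re


def should_ignore(patterns, name):
    """Return True if `name` matches any ignore pattern (re.match semantics)."""
    if isinstance(patterns, str):
        patterns = [patterns]
    return any(re.match(p, name) for p in (patterns or []))


# Option 1: ignore the time-agnostic copy, keep the time-dependent ones.
assert should_ignore(["^BIKE_TIME$"], "BIKE_TIME")
assert not should_ignore(["^BIKE_TIME$"], "BIKE_TIME__AM")

# Option 2: ignore the time-dependent copies, keep the time-agnostic one.
assert should_ignore(["^BIKE_TIME__.+"], "BIKE_TIME__AM")
assert not should_ignore(["^BIKE_TIME__.+"], "BIKE_TIME")
```

Either option leaves exactly one representation of the skim visible to the skim_dataset loader, which is what the conflict check requires.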
diff --git a/activitysim/core/skim_dict_factory.py b/activitysim/core/skim_dict_factory.py
index 78c705358..e0bd23ef3 100644
--- a/activitysim/core/skim_dict_factory.py
+++ b/activitysim/core/skim_dict_factory.py
@@ -8,6 +8,7 @@
 import os
 import warnings
 from abc import ABC
+from collections import defaultdict
 
 import numpy as np
 import openmatrix as omx
@@ -86,6 +87,7 @@ def __init__(self, state, skim_tag, network_los):
         self.offset_map_name = None
         self.offset_map = None
         self.omx_keys = None
+        self.skim_conflicts = None
         self.base_keys = None
         self.block_offsets = None
@@ -117,7 +119,12 @@ def load_skim_info(self, state, skim_tag):
             logger.debug(f"load_skim_info {skim_tag} reading {omx_file_path}")
 
             with omx.open_file(omx_file_path, mode="r") as omx_file:
-                # fixme call to omx_file.shape() failing in windows p3.5
+
+                # Check the shape of the skims. All skim files loaded within this
+                # loop need to have the same shape. For the first file, the
+                # shape is saved to the omx_shape attribute, so we know what shape
+                # to expect. For subsequent files, we check that the shape is the
+                # same as in the first file. If not, we raise an error.
                 if self.omx_shape is None:
                     self.omx_shape = tuple(
                         int(i) for i in omx_file.shape()
@@ -127,6 +134,12 @@ def load_skim_info(self, state, skim_tag):
                     ), f"Mismatch shape {self.omx_shape} != {omx_file.shape()}"
 
+                # Check that all the matrix names are unique across all the
+                # omx files. This check only looks at the name as stored in
+                # the file, and does not process any time period transformations.
+                # If duplicate names are found, a warning is issued. This is not
+                # a fatal error, but it is inefficient and may be a symptom of a
+                # deeper problem.
                 for skim_name in omx_file.listMatrices():
                     if skim_name in self.omx_manifest:
                         warnings.warn(
 ...
                         )
                     self.omx_manifest[skim_name] = omx_file_path
 
+                # We load the offset map if it exists. This is expected to be
+                # a 1D array of integers that gives ID values for each TAZ
+                # in the skims. ActivitySim expects there to be only one such
+                # mapping, although it can appear multiple times (e.g. once in
+                # each file).
                 for m in omx_file.listMappings():
                     if self.offset_map is None:
                         self.offset_map_name = m
@@ -146,21 +164,54 @@ def load_skim_info(self, state, skim_tag):
                             f"Multiple mappings in omx file: {self.offset_map_name} != {m}"
                         )
 
-        # - omx_keys dict maps skim key to omx_key
-        # DISTWALK: DISTWALK
-        # ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM, ...
+        # Create the `omx_keys` mapping, which connects skim key to omx_key.
+        # The skim key is either a single string that names a skim that is not
+        # time-dependent, or a 2-tuple of strings which names a skim and a time
+        # period. The omx_key is the original name of the skim in the omx file.
+        # For non-time-dependent skims, the omx_key is the same as the skim key,
+        # e.g. DISTWALK: DISTWALK. For time-dependent skims, the omx_key is the
+        # skim key with the time period appended,
+        # e.g. ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM.
         self.omx_keys = dict()
         for skim_name in self.omx_manifest.keys():
             key1, sep, key2 = skim_name.partition("__")
 
-            # - ignore composite tags not in dim3_tags_to_load
             if dim3_tags_to_load and sep and key2 not in dim3_tags_to_load:
+                # If a skim is found that has a time period that is not one of
+                # the known named time periods, a warning is issued, and that
+                # skim is ignored. This is not a fatal error, but it may be a
+                # symptom of a deeper problem.
+                warnings.warn(f"skim '{key1}' has unknown time period '{key2}'")
                 continue
 
-            skim_key = (key1, key2) if sep else key1
-            self.omx_keys[skim_key] = skim_name
+            skim_key = (key1, key2) if sep else key1
+            self.omx_keys[skim_key] = skim_name
+
+        # Create a skim_conflicts set, which identifies any skims that have both
+        # time-dependent and time-agnostic versions. This condition in and of
+        # itself is not a fatal error, as it is possible to have both types of skims
+        # in the same data when using the legacy codebase. When using skim_dataset
+        # instead of skim_dictionary (the former is required when using sharrow) this
+        # condition is no longer allowed, although we can potentially recover if
+        # the user has specified instructions that certain skim variables are not
+        # to be loaded. The recovery option is checked later, in the skim_dataset
+        # module, as that is where the skim variables are actually loaded.
+        time_dependent_skims = defaultdict(set)
+        time_agnostic_skims = set()
+        for k, v in self.omx_keys.items():
+            if isinstance(k, tuple):
+                time_dependent_skims[k[0]].add(v)
+            else:
+                time_agnostic_skims.add(k)
+        self.skim_conflicts = {
+            k: v for k, v in time_dependent_skims.items() if k in time_agnostic_skims
+        }
+        if self.skim_conflicts:
+            msg = "some skims have both time-dependent and time-agnostic versions:"
+            for k in self.skim_conflicts:
+                msg += f"\n- {k}"
+            warnings.warn(msg)
+
+        # Count the number of skims found across all the omx files
         self.num_skims = len(self.omx_keys)
 
         # - key1_subkeys dict maps key1 to dict of subkeys with that key1
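For orientation (not part of the diff), the conflict detection added above boils down to partitioning each OMX matrix name on the double underscore and intersecting the two resulting groups. A simplified sketch of that idea, operating on raw matrix names rather than on `self.omx_keys` (the `find_skim_conflicts` name is illustrative):

```python
from collections import defaultdict


def find_skim_conflicts(matrix_names):
    """Return {base_name: set of time-dependent OMX names} for names that
    appear both with and without a __PERIOD suffix."""
    time_dependent = defaultdict(set)
    time_agnostic = set()
    for name in matrix_names:
        base, sep, _period = name.partition("__")
        if sep:
            time_dependent[base].add(name)
        else:
            time_agnostic.add(name)
    return {k: v for k, v in time_dependent.items() if k in time_agnostic}


print(find_skim_conflicts(["DISTWALK", "BIKE_TIME", "BIKE_TIME__AM", "BIKE_TIME__PM"]))
# -> {'BIKE_TIME': {'BIKE_TIME__AM', 'BIKE_TIME__PM'}}
```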
diff --git a/docs/users-guide/model_anatomy.rst b/docs/users-guide/model_anatomy.rst
index 78114f7e8..a1e0cc275 100644
--- a/docs/users-guide/model_anatomy.rst
+++ b/docs/users-guide/model_anatomy.rst
@@ -21,21 +21,21 @@ file and the ``configs\network_los.yaml`` file.
 
 The following tables are currently implemented:
 
-  * households - household attributes for each household being simulated. Index: ``household_id`` (see ``activitysim.abm.tables.households.py``)
-  * landuse - zonal land use (such as population and employment) attributes. Index: ``zone_id`` (see ``activitysim.abm.tables.landuse.py``)
-  * persons - person attributes for each person being simulated. Index: ``person_id`` (see ``activitysim.abm.tables.persons.py``)
-  * time windows - manages person time windows throughout the simulation. See :ref:`time_windows`. Index: ``person_id`` (see the person_windows table create decorator in ``activitysim.abm.tables.time_windows.py``)
-  * tours - tour attributes for each tour (mandatory, non-mandatory, joint, and atwork-subtour) being simulated. Index: ``tour_id`` (see ``activitysim.abm.models.util.tour_frequency.py``)
-  * trips - trip attributes for each trip being simulated. Index: ``trip_id`` (see ``activitysim.abm.models.stop_frequency.py``)
+* households - household attributes for each household being simulated. Index: ``household_id`` (see ``activitysim.abm.tables.households.py``)
+* landuse - zonal land use (such as population and employment) attributes. Index: ``zone_id`` (see ``activitysim.abm.tables.landuse.py``)
+* persons - person attributes for each person being simulated. Index: ``person_id`` (see ``activitysim.abm.tables.persons.py``)
+* time windows - manages person time windows throughout the simulation. See :ref:`time_windows`. Index: ``person_id`` (see the person_windows table create decorator in ``activitysim.abm.tables.time_windows.py``)
+* tours - tour attributes for each tour (mandatory, non-mandatory, joint, and atwork-subtour) being simulated. Index: ``tour_id`` (see ``activitysim.abm.models.util.tour_frequency.py``)
+* trips - trip attributes for each trip being simulated. Index: ``trip_id`` (see ``activitysim.abm.models.stop_frequency.py``)
 
 A few additional tables are also used, which are not really tables, but classes:
 
-  * input store - reads input data tables from the input data store
-  * constants - various constants used throughout the model system, such as person type codes
-  * shadow pricing - shadow price calculator and associated utility methods, see :ref:`shadow_pricing`
-  * size terms - created by reading the ``destination_choice_size_terms.csv`` input file. Index - ``segment`` (see ``activitysim.abm.tables.size_terms.py``)
-  * skims - each model runs requires skims, but how the skims are defined can vary significantly depending on the ActivitySim implementation. The skims class defines Inject injectables to access the skim matrices. The skims class reads the skims from the omx_file on disk.
-  * table dictionary - stores which tables should be registered as random number generator channels for restartability of the pipeline
+* input store - reads input data tables from the input data store
+* constants - various constants used throughout the model system, such as person type codes
+* shadow pricing - shadow price calculator and associated utility methods, see :ref:`shadow_pricing`
+* size terms - created by reading the ``destination_choice_size_terms.csv`` input file. Index - ``segment`` (see ``activitysim.abm.tables.size_terms.py``)
+* skims - each model run requires skims, but how the skims are defined can vary significantly depending on the ActivitySim implementation. The skims class defines Inject injectables to access the skim matrices. The skims class reads the skims from the omx_file on disk.
+* table dictionary - stores which tables should be registered as random number generator channels for restartability of the pipeline
 
 
@@ -54,10 +54,10 @@ system for non-motorized travel, and optionally a transit access points (TAPs) z
 
 The three versions of multiple zone systems are one-zone, two-zone, and three-zone.
 
-  * **One-zone**: This version is based on TM1 and supports only TAZs. All origins and
+* **One-zone**: This version is based on TM1 and supports only TAZs. All origins and
   destinations are represented at the TAZ level, and all skims including auto, transit,
   and non-motorized times and costs are also represented at the TAZ level.
-  * **Two-zone**: This version is similar to many DaySim models. It uses microzones (MAZs)
+* **Two-zone**: This version is similar to many DaySim models. It uses microzones (MAZs)
   for origins and destinations, and TAZs for specification of auto and transit times
   and costs. Impedance for walk or bike all-the-way from the origin to the destination
   can be specified at the MAZ level for close together origins and destinations, and at
@@ -65,7 +65,7 @@ The three versions of multiple zone systems are one-zone, two-zone, and three-zo
   walk access and egress times with times specified in the MAZ file by transit mode.
   Careful pre-calculation of the assumed transit walk access and egress time by MAZ and
   transit mode is required depending on the network scenario.
-  * **Three-zone**: This version is based on the SANDAG generation of CT-RAMP models.
+* **Three-zone**: This version is based on the SANDAG generation of CT-RAMP models.
   Origins and destinations are represented at the MAZ level. Impedance for walk or bike
   all-the-way from the origin to the destination can be specified at the MAZ level for
   close together origins and destinations, and at the TAZ level for further
@@ -84,8 +84,15 @@ The three versions of multiple zone systems are one-zone, two-zone, and three-zo
   combinations of nearby boarding and alighting TAPs for each origin destination MAZ
   pair.
 
-Regions that have an interest in more precise transit forecasts may wish to adopt the
-three-zone approach, while other regions may adopt the one or two-zone approach. The
+.. caution::
+    The ActivitySim consortium is moving away from the three-zone approach, in favor of
+    the one- or two-zone approaches. The code for the three-zone approach remains
+    available for users who have already implemented it, but it is recommended that
+    users consider the one- or two-zone approaches for new implementations.
+    The three-zone system may be formally deprecated and removed in the future.
+
+Regions that have an interest in more precise transit and non-motorized forecasts
+may wish to adopt the two-zone approach, while other regions may adopt the one-zone approach. The
 microzone version requires coding households and land use at the microzone level.
 Typically an all-streets network is used for representation of non-motorized impedances.
 This requires a routable all-streets network, with centroids and connectors for
@@ -103,10 +110,76 @@ transit modeling.
    initial development, these examples were insufficient for validation and performance
    testing of the new software. As a result, the :ref:`prototype_marin` example was created.
 
-Example simple test configurations and inputs for two and three-zone system models are described below.
+
+.. _omx_skims:
+
+Skims
+~~~~~
+
+The basic level-of-service data that represents the transportation system is
+made available to ActivitySim via one or more sets of "skims". Skims are
+essentially matrices of travel times, costs, and other level-of-service attributes,
+calculated between various zones in the system. This skim data is made available
+to ActivitySim using files in the `openmatrix `__
+(OMX) format. All of the skim data can be provided in a single OMX file, or
+multiple OMX files can be used (this is typical for larger models, to keep file
+sizes manageable). If multiple files are used, the content of those files is
+simply concatenated together into a single notional bucket of skim data when
+the model is run. Within that bucket, each skim variable is identified by a unique name.
+For skim variables that vary across model time periods, the time period is
+appended to the skim name, separated by a double underscore (e.g. ``BUS_IVT__AM``).
+
+.. caution::
+    When using "legacy" mode for ActivitySim, it is possible (but not recommended)
+    to have a skim variable that has both a time-period-agnostic value as well as
+    a set of time-period-dependent values, e.g. "WALK_TIME" and "WALK_TIME__AM".
+    If you have conflicting names like this, a warning message will be issued, which
+    will look like this in an ActivitySim log file:
+
+    .. code-block:: text
+
+        WARNING: activitysim/core/skim_dict_factory.py:212:
+        UserWarning: some skims have both time-dependent and time-agnostic versions:
+        - BIKE_LOGSUM
+        - BIKE_TIME
+
+    This is a warning, not an error, and the model will run if not using sharrow.
+    However, if "sharrow" mode is activated, this will result in an error once the
+    skims are actually loaded, unless instructions are included in the settings file
+    to resolve the conflict. The error message will look like this:
+
+    .. code-block:: text
+
+        ERROR: skims ['BIKE_TIME'] are present in both time-dependent and time-agnostic formats.
+        Please add ignore rules to the omx_ignore_patterns setting to resolve this issue.
+        To ignore the time-dependent skims, add the following to your settings file:
+
+        omx_ignore_patterns:
+          - '^BIKE_TIME__.+'
+
+        To ignore the time-agnostic skims, add the following to your settings file:
+
+        omx_ignore_patterns:
+          - '^BIKE_TIME$'
+
+        You can also do some variation or combination of the two, as long as you resolve
+        the conflict(s). In addition, note that minor edits to model spec files may be
+        needed to accommodate these changes in how skim data is represented (e.g. changing
+        `odt_skims` to `od_skims`, or similar modifications wherever the offending variable
+        names are used). Alternatively, you can modify the skim data in the source files to
+        remove the naming conflicts, which is typically done upstream of ActivitySim in
+        whatever tool you are using to create the skims in the first place.
+
+    It should be relatively simple to resolve the conflict by following the instructions
+    in the error message. The cleaner and more reliable solution is to ensure each skim
+    variable has a unique name, e.g. by changing the name of the time-period-agnostic
+    value, so that instead of "BIKE_TIME" it is "BIKE_TIME_BASE". This may also require
+    minor edits to the model spec files to accommodate the new skim name.
+
 
 Examples
 ~~~~~~~~
 
+Example simple test configurations and inputs for two and three-zone system models are described below.
+
 To run the two zone and three zone system examples, do the following:
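As a companion to the documentation above (not part of the diff), the snippet below sketches a pre-flight check a user could run against their own skim files before starting a model run. It relies only on `openmatrix.open_file` and `listMatrices`, the same calls used elsewhere in this changeset; the function name and the file path are placeholders.

```python
from collections import defaultdict

import openmatrix


def report_skim_name_conflicts(omx_paths):
    """Print skim names that appear both with and without a __PERIOD suffix."""
    names = set()
    for path in omx_paths:
        with openmatrix.open_file(path, mode="r") as omx_file:
            names.update(omx_file.listMatrices())
    grouped = defaultdict(set)
    for name in names:
        base, sep, _period = name.partition("__")
        if sep:
            grouped[base].add(name)
    for base, variants in sorted(grouped.items()):
        if base in names:
            print(f"conflict: {base} also appears as {sorted(variants)}")


report_skim_name_conflicts(["data/skims1.omx"])  # placeholder path
```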
diff --git a/test/test_skim_name_conflicts.py b/test/test_skim_name_conflicts.py
new file mode 100644
index 000000000..eafd5ee42
--- /dev/null
+++ b/test/test_skim_name_conflicts.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import os
+import shutil
+from importlib.resources import files
+from pathlib import Path
+
+import openmatrix
+import pytest
+
+import activitysim.abm  # noqa: F401
+from activitysim.core import workflow
+
+
+def example_path(dirname):
+    resource = files("activitysim.examples.placeholder_sandag").joinpath(dirname)
+    return str(resource)
+
+
+def mtc_example_path(dirname):
+    resource = files("activitysim.examples.prototype_mtc").joinpath(dirname)
+    return str(resource)
+
+
+def psrc_example_path(dirname):
+    resource = files("activitysim.examples.placeholder_psrc").joinpath(dirname)
+    return str(resource)
+
+
+@pytest.fixture(scope="session")
+def example_data_dir(tmp_path_factory) -> Path:
+    """Fixture to provide the path to the example data directory."""
+    td = tmp_path_factory.mktemp("skim-conflict-data")
+    shutil.copytree(example_path("data_2"), td.joinpath("data_2"))
+    shutil.copy(
+        example_path(os.path.join("data_3", "maz_to_maz_bike.csv")),
+        td.joinpath("data_2"),
+    )
+
+    # add extra skims to the OMX file to create a conflict
+    with openmatrix.open_file(td.joinpath("data_2").joinpath("skims1.omx"), "a") as omx:
+        for t in ["EA", "AM", "MD", "PM", "EV"]:
+            # Create a new matrix for each time period
+            omx.createMatrix(f"DISTBIKE__{t}", obj=omx["DISTBIKE"][:])
+
+    return td.joinpath("data_2")
+
+
+def test_skim_name_conflicts(example_data_dir, tmp_path_factory):
+    # when sharrow is required, the run should fail due to conflicting skim names
+    state = workflow.State.make_default(
+        data_dir=example_data_dir,
+        configs_dir=(
+            example_path("configs_2_zone"),
+            psrc_example_path("configs"),
+        ),
+        output_dir=tmp_path_factory.mktemp("out-fail"),
+        settings={
+            "households_sample_size": 20,
+            "sharrow": "require",
+            "disable_zarr": True,
+        },
+    )
+    with pytest.raises(ValueError):
+        state.run(
+            [
+                "initialize_landuse",
+                "initialize_households",
+            ]
+        )
+
+
+def test_skim_name_conflicts_no_sharrow(example_data_dir, tmp_path_factory):
+    # when sharrow is disabled, the run should warn about conflicting skim names but not fail
+    state = workflow.State.make_default(
+        data_dir=example_data_dir,
+        configs_dir=(
+            example_path("configs_2_zone"),
+            psrc_example_path("configs"),
+        ),
+        output_dir=tmp_path_factory.mktemp("out-pass"),
+        settings={
+            "households_sample_size": 20,
+            "sharrow": False,
+            "disable_zarr": True,
+        },
+    )
+    # Run the beginning of the workflow with the modified settings; it should only warn about the conflict
+    with pytest.warns(
+        UserWarning,
+        match="some skims have both time-dependent and time-agnostic versions",
+    ):
+        state.run(
+            [
+                "initialize_landuse",
+                "initialize_households",
+            ]
+        )
+
+
+@pytest.mark.parametrize("solution", ["^DISTBIKE$", "^DISTBIKE__.+"])
+def test_skim_name_conflicts_ok(example_data_dir, tmp_path_factory, solution):
+    # when sharrow is required, and omx_ignore_patterns is set correctly,
+    # the run should work without raising an error
+    state = workflow.State.make_default(
+        data_dir=example_data_dir,
+        configs_dir=(
+            example_path("configs_2_zone"),
+            psrc_example_path("configs"),
+        ),
+        output_dir=tmp_path_factory.mktemp("out-solved"),
+        settings={
+            "households_sample_size": 20,
+            "sharrow": "require",
+            "disable_zarr": True,
+            "omx_ignore_patterns": [solution],
+        },
+    )
+    state.run(
+        [
+            "initialize_landuse",
+            "initialize_households",
+        ]
+    )
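Finally, a condensed sketch (not part of the diff) of what the resolved configuration can look like when driving ActivitySim programmatically, mirroring `test_skim_name_conflicts_ok` above; the directory names are placeholders that would normally point at a real model setup, and in a YAML settings file the same fix is the `omx_ignore_patterns` list shown in the error message.

```python
from activitysim.core import workflow

state = workflow.State.make_default(
    data_dir="data",            # placeholder: directory containing the OMX skims
    configs_dir=("configs",),   # placeholder: model configuration directories
    output_dir="output",        # placeholder: where outputs and logs are written
    settings={
        "sharrow": "require",
        # resolve the DISTBIKE conflict by ignoring its time-dependent copies
        "omx_ignore_patterns": ["^DISTBIKE__.+"],
    },
)
state.run(["initialize_landuse", "initialize_households"])
```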