96 changes: 79 additions & 17 deletions activitysim/abm/models/location_choice.py
@@ -17,6 +17,7 @@
)
from activitysim.core.interaction_sample import interaction_sample
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
from activitysim.core.util import reindex

from .util import estimation
from .util import logsums as logsum
@@ -138,22 +139,17 @@ def _location_sample(
logger.info("Running %s with %d persons" % (trace_label, len(choosers.index)))

sample_size = model_settings["SAMPLE_SIZE"]
if config.setting("disable_destination_sampling", False) or (
estimator and estimator.want_unsampled_alternatives
):
# FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count
logger.info(
"Estimation mode for %s using unsampled alternatives short_circuit_choices"
% (trace_label,)
)
sample_size = 0
if estimator:
sample_size = model_settings.get("ESTIMATION_SAMPLE_SIZE", 0)
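# ESTIMATION_SAMPLE_SIZE (if provided) allows sampled alternatives even in
# estimation mode; the default of 0 keeps the full unsampled alternative set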

locals_d = {
"skims": skims,
"segment_size": segment_name,
"orig_col_name": skims.orig_key, # added for sharrow flows
"dest_col_name": skims.dest_key, # added for sharrow flows
"timeframe": "timeless",
"reindex": reindex,
"land_use": inject.get_table("land_use").to_frame(),
}
constants = config.get_model_constants(model_settings)
locals_d.update(constants)
@@ -470,6 +466,38 @@ def run_location_sample(
trace_label=trace_label,
)

# FIXME temporary code to ensure sampled alternative is in choices for estimation
# Hack to get shorter run times when you don't care about creating EDB for location choice models
if estimator:
# grabbing survey values
survey_persons = estimation.manager.get_survey_table("persons")
if "school_location" in trace_label:
survey_choices = survey_persons["school_zone_id"].reset_index()
elif ("workplace_location" in trace_label) and ("external" not in trace_label):
survey_choices = survey_persons["workplace_zone_id"].reset_index()
else:
return choices
survey_choices.columns = ["person_id", "alt_dest"]
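# keep only choosers in this batch that report a valid (positive) survey destination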
survey_choices = survey_choices[
survey_choices["person_id"].isin(choices.index)
& (survey_choices.alt_dest > 0)
]
# merging survey destination into table if not available
joined_data = survey_choices.merge(
choices, on=["person_id", "alt_dest"], how="left", indicator=True
)
missing_rows = joined_data[joined_data["_merge"] == "left_only"].copy()
missing_rows["pick_count"] = 1
if len(missing_rows) > 0:
new_choices = missing_rows[
["person_id", "alt_dest", "prob", "pick_count"]
].set_index("person_id")
choices = pd.concat([choices, new_choices]).sort_index()
# making probability the mean of all other sampled destinations by person
choices["prob"] = choices["prob"].fillna(
choices.groupby("person_id")["prob"].transform("mean")
)

return choices


@@ -601,6 +629,8 @@ def run_location_simulate(
"orig_col_name": skims.orig_key, # added for sharrow flows
"dest_col_name": skims.dest_key, # added for sharrow flows
"timeframe": "timeless",
"reindex": reindex,
"land_use": inject.get_table("land_use").to_frame(),
}
constants = config.get_model_constants(model_settings)
if constants is not None:
@@ -808,6 +838,24 @@ def run_location_choice(
)
tracing.trace_df(choices_df, estimation_trace_label)

if want_logsums and not skip_choice:
# grabbing index, could be person_id or proto_person_id
index_name = choices_df.index.name
# merging mode choice logsum of chosen alternative to choices
choices_df = (
pd.merge(
choices_df.reset_index(),
location_sample_df.reset_index()[
[index_name, model_settings["ALT_DEST_COL_NAME"], ALT_LOGSUM]
],
how="left",
left_on=[index_name, "choice"],
right_on=[index_name, model_settings["ALT_DEST_COL_NAME"]],
)
.drop(columns=model_settings["ALT_DEST_COL_NAME"])
.set_index(index_name)
)

choices_list.append(choices_df)

if want_sample_table:
@@ -825,7 +873,7 @@
else:
# this will only happen with small samples (e.g. singleton) with no (e.g.) school segs
logger.warning("%s no choices", trace_label)
choices_df = pd.DataFrame(columns=["choice", "logsum"])
choices_df = pd.DataFrame(columns=["choice", "logsum", ALT_LOGSUM])

if len(sample_list) > 0:
save_sample_df = pd.concat(sample_list)
@@ -869,7 +917,8 @@ def iterate_location_choice(
Returns
-------
adds choice column model_settings['DEST_CHOICE_COLUMN_NAME']
adds logsum column model_settings['DEST_CHOICE_LOGSUM_COLUMN_NAME']- if provided
adds destination choice logsum column model_settings['DEST_CHOICE_LOGSUM_COLUMN_NAME'] - if provided
adds mode choice logsum column (for the chosen destination) model_settings['MODE_CHOICE_LOGSUM_COLUMN_NAME'] - if provided
adds annotations to persons table
"""

@@ -879,7 +928,11 @@
chooser_filter_column = model_settings["CHOOSER_FILTER_COLUMN_NAME"]

dest_choice_column_name = model_settings["DEST_CHOICE_COLUMN_NAME"]
logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME")
dc_logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME")
mc_logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME")
want_logsums = (dc_logsum_column_name is not None) or (
mc_logsum_column_name is not None
)

sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME")
want_sample_table = (
@@ -929,7 +982,7 @@
persons_merged_df_,
network_los,
shadow_price_calculator=spc,
want_logsums=logsum_column_name is not None,
want_logsums=want_logsums,
want_sample_table=want_sample_table,
estimator=estimator,
model_settings=model_settings,
@@ -1005,10 +1058,15 @@ def iterate_location_choice(
)

# add the dest_choice_logsum column to persons dataframe
if logsum_column_name:
persons_df[logsum_column_name] = (
if dc_logsum_column_name:
persons_df[dc_logsum_column_name] = (
choices_df["logsum"].reindex(persons_df.index).astype("float")
)
# add the mode choice logsum column to persons dataframe
if mc_logsum_column_name:
persons_df[mc_logsum_column_name] = (
choices_df[ALT_LOGSUM].reindex(persons_df.index).astype("float")
)

if save_sample_df is not None:
# might be None for tiny samples even if sample_table_name was specified
@@ -1047,9 +1105,13 @@
if trace_hh_id:
tracing.trace_df(households_df, label=trace_label, warn_if_empty=True)

if logsum_column_name:
if dc_logsum_column_name:
tracing.print_summary(
dc_logsum_column_name, choices_df["logsum"], value_counts=True
)
if mc_logsum_column_name:
tracing.print_summary(
logsum_column_name, choices_df["logsum"], value_counts=True
mc_logsum_column_name, choices_df[ALT_LOGSUM], value_counts=True
)

return persons_df
101 changes: 99 additions & 2 deletions activitysim/estimation/larch/location_choice.py
@@ -1,6 +1,8 @@
import os
from pathlib import Path
from typing import Collection
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
@@ -44,6 +46,8 @@ def location_choice_model(
settings_file="{name}_model_settings.yaml",
landuse_file="{name}_landuse.csv",
return_data=False,
alt_values_to_feather=False,
chunking_size=None,
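# alt_values_to_feather: cache the alternatives table as a .fea file after
# the first csv read; chunking_size: approximate max rows per chunk when
# converting alt_values with cv_to_ca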
):
model_selector = name.replace("_location", "")
model_selector = model_selector.replace("_destination", "")
@@ -57,12 +61,42 @@ def _read_csv(filename, **kwargs):
filename = filename.format(name=name)
return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)

def _read_feather(filename, **kwargs):
filename = filename.format(name=name)
return pd.read_feather(os.path.join(edb_directory, filename), **kwargs)

def _to_feather(df, filename, **kwargs):
filename = filename.format(name=name)
return df.to_feather(os.path.join(edb_directory, filename), **kwargs)

def _read_pickle(filename, **kwargs):
filename = filename.format(name=name)
return pd.read_pickle(os.path.join(edb_directory, filename), **kwargs)

def _to_pickle(df, filename, **kwargs):
filename = filename.format(name=name)
return df.to_pickle(os.path.join(edb_directory, filename), **kwargs)

def _file_exists(filename):
filename = filename.format(name=name)
return os.path.exists(os.path.join(edb_directory, filename))

coefficients = _read_csv(
coefficients_file,
index_col="coefficient_name",
)
spec = _read_csv(spec_file, comment="#")
alt_values = _read_csv(alt_values_file)

# read alternative values either as csv or feather file
alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
if os.path.exists(
os.path.join(edb_directory, alt_values_fea_file.format(name=name))
):
alt_values = _read_feather(alt_values_fea_file)
else:
alt_values = _read_csv(alt_values_file)
if alt_values_to_feather:
_to_feather(df=alt_values, filename=alt_values_fea_file)
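# feather round-trips much faster than csv parsing, so later runs
# load the (typically large) alternatives table quickly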
chooser_data = _read_csv(chooser_file)
landuse = _read_csv(landuse_file, index_col="zone_id")
master_size_spec = _read_csv(size_spec_file)
@@ -106,6 +140,9 @@ def _read_csv(filename, **kwargs):
.set_index("segment")
)
size_spec = size_spec.loc[:, size_spec.max() > 0]
assert (
len(size_spec) > 0
), f"Empty size_spec, is model_selector {SIZE_TERM_SELECTOR} in your size term file?"

size_coef = size_coefficients_from_spec(size_spec)

@@ -148,7 +185,48 @@ def _read_csv(filename, **kwargs):

chooser_index_name = chooser_data.columns[0]
x_co = chooser_data.set_index(chooser_index_name)
x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))

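# split sequence a into n roughly equal chunks; the first len(a) % n chunks
# get one extra element, e.g. list(split([1, 2, 3, 4, 5], 2)) -> [[1, 2, 3], [4, 5]]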
def split(a, n):
k, m = divmod(len(a), n)
return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

# process x_ca with cv_to_ca with or without chunking
x_ca_pickle_file = "{name}_x_ca.pkl"
if chunking_size is None:
x_ca = cv_to_ca(
alt_values.set_index([chooser_index_name, alt_values.columns[1]])
)
elif _file_exists(x_ca_pickle_file):
# if a pickle file from previous x_ca processing exists, load it to save time
time_start = datetime.now()
x_ca = _read_pickle(x_ca_pickle_file)
print(
f"x_ca data loaded from {name}_x_ca.pkl - time elapsed {(datetime.now() - time_start).total_seconds()}"
)
else:
time_start = datetime.now()
# calculate num_chunks based on chunking_size (or max number of rows per chunk)
num_chunks = max(1, (len(alt_values) + chunking_size - 1) // chunking_size)
all_chooser_ids = list(alt_values[chooser_index_name].unique())
split_ids = list(split(all_chooser_ids, num_chunks))
x_ca_list = []
for i, chunk_ids in enumerate(split_ids):
alt_values_i = alt_values[alt_values[chooser_index_name].isin(chunk_ids)]
x_ca_i = cv_to_ca(
alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]])
)
x_ca_list.append(x_ca_i)
print(
f"x_ca_i compute done for chunk {i + 1}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}"
)
x_ca = pd.concat(x_ca_list, axis=0)
# save final x_ca result as pickle file to save time for future data loading
_to_pickle(df=x_ca, filename=x_ca_pickle_file)
print(
f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}"
)

if CHOOSER_SEGMENT_COLUMN_NAME is not None:
# label segments with names
@@ -214,6 +292,9 @@ def _read_csv(filename, **kwargs):
else:
av = 1

assert len(x_co) > 0, "Empty chooser dataframe"
assert len(x_ca_1) > 0, "Empty alternatives dataframe"

d = DataFrames(co=x_co, ca=x_ca_1, av=av)

m = Model(dataservice=d)
@@ -331,6 +412,14 @@ def workplace_location_model(**kwargs):
)


def external_workplace_location_model(**kwargs):
unused = kwargs.pop("name", None)
return location_choice_model(
name="external_workplace_location",
**kwargs,
)


def school_location_model(**kwargs):
unused = kwargs.pop("name", None)
return location_choice_model(
@@ -367,6 +456,14 @@ def non_mandatory_tour_destination_model(**kwargs):
)


def external_non_mandatory_destination_model(**kwargs):
unused = kwargs.pop("name", None)
return location_choice_model(
name="external_non_mandatory_destination",
**kwargs,
)


def trip_destination_model(**kwargs):
unused = kwargs.pop("name", None)
return location_choice_model(