diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index dd5e279b39..7e9a76d551 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -17,6 +17,7 @@ ) from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.core.util import reindex from .util import estimation from .util import logsums as logsum @@ -138,15 +139,8 @@ def _location_sample( logger.info("Running %s with %d persons" % (trace_label, len(choosers.index))) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting("disable_destination_sampling", False) or ( - estimator and estimator.want_unsampled_alternatives - ): - # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count - logger.info( - "Estimation mode for %s using unsampled alternatives short_circuit_choices" - % (trace_label,) - ) - sample_size = 0 + if estimator: + sample_size = model_settings.get("ESTIMATION_SAMPLE_SIZE", 0) locals_d = { "skims": skims, @@ -154,6 +148,8 @@ def _location_sample( "orig_col_name": skims.orig_key, # added for sharrow flows "dest_col_name": skims.dest_key, # added for sharrow flows "timeframe": "timeless", + "reindex": reindex, + "land_use": inject.get_table("land_use").to_frame(), } constants = config.get_model_constants(model_settings) locals_d.update(constants) @@ -470,6 +466,38 @@ def run_location_sample( trace_label=trace_label, ) + # FIXME temporary code to ensure sampled alternative is in choices for estimation + # Hack to get shorter run times when you don't care about creating EDB for location choice models + if estimator: + # grabbing survey values + survey_persons = estimation.manager.get_survey_table("persons") + if "school_location" in trace_label: + survey_choices = survey_persons["school_zone_id"].reset_index() + elif ("workplace_location" in 
trace_label) and ("external" not in trace_label): + survey_choices = survey_persons["workplace_zone_id"].reset_index() + else: + return choices + survey_choices.columns = ["person_id", "alt_dest"] + survey_choices = survey_choices[ + survey_choices["person_id"].isin(choices.index) + & (survey_choices.alt_dest > 0) + ] + # merging survey destination into table if not available + joined_data = survey_choices.merge( + choices, on=["person_id", "alt_dest"], how="left", indicator=True + ) + missing_rows = joined_data[joined_data["_merge"] == "left_only"].copy() + missing_rows["pick_count"] = 1 + if len(missing_rows) > 0: + new_choices = missing_rows[ + ["person_id", "alt_dest", "prob", "pick_count"] + ].set_index("person_id") + choices = pd.concat([choices, new_choices]).sort_index() + # making probability the mean of all other sampled destinations by person + choices["prob"] = choices["prob"].fillna( + choices.groupby("person_id")["prob"].transform("mean") + ) + return choices @@ -601,6 +629,8 @@ def run_location_simulate( "orig_col_name": skims.orig_key, # added for sharrow flows "dest_col_name": skims.dest_key, # added for sharrow flows "timeframe": "timeless", + "reindex": reindex, + "land_use": inject.get_table("land_use").to_frame(), } constants = config.get_model_constants(model_settings) if constants is not None: @@ -808,6 +838,24 @@ def run_location_choice( ) tracing.trace_df(choices_df, estimation_trace_label) + if want_logsums and (not skip_choice): + # grabbing index, could be person_id or proto_person_id + index_name = choices_df.index.name + # merging mode choice logsum of chosen alternative to choices + choices_df = ( + pd.merge( + choices_df.reset_index(), + location_sample_df.reset_index()[ + [index_name, model_settings["ALT_DEST_COL_NAME"], ALT_LOGSUM] + ], + how="left", + left_on=[index_name, "choice"], + right_on=[index_name, model_settings["ALT_DEST_COL_NAME"]], + ) + .drop(columns=model_settings["ALT_DEST_COL_NAME"]) + .set_index(index_name) +
) + choices_list.append(choices_df) if want_sample_table: @@ -825,7 +873,7 @@ def run_location_choice( else: # this will only happen with small samples (e.g. singleton) with no (e.g.) school segs logger.warning("%s no choices", trace_label) - choices_df = pd.DataFrame(columns=["choice", "logsum"]) + choices_df = pd.DataFrame(columns=["choice", "logsum", ALT_LOGSUM]) if len(sample_list) > 0: save_sample_df = pd.concat(sample_list) @@ -869,7 +917,8 @@ def iterate_location_choice( Returns ------- adds choice column model_settings['DEST_CHOICE_COLUMN_NAME'] - adds logsum column model_settings['DEST_CHOICE_LOGSUM_COLUMN_NAME']- if provided + adds destination choice logsum column model_settings['DEST_CHOICE_LOGSUM_COLUMN_NAME']- if provided + adds mode choice logsum to selected destination column model_settings['MODE_CHOICE_LOGSUM_COLUMN_NAME']- if provided adds annotations to persons table """ @@ -879,7 +928,11 @@ def iterate_location_choice( chooser_filter_column = model_settings["CHOOSER_FILTER_COLUMN_NAME"] dest_choice_column_name = model_settings["DEST_CHOICE_COLUMN_NAME"] - logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") + dc_logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") + mc_logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") + want_logsums = (dc_logsum_column_name is not None) | ( + mc_logsum_column_name is not None + ) sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") want_sample_table = ( @@ -929,7 +982,7 @@ def iterate_location_choice( persons_merged_df_, network_los, shadow_price_calculator=spc, - want_logsums=logsum_column_name is not None, + want_logsums=want_logsums, want_sample_table=want_sample_table, estimator=estimator, model_settings=model_settings, @@ -1005,10 +1058,15 @@ def iterate_location_choice( ) # add the dest_choice_logsum column to persons dataframe - if logsum_column_name: - persons_df[logsum_column_name] = ( + if dc_logsum_column_name: + 
persons_df[dc_logsum_column_name] = ( choices_df["logsum"].reindex(persons_df.index).astype("float") ) + # add the mode choice logsum column to persons dataframe + if mc_logsum_column_name: + persons_df[mc_logsum_column_name] = ( + choices_df[ALT_LOGSUM].reindex(persons_df.index).astype("float") + ) if save_sample_df is not None: # might be None for tiny samples even if sample_table_name was specified @@ -1047,9 +1105,13 @@ def iterate_location_choice( if trace_hh_id: tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) - if logsum_column_name: + if dc_logsum_column_name: + tracing.print_summary( + dc_logsum_column_name, choices_df["logsum"], value_counts=True + ) + if mc_logsum_column_name: tracing.print_summary( - logsum_column_name, choices_df["logsum"], value_counts=True + mc_logsum_column_name, choices_df[ALT_LOGSUM], value_counts=True ) return persons_df diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py index 74a426e714..9a3e54d27b 100644 --- a/activitysim/estimation/larch/location_choice.py +++ b/activitysim/estimation/larch/location_choice.py @@ -1,6 +1,8 @@ import os from pathlib import Path from typing import Collection +import pickle +from datetime import datetime import numpy as np import pandas as pd @@ -44,6 +46,8 @@ def location_choice_model( settings_file="{name}_model_settings.yaml", landuse_file="{name}_landuse.csv", return_data=False, + alt_values_to_feather=False, + chunking_size=None, ): model_selector = name.replace("_location", "") model_selector = model_selector.replace("_destination", "") @@ -57,12 +61,42 @@ def _read_csv(filename, **kwargs): filename = filename.format(name=name) return pd.read_csv(os.path.join(edb_directory, filename), **kwargs) + def _read_feather(filename, **kwargs): + filename = filename.format(name=name) + return pd.read_feather(os.path.join(edb_directory, filename), **kwargs) + + def _to_feather(df, filename, **kwargs): + filename = 
filename.format(name=name) + return df.to_feather(os.path.join(edb_directory, filename), **kwargs) + + def _read_pickle(filename, **kwargs): + filename = filename.format(name=name) + return pd.read_pickle(os.path.join(edb_directory, filename)) + + def _to_pickle(df, filename, **kwargs): + filename = filename.format(name=name) + return df.to_pickle(os.path.join(edb_directory, filename)) + + def _file_exists(filename): + filename = filename.format(name=name) + return os.path.exists(os.path.join(edb_directory, filename)) + coefficients = _read_csv( coefficients_file, index_col="coefficient_name", ) spec = _read_csv(spec_file, comment="#") - alt_values = _read_csv(alt_values_file) + + # read alternative values either as csv or feather file + alt_values_fea_file = alt_values_file.replace(".csv", ".fea") + if os.path.exists( + os.path.join(edb_directory, alt_values_fea_file.format(name=name)) + ): + alt_values = _read_feather(alt_values_fea_file) + else: + alt_values = _read_csv(alt_values_file) + if alt_values_to_feather: + _to_feather(df=alt_values, filename=alt_values_fea_file) chooser_data = _read_csv(chooser_file) landuse = _read_csv(landuse_file, index_col="zone_id") master_size_spec = _read_csv(size_spec_file) @@ -106,6 +140,9 @@ def _read_csv(filename, **kwargs): .set_index("segment") ) size_spec = size_spec.loc[:, size_spec.max() > 0] + assert ( + len(size_spec) > 0 + ), f"Empty size_spec, is model_selector {SIZE_TERM_SELECTOR} in your size term file?" 
size_coef = size_coefficients_from_spec(size_spec) @@ -148,7 +185,48 @@ def _read_csv(filename, **kwargs): chooser_index_name = chooser_data.columns[0] x_co = chooser_data.set_index(chooser_index_name) - x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]])) + + def split(a, n): + k, m = divmod(len(a), n) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + + # process x_ca with cv_to_ca with or without chunking + x_ca_pickle_file = "{name}_x_ca.pkl" + if chunking_size is None: + x_ca = cv_to_ca( + alt_values.set_index([chooser_index_name, alt_values.columns[1]]) + ) + elif _file_exists(x_ca_pickle_file): + # if pickle file from previous x_ca processing exist, load it to save time + time_start = datetime.now() + x_ca = _read_pickle(x_ca_pickle_file) + print( + f"x_ca data loaded from {name}_x_ca.pkl - time elapsed {(datetime.now() - time_start).total_seconds()}" + ) + else: + time_start = datetime.now() + # calculate num_chunks based on chunking_size (or max number of rows per chunk) + num_chunks = max(1, len(alt_values) // chunking_size) + all_person_ids = list(alt_values["person_id"].unique()) + split_ids = list(split(all_person_ids, num_chunks)) + x_ca_list = [] + i = 0 + for chunk_ids in split_ids: + alt_values_i = alt_values[alt_values["person_id"].isin(chunk_ids)] + x_ca_i = cv_to_ca( + alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]]) + ) + x_ca_list.append(x_ca_i) + print( + f"\rx_ca_i compute done for chunk {i}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}" + ) + i = i + 1 + x_ca = pd.concat(x_ca_list, axis=0) + # save final x_ca result as pickle file to save time for future data loading + _to_pickle(df=x_ca, filename=x_ca_pickle_file) + print( + f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}" + ) if CHOOSER_SEGMENT_COLUMN_NAME is not None: # label segments with names @@ -214,6 +292,9 @@ def _read_csv(filename, 
**kwargs): else: av = 1 + assert len(x_co) > 0, "Empty chooser dataframe" + assert len(x_ca_1) > 0, "Empty alternatives dataframe" + d = DataFrames(co=x_co, ca=x_ca_1, av=av) m = Model(dataservice=d) @@ -331,6 +412,14 @@ def workplace_location_model(**kwargs): ) +def external_workplace_location_model(**kwargs): + unused = kwargs.pop("name", None) + return location_choice_model( + name="external_workplace_location", + **kwargs, + ) + + def school_location_model(**kwargs): unused = kwargs.pop("name", None) return location_choice_model( @@ -367,6 +456,14 @@ def non_mandatory_tour_destination_model(**kwargs): ) +def external_non_mandatory_destination_model(**kwargs): + unused = kwargs.pop("name", None) + return location_choice_model( + name="external_non_mandatory_destination", + **kwargs, + ) + + def trip_destination_model(**kwargs): unused = kwargs.pop("name", None) return location_choice_model(