add alt_values_to_feather option

bwentl · bwentl · commit 0e5d3bf56fef · 2023-06-23T09:57:57.000-07:00
and update comments
Default behavior does not change. To enable the use of feather
files and the use of chunking, do this:

    # load from original
    model, data = component_model(modelname,
                                  return_data=True,
                                  alt_values_to_feather=True,
                                  chunking_size=chunking_size)
diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py
@@ -46,6 +46,7 @@ def location_choice_model(
     settings_file="{name}_model_settings.yaml",
     landuse_file="{name}_landuse.csv",
     return_data=False,
+    alt_values_to_feather=False,
     chunking_size=None,
 ):
     model_selector = name.replace("_location", "")
@@ -85,12 +86,15 @@ def _file_exists(filename):
         index_col="coefficient_name",
     )
     spec = _read_csv(spec_file, comment="#")
+
+    # read alternative values either as csv or feather file
     alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
     if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))):
         alt_values = _read_feather(alt_values_fea_file)
     else:
         alt_values = _read_csv(alt_values_file)
-        _to_feather(df=alt_values, filename=alt_values_fea_file)
+        if alt_values_to_feather:
+            _to_feather(df=alt_values, filename=alt_values_fea_file)
     chooser_data = _read_csv(chooser_file)
     landuse = _read_csv(landuse_file, index_col="zone_id")
     master_size_spec = _read_csv(size_spec_file)
@@ -181,10 +185,12 @@ def split(a, n):
         k, m = divmod(len(a), n)
         return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
+    # process x_ca with cv_to_ca with or without chunking
     x_ca_pickle_file = "{name}_x_ca.pkl"
     if chunking_size == None:
         x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
     elif _file_exists(x_ca_pickle_file):
+        # if a pickle file from previous x_ca processing exists, load it to save time
         time_start = datetime.now()
         x_ca = _read_pickle(x_ca_pickle_file)
         print(
@@ -208,6 +214,7 @@ def split(a, n):
             )
             i = i + 1
         x_ca = pd.concat(x_ca_list, axis=0)
+        # save final x_ca result as pickle file to save time for future data loading
         _to_pickle(df=x_ca, filename=x_ca_pickle_file)
         print(
             f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}")