Skip to content

Commit 9f81a89

Browse files
committed
implement poisson sampling
- restructure to avoid order reshuffling
1 parent 48ff088 commit 9f81a89

9 files changed

Lines changed: 199 additions & 76 deletions

activitysim/abm/models/location_choice.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from activitysim.core.interaction_sample import interaction_sample
1919
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
20+
from activitysim.core.logit import AltsContext
2021
from activitysim.core.util import reindex
2122
from activitysim.core.exceptions import DuplicateWorkflowTableError
2223

@@ -603,6 +604,7 @@ def run_location_simulate(
603604
chunk_tag,
604605
trace_label,
605606
skip_choice=False,
607+
alts_context: AltsContext | None = None,
606608
):
607609
"""
608610
run location model on location_sample annotated with mode_choice logsum
@@ -712,6 +714,7 @@ def run_location_simulate(
712714
compute_settings=model_settings.compute_settings.subcomponent_settings(
713715
"simulate"
714716
),
717+
alts_context=alts_context,
715718
)
716719

717720
if not want_logsums:
@@ -788,6 +791,11 @@ def run_location_choice(
788791
if choosers.shape[0] == 0:
789792
logger.info(f"{trace_label} skipping segment {segment_name}: no choosers")
790793
continue
794+
# using land use rather than size terms in case something goes 0 base -> nonzero project, double
795+
# check if that would be in dest_size_terms as a zero
796+
alts_context = AltsContext.from_series(dest_size_terms.index) # index zone_id, not ALT_DEST_COL_NAME
797+
# assumes that dest_size_terms will always contain zeros for non-attractive zones, i.e. it will have the
798+
# same length as land_use
791799

792800
# - location_sample
793801
location_sample_df = run_location_sample(
@@ -841,6 +849,7 @@ def run_location_choice(
841849
trace_label, "simulate.%s" % segment_name
842850
),
843851
skip_choice=skip_choice,
852+
alts_context=alts_context,
844853
)
845854

846855
if estimator:

activitysim/abm/models/parking_location_choice.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from activitysim.core.configuration.base import PreprocessorSettings
2222
from activitysim.core.configuration.logit import LogitComponentSettings
2323
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
24+
from activitysim.core.logit import AltsContext
2425
from activitysim.core.tracing import print_elapsed_time
2526
from activitysim.core.util import assign_in_place, drop_unused_columns
2627
from activitysim.core.exceptions import DuplicateWorkflowTableError
@@ -112,6 +113,7 @@ def parking_destination_simulate(
112113
chunk_size,
113114
trace_hh_id,
114115
trace_label,
116+
alts_context: AltsContext | None = None,
115117
):
116118
"""
117119
Chose destination from destination_sample (with od_logsum and dp_logsum columns added)
@@ -150,6 +152,7 @@ def parking_destination_simulate(
150152
trace_label=trace_label,
151153
trace_choice_name="parking_loc",
152154
explicit_chunk_size=model_settings.explicit_chunk,
155+
alts_context=alts_context,
153156
)
154157

155158
# drop any failed zero_prob destinations
@@ -211,6 +214,9 @@ def choose_parking_location(
211214
)
212215
destination_sample.index = np.repeat(trips.index.values, len(alternatives))
213216
destination_sample.index.name = trips.index.name
217+
# using destination_sample would also be right because destination_sample isn't a sample here,
218+
# but that could change
219+
alts_context = AltsContext.from_series(alternatives[alt_dest_col_name])
214220

215221
destinations = parking_destination_simulate(
216222
state,
@@ -223,6 +229,7 @@ def choose_parking_location(
223229
chunk_size=chunk_size,
224230
trace_hh_id=trace_hh_id,
225231
trace_label=trace_label,
232+
alts_context=alts_context
226233
)
227234

228235
if want_sample_table:

activitysim/abm/models/trip_destination.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from activitysim.core.configuration.logit import LocationComponentSettings
3333
from activitysim.core.interaction_sample import interaction_sample
3434
from activitysim.core.interaction_sample_simulate import interaction_sample_simulate
35+
from activitysim.core.logit import AltsContext
3536
from activitysim.core.skim_dictionary import DataFrameMatrix
3637
from activitysim.core.tracing import print_elapsed_time
3738
from activitysim.core.util import assign_in_place, reindex
@@ -950,6 +951,7 @@ def trip_destination_simulate(
950951
skim_hotel,
951952
estimator,
952953
trace_label,
954+
alts_context: AltsContext | None = None,
953955
):
954956
"""
955957
Chose destination from destination_sample (with od_logsum and dp_logsum columns added)
@@ -1036,6 +1038,7 @@ def trip_destination_simulate(
10361038
trace_choice_name="trip_dest",
10371039
estimator=estimator,
10381040
explicit_chunk_size=model_settings.explicit_chunk,
1041+
alts_context=alts_context,
10391042
)
10401043

10411044
if not want_logsums:
@@ -1126,7 +1129,10 @@ def choose_trip_destination(
11261129
destination_sample["dp_logsum"] = 0.0
11271130

11281131
t0 = print_elapsed_time("%s.compute_logsums" % trace_label, t0, debug=True)
1129-
1132+
alt_dest_col_name = model_settings.ALT_DEST_COL_NAME
1133+
alts = alternatives.index
1134+
assert alts.name == alt_dest_col_name
1135+
alts_context = AltsContext.from_series(alts)
11301136
destinations = trip_destination_simulate(
11311137
state,
11321138
primary_purpose=primary_purpose,
@@ -1138,6 +1144,7 @@ def choose_trip_destination(
11381144
skim_hotel=skim_hotel,
11391145
estimator=estimator,
11401146
trace_label=trace_label,
1147+
alts_context=alts_context,
11411148
)
11421149

11431150
dropped_trips = ~trips.index.isin(destinations.index)

activitysim/abm/models/trip_scheduling_choice.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
from activitysim.core.configuration.logit import LogitComponentSettings
2222
from activitysim.core.interaction_sample_simulate import _interaction_sample_simulate
23+
from activitysim.core.logit import AltsContext
2324
from activitysim.core.skim_dataset import SkimDataset
2425
from activitysim.core.skim_dictionary import SkimDict
2526

@@ -314,6 +315,7 @@ def run_trip_scheduling_choice(
314315
estimator=None,
315316
chunk_sizer=chunk_sizer,
316317
compute_settings=model_settings.compute_settings,
318+
alts_context= AltsContext(schedules[SCHEDULE_ID].min(), schedules[SCHEDULE_ID].max()),
317319
)
318320

319321
assert len(choices.index) == len(choosers.index)

activitysim/core/interaction_sample.py

Lines changed: 93 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
from __future__ import annotations
44

55
import logging
6+
import typing
67

78
import numpy as np
89
import pandas as pd
9-
1010
from activitysim.core import (
1111
chunk,
1212
estimation,
@@ -17,16 +17,34 @@
1717
util,
1818
workflow,
1919
)
20+
from activitysim.core.chunk import ChunkSizer
2021
from activitysim.core.configuration.base import ComputeSettings
2122
from activitysim.core.exceptions import SegmentedSpecificationError
2223
from activitysim.core.skim_dataset import DatasetWrapper
2324
from activitysim.core.skim_dictionary import SkimWrapper
25+
if typing.TYPE_CHECKING:
26+
from activitysim.core.random import Random
2427

2528
logger = logging.getLogger(__name__)
2629

2730
DUMP = False
2831

2932

33+
def _poisson_sample_alternatives_inner(
34+
alternative_count: int,
35+
probs: pd.DataFrame,
36+
poisson_inclusion_probs: pd.DataFrame,
37+
rng: Random,
38+
trace_label: str | None,
39+
chunk_sizer:ChunkSizer,
40+
) -> pd.DataFrame:
41+
rands = rng.random_for_df(probs, n=alternative_count)
42+
chunk_sizer.log_df(trace_label, "rands", rands)
43+
sampled_mask = rands < poisson_inclusion_probs
44+
sampled_results = poisson_inclusion_probs.where(sampled_mask)
45+
return sampled_results
46+
47+
3048
def make_sample_choices_utility_based(
3149
state: workflow.State,
3250
choosers,
@@ -36,8 +54,8 @@ def make_sample_choices_utility_based(
3654
alternative_count,
3755
alt_col_name,
3856
allow_zero_probs,
39-
trace_label,
40-
chunk_sizer,
57+
trace_label:str,
58+
chunk_sizer:ChunkSizer,
4159
):
4260
assert isinstance(utilities, pd.DataFrame)
4361
assert utilities.shape == (len(choosers), alternative_count)
@@ -60,32 +78,6 @@ def make_sample_choices_utility_based(
6078

6179
utils_array = utilities.to_numpy()
6280
chunk_sizer.log_df(trace_label, "utils_array", utils_array)
63-
chosen_destinations = []
64-
65-
rands = state.get_rn_generator().gumbel_for_df(utilities, n=alternative_count)
66-
chunk_sizer.log_df(trace_label, "rands", rands)
67-
68-
# TODO-EET [janzill Jun2022]: using for-loop to keep memory usage low, an array of dimension
69-
# (len(choosers), alternative_count, sample_size) can get very large. Probably better to
70-
# use chunking for this.
71-
for i in range(sample_size):
72-
# created this once for memory logging
73-
if i > 0:
74-
rands = state.get_rn_generator().gumbel_for_df(
75-
utilities, n=alternative_count
76-
)
77-
chosen_destinations.append(np.argmax(utils_array + rands, axis=1))
78-
chosen_destinations = np.concatenate(chosen_destinations, axis=0)
79-
80-
chunk_sizer.log_df(trace_label, "chosen_destinations", chosen_destinations)
81-
82-
del utils_array
83-
chunk_sizer.log_df(trace_label, "utils_array", None)
84-
del rands
85-
chunk_sizer.log_df(trace_label, "rands", None)
86-
87-
chooser_idx = np.tile(np.arange(utilities.shape[0]), sample_size)
88-
chunk_sizer.log_df(trace_label, "chooser_idx", chooser_idx)
8981

9082
probs = logit.utils_to_probs(
9183
state,
@@ -95,28 +87,69 @@ def make_sample_choices_utility_based(
9587
overflow_protection=not allow_zero_probs,
9688
trace_choosers=choosers,
9789
)
98-
chunk_sizer.log_df(trace_label, "probs", probs)
99-
100-
choices_df = pd.DataFrame(
101-
{
102-
alt_col_name: alternatives.index.values[chosen_destinations],
103-
"prob": probs.to_numpy()[chooser_idx, chosen_destinations],
104-
choosers.index.name: choosers.index.values[chooser_idx],
105-
}
90+
inclusion_probs, sampled_alternatives = _poisson_sample_alternatives(alternative_count, chunk_sizer, probs,
91+
sample_size, state, trace_label)
92+
93+
# Stack removes the NaNs (the ones that weren't sampled)
94+
# and gives us a multi-index of (person_id, alt_id)
95+
choices_df = (
96+
sampled_alternatives.rename_axis("alt_idx", axis=1)
97+
.stack()
98+
.reset_index(name="prob")
99+
.assign(**{alt_col_name: lambda df: alternatives.index.values[df["alt_idx"]]})
100+
.drop(columns=["alt_idx"])
106101
)
107-
chunk_sizer.log_df(trace_label, "choices_df", choices_df)
108-
109-
del chooser_idx
110-
chunk_sizer.log_df(trace_label, "chooser_idx", None)
111-
del chosen_destinations
112-
chunk_sizer.log_df(trace_label, "chosen_destinations", None)
113-
del probs
114-
chunk_sizer.log_df(trace_label, "probs", None)
115102

116-
# handing this off to caller
117-
chunk_sizer.log_df(trace_label, "choices_df", None)
118-
119-
return choices_df
103+
# Here we return the inclusion probabilities i.e. the true probability of being sampled and (ab)use the fact
104+
# that pick_count=1 by definition and ln(1)=0 and recover the standard sample correction term.
105+
# In non-Poisson sampling, we would return the probs of sampling an alternative once
106+
# and the sampling correction factor np.log(df.pick_count/df.prob) is applied to the simulate utilities.
107+
# TODO is it safe change the meaning of df.prob, given it's referenced in expression csvs?
108+
# (but the alternative is to update all the expression CSV for sampling?)
109+
return choices_df, inclusion_probs
110+
111+
112+
def _poisson_sample_alternatives(alternative_count, chunk_sizer: ChunkSizer, probs: pd.DataFrame, sample_size,
113+
state: workflow.State, trace_label: str) -> tuple[pd.DataFrame, pd.DataFrame]:
114+
# compute the inclusion probability as the reciprocal of alt never being drawn
115+
# -- these are common, so compute once upfront
116+
exclusion_probs = (1 - probs) ** sample_size
117+
inclusion_probs = 1 - exclusion_probs
118+
119+
n = 0
120+
probs_subset = probs
121+
inclusion_probs_subset = inclusion_probs
122+
sampled_alternatives = pd.DataFrame(0.0, index=inclusion_probs.index, columns=inclusion_probs.columns)
123+
while True:
124+
sampled_results_subset = _poisson_sample_alternatives_inner(
125+
alternative_count, probs_subset, inclusion_probs_subset, state.get_rn_generator(), trace_label, chunk_sizer
126+
)
127+
no_alts_sampled_mask = sampled_results_subset.isna().all(axis=1)
128+
alts_with_sampled_alternatives = sampled_results_subset[~no_alts_sampled_mask]
129+
sampled_alternatives.loc[alts_with_sampled_alternatives.index, :] = alts_with_sampled_alternatives
130+
if no_alts_sampled_mask.any():
131+
# TODO if this happens in base but the project case is such that something is picked, random numbers won't
132+
# be consistent - we're asserting that this is very rare models where the sample size is not too small
133+
logger.info(f"Poisson sampling of alternatives failed with {n=}, retrying")
134+
# TODO put this behind a debug guard, because it will be slow
135+
logger.info(
136+
f"Sampled size was {sample_size}, poisson method mean expected sample size was {inclusion_probs.sum(axis=1).mean():.1f}, actual sampled mean was {(sampled_alternatives > 0).sum(axis=1).mean():.1f} and highest zero selection prob was {(exclusion_probs).product(axis=1).max():.2g}")
137+
probs_subset = probs[no_alts_sampled_mask]
138+
inclusion_probs_subset = inclusion_probs[no_alts_sampled_mask]
139+
140+
else: # All alternatives are fine
141+
break
142+
143+
n += 1
144+
if n == 10:
145+
choosers_no_alts_sampled = sampled_results_subset[no_alts_sampled_mask]
146+
msg = (f"Poisson choice set sampling failed after 10 attempts for these cases:\n"
147+
f"{choosers_no_alts_sampled}\n{probs_subset}")
148+
raise ValueError(msg)
149+
150+
chunk_sizer.log_df(trace_label, "sampled_alternatives", sampled_alternatives)
151+
152+
return inclusion_probs, sampled_alternatives
120153

121154

122155
def make_sample_choices(
@@ -227,7 +260,7 @@ def _interaction_sample(
227260
locals_d=None,
228261
trace_label=None,
229262
zone_layer=None,
230-
chunk_sizer=None,
263+
chunk_sizer: ChunkSizer|None=None,
231264
compute_settings: ComputeSettings | None = None,
232265
):
233266
"""
@@ -292,6 +325,9 @@ def _interaction_sample(
292325
pick_count : int
293326
number of duplicate picks for chooser, alt
294327
"""
328+
assert chunk_sizer is not None, "chunk_sizer cannot be None but old nullable signature is preserved"
329+
# TODO it's probably safe to reorder these arguments to make chunk_sizer mandatory since
330+
# _interaction_sample is private?
295331

296332
have_trace_targets = state.tracing.has_trace_targets(choosers)
297333
trace_ids = None
@@ -812,7 +848,13 @@ def interaction_sample(
812848
assert choosers.index.is_monotonic_increasing
813849

814850
# FIXME - legacy logic - not sure this is needed or even correct?
815-
sample_size = min(sample_size, len(alternatives.index))
851+
if not state.settings.use_explicit_error_terms:
852+
sample_size = min(sample_size, len(alternatives.index))
853+
# with poisson sampling, definitely don't want to reduce sample size - it's not a sample size but a number
854+
# of theoretical draws. Another options would be to disable sampling if # alts < sample size to ensure
855+
# all are included (but this wouldn't behave well if there were land use changes in the project case which
856+
# switched regimes)
857+
816858
logger.info(f" --- interaction_sample sample size = {sample_size}")
817859

818860
result_list = []

0 commit comments

Comments
 (0)