33from __future__ import annotations
44
55import logging
6+ import typing
67
78import numpy as np
89import pandas as pd
9-
1010from activitysim .core import (
1111 chunk ,
1212 estimation ,
1717 util ,
1818 workflow ,
1919)
20+ from activitysim .core .chunk import ChunkSizer
2021from activitysim .core .configuration .base import ComputeSettings
2122from activitysim .core .exceptions import SegmentedSpecificationError
2223from activitysim .core .skim_dataset import DatasetWrapper
2324from activitysim .core .skim_dictionary import SkimWrapper
25+ if typing .TYPE_CHECKING :
26+ from activitysim .core .random import Random
2427
2528logger = logging .getLogger (__name__ )
2629
2730DUMP = False
2831
2932
33+ def _poisson_sample_alternatives_inner (
34+ alternative_count : int ,
35+ probs : pd .DataFrame ,
36+ poisson_inclusion_probs : pd .DataFrame ,
37+ rng : Random ,
38+ trace_label : str | None ,
39+ chunk_sizer :ChunkSizer ,
40+ ) -> pd .DataFrame :
41+ rands = rng .random_for_df (probs , n = alternative_count )
42+ chunk_sizer .log_df (trace_label , "rands" , rands )
43+ sampled_mask = rands < poisson_inclusion_probs
44+ sampled_results = poisson_inclusion_probs .where (sampled_mask )
45+ return sampled_results
46+
47+
3048def make_sample_choices_utility_based (
3149 state : workflow .State ,
3250 choosers ,
@@ -36,8 +54,8 @@ def make_sample_choices_utility_based(
3654 alternative_count ,
3755 alt_col_name ,
3856 allow_zero_probs ,
39- trace_label ,
40- chunk_sizer ,
57+ trace_label : str ,
58+ chunk_sizer : ChunkSizer ,
4159):
4260 assert isinstance (utilities , pd .DataFrame )
4361 assert utilities .shape == (len (choosers ), alternative_count )
@@ -60,32 +78,6 @@ def make_sample_choices_utility_based(
6078
6179 utils_array = utilities .to_numpy ()
6280 chunk_sizer .log_df (trace_label , "utils_array" , utils_array )
63- chosen_destinations = []
64-
65- rands = state .get_rn_generator ().gumbel_for_df (utilities , n = alternative_count )
66- chunk_sizer .log_df (trace_label , "rands" , rands )
67-
68- # TODO-EET [janzill Jun2022]: using for-loop to keep memory usage low, an array of dimension
69- # (len(choosers), alternative_count, sample_size) can get very large. Probably better to
70- # use chunking for this.
71- for i in range (sample_size ):
72- # created this once for memory logging
73- if i > 0 :
74- rands = state .get_rn_generator ().gumbel_for_df (
75- utilities , n = alternative_count
76- )
77- chosen_destinations .append (np .argmax (utils_array + rands , axis = 1 ))
78- chosen_destinations = np .concatenate (chosen_destinations , axis = 0 )
79-
80- chunk_sizer .log_df (trace_label , "chosen_destinations" , chosen_destinations )
81-
82- del utils_array
83- chunk_sizer .log_df (trace_label , "utils_array" , None )
84- del rands
85- chunk_sizer .log_df (trace_label , "rands" , None )
86-
87- chooser_idx = np .tile (np .arange (utilities .shape [0 ]), sample_size )
88- chunk_sizer .log_df (trace_label , "chooser_idx" , chooser_idx )
8981
9082 probs = logit .utils_to_probs (
9183 state ,
@@ -95,28 +87,69 @@ def make_sample_choices_utility_based(
9587 overflow_protection = not allow_zero_probs ,
9688 trace_choosers = choosers ,
9789 )
98- chunk_sizer .log_df (trace_label , "probs" , probs )
99-
100- choices_df = pd .DataFrame (
101- {
102- alt_col_name : alternatives .index .values [chosen_destinations ],
103- "prob" : probs .to_numpy ()[chooser_idx , chosen_destinations ],
104- choosers .index .name : choosers .index .values [chooser_idx ],
105- }
90+ inclusion_probs , sampled_alternatives = _poisson_sample_alternatives (alternative_count , chunk_sizer , probs ,
91+ sample_size , state , trace_label )
92+
93+ # Stack removes the NaNs (the ones that weren't sampled)
94+ # and gives us a multi-index of (person_id, alt_id)
95+ choices_df = (
96+ sampled_alternatives .rename_axis ("alt_idx" , axis = 1 )
97+ .stack ()
98+ .reset_index (name = "prob" )
99+ .assign (** {alt_col_name : lambda df : alternatives .index .values [df ["alt_idx" ]]})
100+ .drop (columns = ["alt_idx" ])
106101 )
107- chunk_sizer .log_df (trace_label , "choices_df" , choices_df )
108-
109- del chooser_idx
110- chunk_sizer .log_df (trace_label , "chooser_idx" , None )
111- del chosen_destinations
112- chunk_sizer .log_df (trace_label , "chosen_destinations" , None )
113- del probs
114- chunk_sizer .log_df (trace_label , "probs" , None )
115102
116- # handing this off to caller
117- chunk_sizer .log_df (trace_label , "choices_df" , None )
118-
119- return choices_df
103+ # Here we return the inclusion probabilities i.e. the true probability of being sampled and (ab)use the fact
104+ # that pick_count=1 by definition and ln(1)=0 and recover the standard sample correction term.
105+ # In non-Poisson sampling, we would return the probs of sampling an alternative once
106+ # and the sampling correction factor np.log(df.pick_count/df.prob) is applied to the simulate utilities.
107+ # TODO is it safe change the meaning of df.prob, given it's referenced in expression csvs?
108+ # (but the alternative is to update all the expression CSV for sampling?)
109+ return choices_df , inclusion_probs
110+
111+
def _poisson_sample_alternatives(alternative_count, chunk_sizer: ChunkSizer, probs: pd.DataFrame, sample_size,
                                 state: workflow.State, trace_label: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Sample a choice set by independent Bernoulli (Poisson) sampling.

    Each alternative is included independently with probability
    ``1 - (1 - p) ** sample_size`` -- the probability it would be drawn at
    least once in ``sample_size`` independent multinomial draws with cell
    probability ``p``. Choosers that end a round with an empty choice set are
    redrawn, up to 10 attempts.

    Parameters
    ----------
    alternative_count : int
        number of alternatives per chooser (``probs`` has this many columns)
    chunk_sizer : ChunkSizer
        chunk memory logger
    probs : pd.DataFrame
        per-draw choice probabilities, one row per chooser
    sample_size : int
        nominal number of multinomial draws the inclusion probabilities emulate
    state : workflow.State
        provides the random number generator
    trace_label : str
        label for chunk logging

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(inclusion_probs, sampled_alternatives)`` -- the true inclusion
        probability of every alternative, and a same-shaped frame holding that
        probability where the alternative was sampled and NaN (or the initial
        0.0) elsewhere.

    Raises
    ------
    ValueError
        if some chooser still has an empty choice set after 10 attempts.
    """
    # compute the inclusion probability as the complement of the alternative
    # never being drawn -- these are reused across retries, so compute once upfront
    exclusion_probs = (1 - probs) ** sample_size
    inclusion_probs = 1 - exclusion_probs

    n = 0
    probs_subset = probs
    inclusion_probs_subset = inclusion_probs
    sampled_alternatives = pd.DataFrame(0.0, index=inclusion_probs.index, columns=inclusion_probs.columns)
    while True:
        sampled_results_subset = _poisson_sample_alternatives_inner(
            alternative_count, probs_subset, inclusion_probs_subset, state.get_rn_generator(), trace_label, chunk_sizer
        )
        # choosers for whom this round sampled no alternative at all must be retried
        no_alts_sampled_mask = sampled_results_subset.isna().all(axis=1)
        alts_with_sampled_alternatives = sampled_results_subset[~no_alts_sampled_mask]
        sampled_alternatives.loc[alts_with_sampled_alternatives.index, :] = alts_with_sampled_alternatives
        if no_alts_sampled_mask.any():
            # TODO if this happens in base but the project case is such that something is picked, random numbers won't
            # be consistent - we're asserting that this is very rare for models where the sample size is not too small
            logger.info(f"Poisson sampling of alternatives failed with {n=}, retrying")
            # TODO put this behind a debug guard, because it will be slow
            logger.info(
                f"Sampled size was {sample_size}, poisson method mean expected sample size was {inclusion_probs.sum(axis=1).mean():.1f}, actual sampled mean was {(sampled_alternatives > 0).sum(axis=1).mean():.1f} and highest zero selection prob was {(exclusion_probs).product(axis=1).max():.2g}")
            # BUG FIX: subset from the current working subset, not the full frame.
            # no_alts_sampled_mask is indexed on probs_subset; boolean-indexing the
            # full probs/inclusion_probs with a partial-index Series raises
            # pandas.errors.IndexingError ("Unalignable boolean Series provided as
            # indexer") on every retry after the first.
            probs_subset = probs_subset[no_alts_sampled_mask]
            inclusion_probs_subset = inclusion_probs_subset[no_alts_sampled_mask]

        else:  # All alternatives are fine
            break

        n += 1
        if n == 10:
            choosers_no_alts_sampled = sampled_results_subset[no_alts_sampled_mask]
            msg = (f"Poisson choice set sampling failed after 10 attempts for these cases:\n"
                   f"{choosers_no_alts_sampled}\n{probs_subset}")
            raise ValueError(msg)

    chunk_sizer.log_df(trace_label, "sampled_alternatives", sampled_alternatives)

    return inclusion_probs, sampled_alternatives
120153
121154
122155def make_sample_choices (
@@ -227,7 +260,7 @@ def _interaction_sample(
227260 locals_d = None ,
228261 trace_label = None ,
229262 zone_layer = None ,
230- chunk_sizer = None ,
263+ chunk_sizer : ChunkSizer | None = None ,
231264 compute_settings : ComputeSettings | None = None ,
232265):
233266 """
@@ -292,6 +325,9 @@ def _interaction_sample(
292325 pick_count : int
293326 number of duplicate picks for chooser, alt
294327 """
328+ assert chunk_sizer is not None , "chunk_sizer cannot be None but old nullable signature is preserved"
329+ # TODO it's probably safe to reorder these arguments to make chunk_sizer mandatory since
330+ # _interaction_sample is private?
295331
296332 have_trace_targets = state .tracing .has_trace_targets (choosers )
297333 trace_ids = None
@@ -812,7 +848,13 @@ def interaction_sample(
812848 assert choosers .index .is_monotonic_increasing
813849
814850 # FIXME - legacy logic - not sure this is needed or even correct?
815- sample_size = min (sample_size , len (alternatives .index ))
851+ if not state .settings .use_explicit_error_terms :
852+ sample_size = min (sample_size , len (alternatives .index ))
853+ # with poisson sampling, definitely don't want to reduce sample size - it's not a sample size but a number
854+ # of theoretical draws. Another options would be to disable sampling if # alts < sample size to ensure
855+ # all are included (but this wouldn't behave well if there were land use changes in the project case which
856+ # switched regimes)
857+
816858 logger .info (f" --- interaction_sample sample size = { sample_size } " )
817859
818860 result_list = []
0 commit comments