99
1010import numpy as np
1111import pandas as pd
12+ from tqdm import tqdm
1213
1314
1415class SyntheticDataGenerator :
@@ -146,49 +147,121 @@ def __init__(
146147
def _generate_column_names(self) -> List[str]:
    """Generates a list of realistic, structured feature names.

    Builds exactly ``self.n_features`` *unique* names: a few special
    features first, then shuffled clean prefix+suffix combinations, then
    round-numbered variants (``_r1``, ``_r2``, ...) only if the clean
    names are exhausted.

    Returns:
        List of unique column names of length ``self.n_features`` (fewer
        only if the prefix/suffix material cannot produce enough unique
        names, in which case a warning is logged).
    """
    self.logger.info(f"Generating { self . n_features } column names...")

    # Keep both a list (ordering) and a set (O(1) dedupe). Without the
    # dedupe, a special feature that collides with a prefix+suffix
    # combination would produce duplicate DataFrame columns.
    generated_names: List[str] = []
    seen: set = set()

    def _take(candidates: List[str], limit: int) -> int:
        """Append up to `limit` unseen candidates; return how many were added."""
        added = 0
        for name in candidates:
            if added >= limit:
                break
            if name not in seen:
                seen.add(name)
                generated_names.append(name)
                added += 1
        return added

    # Add some special features first.
    num_special = min(len(self._special_features), 5)
    _take(random.sample(self._special_features, num_special), num_special)

    # Calculate how many more names we need.
    remaining = self.n_features - len(generated_names)
    if remaining <= 0:
        return generated_names[: self.n_features]

    # 1. Generate all possible clean combinations (Prefix + Suffix).
    #    This avoids _r1 suffixes unless we run out of unique clean names.
    clean_combinations = [
        f"{ prefix }{ suffix }"
        for prefix in self._feature_prefixes
        for suffix in self._feature_suffixes
    ]
    random.shuffle(clean_combinations)
    remaining -= _take(clean_combinations, remaining)

    # 2. If we still need more, use rounds (_r1, _r2, etc.).
    round_num = 1
    max_rounds = 10
    while remaining > 0 and round_num < max_rounds:
        round_candidates = [
            f"{ prefix }_r{ round_num }{ suffix }"
            for prefix in self._feature_prefixes
            for suffix in self._feature_suffixes
        ]
        random.shuffle(round_candidates)
        remaining -= _take(round_candidates, remaining)
        round_num += 1

    if remaining > 0:
        # Surface the shortfall instead of silently returning a short list.
        self.logger.warning(
            f"Only generated {len(generated_names)} of "
            f"{self.n_features} requested column names."
        )

    # Final shuffle so special features are not clustered at the front.
    random.shuffle(generated_names)
    return generated_names[: self.n_features]
def _generate_typed_data(self, feature_names: List[str]) -> np.ndarray:
    """
    Generates data with appropriate types based on column names.

    This is significantly faster than modifying DataFrame columns after creation.

    Args:
        feature_names: Ordered column names; position i maps to data[:, i].

    Returns:
        float32 array of shape (self.n_rows, len(feature_names)).
    """
    # Pre-categorize columns to avoid repeated string matching.
    age_cols: List[int] = []
    binary_cols: List[int] = []
    bmi_cols: List[int] = []
    int_cols: List[int] = []
    normal_cols: List[int] = []

    self.logger.info("Categorizing columns...")
    for idx, col in enumerate(feature_names):
        if col == "age":
            age_cols.append(idx)
        elif col == "male" or "vte_status" in col or "bed_type" in col:
            binary_cols.append(idx)
        elif col == "bmi_value":
            bmi_cols.append(idx)
        elif any(s in col for s in self._int_suffixes):
            int_cols.append(idx)
        elif any(s in col for s in self._binary_suffixes):
            # Same 0/1 distribution as the explicit binary columns, so the
            # two categories share a single bulk draw below.
            binary_cols.append(idx)
        else:
            normal_cols.append(idx)

    # Pre-allocate array.
    data = np.empty((self.n_rows, len(feature_names)), dtype=np.float32)

    # Generate data in bulk for each category: one 2-D draw per category
    # instead of one 1-D draw per column.
    self.logger.info("Generating typed data...")
    if age_cols:
        data[:, age_cols] = np.random.randint(
            20, 90, size=(self.n_rows, len(age_cols))
        )
    if binary_cols:
        data[:, binary_cols] = np.random.randint(
            0, 2, size=(self.n_rows, len(binary_cols))
        )
    if bmi_cols:
        data[:, bmi_cols] = np.random.uniform(
            18, 45, size=(self.n_rows, len(bmi_cols))
        )
    # Count-like columns keep a per-column random scale factor, so these
    # must stay a per-column loop.
    for idx in int_cols:
        data[:, idx] = np.random.poisson(5, size=self.n_rows) * random.randint(
            1, 5
        )
    if normal_cols:
        self.logger.info(
            f"Generating { len (normal_cols )} normal distribution columns..."
        )
        data[:, normal_cols] = np.random.randn(
            self.n_rows, len(normal_cols)
        ).astype(np.float32)

    return data
192265
193266 def generate (self ) -> tuple [pd .DataFrame , dict [str , list [str ]]]:
194267 """
@@ -199,28 +272,35 @@ def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
199272 - The fully generated synthetic dataset.
200273 - A dictionary mapping each outcome variable to its list of important features.
201274 """
202- # 1. Generate feature data
275+ self .logger .info (
276+ f"Starting generation: { self .n_rows } rows × { self .n_features } features"
277+ )
278+
279+ # 1. Generate feature names
280+ self .logger .info ("Generating column names..." )
203281 feature_names = self ._generate_column_names ()
204- data = np .random .randn (self .n_rows , len (feature_names ))
282+
283+ # 2. Generate typed data directly (much faster than modifying after)
284+ data = self ._generate_typed_data (feature_names )
285+
286+ self .logger .info ("Creating DataFrame..." )
205287 df = pd .DataFrame (data , columns = feature_names )
206288
207- # This will hold new columns to be added efficiently at the end
289+ # Dictionary to hold outcome variables and metadata
208290 new_cols_dict = {}
209-
210- # This will store the mapping of outcome -> important features
211291 outcome_to_features_map = {}
212292
213- # 1.5. Assign more realistic data types and distributions
214- self ._assign_feature_types (df )
215-
216- # 2. Determine number of important features
217- n_important = int (self .n_features * self .percent_important_features )
218- n_important = max (1 , n_important ) # Ensure at least one important feature
293+ # 3. Determine number of important features
294+ n_important = max (1 , int (self .n_features * self .percent_important_features ))
219295
220296 self .logger .info (f"Generating { self .n_outcome_vars } outcome variables." )
221297
222- # 3. Generate outcome variables
223- for i in range (1 , self .n_outcome_vars + 1 ):
298+ # 4. Generate outcome variables with progress bar
299+ for i in tqdm (
300+ range (1 , self .n_outcome_vars + 1 ),
301+ desc = "Creating outcomes" ,
302+ disable = not self .logger .isEnabledFor (logging .INFO ),
303+ ):
224304 outcome_col_name = f"outcome_var_{ i } "
225305
226306 # Select a unique set of important features for *this* outcome
@@ -232,58 +312,78 @@ def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
232312 .tolist ()
233313 )
234314 outcome_to_features_map [outcome_col_name ] = important_features
235- self .logger .info (
236- f" For '{ outcome_col_name } ', selected { len (important_features )} important features: { important_features [:3 ]} ..."
237- )
238315
239- # Create signal from important features
240- signal = df [important_features ].sum (axis = 1 ) * self .feature_strength
316+ if i <= 3 or i == self .n_outcome_vars : # Only log first 3 and last
317+ self .logger .info (
318+ f" For '{ outcome_col_name } ', selected { len (important_features )} important features"
319+ )
320+
321+ # Create signal from important features (vectorized)
322+ signal = df [important_features ].values .sum (axis = 1 ) * self .feature_strength
241323
242324 # Create noise
243325 noise_strength = 1 - self .feature_strength
244- noise = np .random .randn (self .n_rows ) * noise_strength * signal .std ()
326+ noise = (
327+ np .random .randn (self .n_rows ).astype (np .float32 )
328+ * noise_strength
329+ * signal .std ()
330+ )
245331
246332 # Combine signal and noise, then create binary outcome
247333 combined_signal = signal + noise
248- # Use median as a threshold to get a balanced-ish outcome
249- threshold = combined_signal .median ()
250- df [outcome_col_name ] = (combined_signal > threshold ).astype (int )
251-
252- # Randomly flip some outcomes to make it harder
253- flip_mask = np .random .rand (self .n_rows ) < 0.1 # Flip 10%
254- df .loc [flip_mask , outcome_col_name ] = (
255- 1 - df .loc [flip_mask , outcome_col_name ]
256- )
334+ threshold = np .median (combined_signal )
335+ outcome = (combined_signal > threshold ).astype (np .int8 )
257336
258- # Move the new outcome column to the dictionary for later concatenation
259- new_cols_dict [outcome_col_name ] = df .pop (outcome_col_name )
337+ # Randomly flip 10% of outcomes
338+ flip_mask = np .random .rand (self .n_rows ) < 0.1
339+ outcome [flip_mask ] = 1 - outcome [flip_mask ]
260340
261- # 4. Add metadata columns to match real data format
341+ new_cols_dict [outcome_col_name ] = outcome
342+
343+ # 5. Add metadata columns
344+ self .logger .info ("Adding metadata columns..." )
262345 if "client_idcode" not in df .columns :
263346 new_cols_dict ["client_idcode" ] = [f"id_{ j } " for j in range (self .n_rows )]
264347
265- # Add 'Unnamed: 0' to mimic CSV read-in
266- new_cols_dict ["Unnamed: 0" ] = range (self .n_rows )
348+ new_cols_dict ["Unnamed: 0" ] = np .arange (self .n_rows , dtype = np .int32 )
267349
268- # Concatenate all new columns at once to avoid fragmentation
350+ # 6. Concatenate all at once
351+ self .logger .info ("Concatenating final DataFrame..." )
269352 new_cols_df = pd .DataFrame (new_cols_dict , index = df .index )
270353 df = pd .concat (
271- [new_cols_df [["Unnamed: 0" ]], df , new_cols_df .drop (columns = ["Unnamed: 0" ])],
354+ [
355+ new_cols_df [["Unnamed: 0" ]],
356+ df ,
357+ new_cols_df .drop (columns = ["Unnamed: 0" ]),
358+ ],
272359 axis = 1 ,
273360 )
274361
275- # Introduce some missing values
276- for col in df .columns :
277- if (
362+ # 7. Introduce missing values (vectorized per column)
363+ self .logger .info ("Introducing missing values..." )
364+ feature_cols = [
365+ col
366+ for col in df .columns
367+ if not (
278368 col .startswith ("outcome_var" )
279369 or col == "client_idcode"
280370 or col == "Unnamed: 0"
281- ):
282- continue
283- if random .random () < 0.15 : # 15% chance for a column to have NaNs
284- nan_mask = df .sample (frac = random .uniform (0.01 , 0.2 )).index
285- df .loc [nan_mask , col ] = np .nan
371+ )
372+ ]
373+
374+ cols_with_nans = random .sample (feature_cols , int (len (feature_cols ) * 0.15 ))
375+
376+ for col in tqdm (
377+ cols_with_nans ,
378+ desc = "Adding NaNs" ,
379+ disable = not self .logger .isEnabledFor (logging .INFO ),
380+ ):
381+ frac = random .uniform (0.01 , 0.2 )
382+ n_nans = int (self .n_rows * frac )
383+ nan_indices = np .random .choice (self .n_rows , size = n_nans , replace = False )
384+ df .loc [nan_indices , col ] = np .nan
286385
386+ self .logger .info (f"Generation complete! Shape: { df .shape } " )
287387 return df , outcome_to_features_map
288388
289389
0 commit comments