Skip to content

Commit 67b7937

Browse files
author
SamoraHunter
committed
Optimize synthetic data generator performance and naming logic
- **Performance:** Replaced iterative data generation with vectorized NumPy operations, significantly reducing generation time for large datasets.
- **Naming Logic:** Updated `_generate_column_names` to prioritize clean `Prefix+Suffix` combinations before resorting to round-based duplicates.
- **Suffix Compatibility:** Modified the round counter insertion strategy to place `_r{n}` in the middle of the feature name (e.g., `Monocytes_r1_num-tests`) instead of appending it. This ensures standard suffixes remain at the end of the string, preserving compatibility with downstream suffix-based type detection.
- **UX:** Added `tqdm` progress bars to track generation steps.
1 parent f4d0ef0 commit 67b7937

1 file changed

Lines changed: 175 additions & 75 deletions

File tree

ml_grid/util/synthetic_data_generator.py

Lines changed: 175 additions & 75 deletions
Original file line number · Diff line number · Diff line change
@@ -9,6 +9,7 @@
99

1010
import numpy as np
1111
import pandas as pd
12+
from tqdm import tqdm
1213

1314

1415
class SyntheticDataGenerator:
@@ -146,49 +147,121 @@ def __init__(
146147

147148
def _generate_column_names(self) -> List[str]:
148149
"""Generates a list of realistic, structured feature names."""
149-
generated_names = set()
150+
self.logger.info(f"Generating {self.n_features} column names...")
150151

151-
# Add some special features to ensure they are present
152-
generated_names.update(
153-
random.sample(self._special_features, min(len(self._special_features), 5))
154-
)
155-
156-
# Generate structured feature groups
157-
while len(generated_names) < self.n_features:
158-
prefix = random.choice(self._feature_prefixes)
152+
generated_names = []
159153

160-
# For each prefix, create a few related features
161-
num_suffixes_for_prefix = random.randint(1, 4)
162-
suffixes_to_add = random.sample(
163-
self._feature_suffixes, num_suffixes_for_prefix
164-
)
154+
# Add some special features first
155+
num_special = min(len(self._special_features), 5)
156+
generated_names.extend(random.sample(self._special_features, num_special))
165157

166-
for suffix in suffixes_to_add:
167-
if len(generated_names) >= self.n_features:
168-
break
169-
new_name = f"{prefix}{suffix}"
170-
generated_names.add(new_name)
158+
# Calculate how many more names we need
159+
remaining = self.n_features - len(generated_names)
171160

172-
final_names = list(generated_names)
173-
random.shuffle(final_names)
161+
if remaining <= 0:
162+
return generated_names[: self.n_features]
174163

175-
return final_names[: self.n_features]
164+
# 1. Generate all possible clean combinations (Prefix + Suffix)
165+
# This avoids _r1 suffixes unless we run out of unique clean names
166+
clean_combinations = [
167+
f"{prefix}{suffix}"
168+
for prefix in self._feature_prefixes
169+
for suffix in self._feature_suffixes
170+
]
171+
random.shuffle(clean_combinations)
172+
173+
# Take as many as we need from clean combinations
174+
num_from_clean = min(len(clean_combinations), remaining)
175+
generated_names.extend(clean_combinations[:num_from_clean])
176+
remaining -= num_from_clean
177+
178+
# 2. If we still need more, use rounds (_r1, _r2, etc.)
179+
round_num = 1
180+
max_rounds = 10
181+
while remaining > 0 and round_num < max_rounds:
182+
round_candidates = [
183+
f"{prefix}_r{round_num}{suffix}"
184+
for prefix in self._feature_prefixes
185+
for suffix in self._feature_suffixes
186+
]
187+
random.shuffle(round_candidates)
188+
189+
take_round = min(remaining, len(round_candidates))
190+
generated_names.extend(round_candidates[:take_round])
191+
remaining -= take_round
192+
round_num += 1
193+
194+
# Final shuffle
195+
random.shuffle(generated_names)
196+
return generated_names[: self.n_features]
197+
198+
def _generate_typed_data(self, feature_names: List[str]) -> np.ndarray:
199+
"""
200+
Generates data with appropriate types based on column names.
176201
177-
def _assign_feature_types(self, df: pd.DataFrame):
178-
"""Modifies DataFrame columns in-place to have more realistic data types."""
179-
for col in df.columns:
180-
# Handle special cases first
202+
This is significantly faster than modifying DataFrame columns after creation.
203+
"""
204+
# Pre-categorize columns to avoid repeated string matching
205+
age_cols = []
206+
binary_cols = []
207+
bmi_cols = []
208+
int_cols = []
209+
binary_suffix_cols = []
210+
normal_cols = []
211+
212+
self.logger.info("Categorizing columns...")
213+
for idx, col in enumerate(feature_names):
181214
if col == "age":
182-
df[col] = np.random.randint(20, 90, size=self.n_rows)
215+
age_cols.append(idx)
183216
elif col == "male" or "vte_status" in col or "bed_type" in col:
184-
df[col] = np.random.randint(0, 2, size=self.n_rows)
217+
binary_cols.append(idx)
185218
elif col == "bmi_value":
186-
df[col] = np.random.uniform(18, 45, size=self.n_rows)
187-
# Handle suffixes
219+
bmi_cols.append(idx)
188220
elif any(s in col for s in self._int_suffixes):
189-
df[col] = np.random.poisson(5, size=self.n_rows) * random.randint(1, 5)
221+
int_cols.append(idx)
190222
elif any(s in col for s in self._binary_suffixes):
191-
df[col] = np.random.randint(0, 2, size=self.n_rows)
223+
binary_suffix_cols.append(idx)
224+
else:
225+
normal_cols.append(idx)
226+
227+
# Pre-allocate array
228+
data = np.empty((self.n_rows, len(feature_names)), dtype=np.float32)
229+
230+
# Generate data in bulk for each category
231+
self.logger.info("Generating typed data...")
232+
if age_cols:
233+
for idx in age_cols:
234+
data[:, idx] = np.random.randint(20, 90, size=self.n_rows)
235+
236+
if binary_cols:
237+
for idx in binary_cols:
238+
data[:, idx] = np.random.randint(0, 2, size=self.n_rows)
239+
240+
if bmi_cols:
241+
for idx in bmi_cols:
242+
data[:, idx] = np.random.uniform(18, 45, size=self.n_rows)
243+
244+
if int_cols:
245+
for idx in int_cols:
246+
data[:, idx] = np.random.poisson(5, size=self.n_rows) * random.randint(
247+
1, 5
248+
)
249+
250+
if binary_suffix_cols:
251+
for idx in binary_suffix_cols:
252+
data[:, idx] = np.random.randint(0, 2, size=self.n_rows)
253+
254+
# Generate all normal columns at once
255+
if normal_cols:
256+
self.logger.info(
257+
f"Generating {len(normal_cols)} normal distribution columns..."
258+
)
259+
normal_data = np.random.randn(self.n_rows, len(normal_cols)).astype(
260+
np.float32
261+
)
262+
data[:, normal_cols] = normal_data
263+
264+
return data
192265

193266
def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
194267
"""
@@ -199,28 +272,35 @@ def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
199272
- The fully generated synthetic dataset.
200273
- A dictionary mapping each outcome variable to its list of important features.
201274
"""
202-
# 1. Generate feature data
275+
self.logger.info(
276+
f"Starting generation: {self.n_rows} rows × {self.n_features} features"
277+
)
278+
279+
# 1. Generate feature names
280+
self.logger.info("Generating column names...")
203281
feature_names = self._generate_column_names()
204-
data = np.random.randn(self.n_rows, len(feature_names))
282+
283+
# 2. Generate typed data directly (much faster than modifying after)
284+
data = self._generate_typed_data(feature_names)
285+
286+
self.logger.info("Creating DataFrame...")
205287
df = pd.DataFrame(data, columns=feature_names)
206288

207-
# This will hold new columns to be added efficiently at the end
289+
# Dictionary to hold outcome variables and metadata
208290
new_cols_dict = {}
209-
210-
# This will store the mapping of outcome -> important features
211291
outcome_to_features_map = {}
212292

213-
# 1.5. Assign more realistic data types and distributions
214-
self._assign_feature_types(df)
215-
216-
# 2. Determine number of important features
217-
n_important = int(self.n_features * self.percent_important_features)
218-
n_important = max(1, n_important) # Ensure at least one important feature
293+
# 3. Determine number of important features
294+
n_important = max(1, int(self.n_features * self.percent_important_features))
219295

220296
self.logger.info(f"Generating {self.n_outcome_vars} outcome variables.")
221297

222-
# 3. Generate outcome variables
223-
for i in range(1, self.n_outcome_vars + 1):
298+
# 4. Generate outcome variables with progress bar
299+
for i in tqdm(
300+
range(1, self.n_outcome_vars + 1),
301+
desc="Creating outcomes",
302+
disable=not self.logger.isEnabledFor(logging.INFO),
303+
):
224304
outcome_col_name = f"outcome_var_{i}"
225305

226306
# Select a unique set of important features for *this* outcome
@@ -232,58 +312,78 @@ def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
232312
.tolist()
233313
)
234314
outcome_to_features_map[outcome_col_name] = important_features
235-
self.logger.info(
236-
f" For '{outcome_col_name}', selected {len(important_features)} important features: {important_features[:3]}..."
237-
)
238315

239-
# Create signal from important features
240-
signal = df[important_features].sum(axis=1) * self.feature_strength
316+
if i <= 3 or i == self.n_outcome_vars: # Only log first 3 and last
317+
self.logger.info(
318+
f" For '{outcome_col_name}', selected {len(important_features)} important features"
319+
)
320+
321+
# Create signal from important features (vectorized)
322+
signal = df[important_features].values.sum(axis=1) * self.feature_strength
241323

242324
# Create noise
243325
noise_strength = 1 - self.feature_strength
244-
noise = np.random.randn(self.n_rows) * noise_strength * signal.std()
326+
noise = (
327+
np.random.randn(self.n_rows).astype(np.float32)
328+
* noise_strength
329+
* signal.std()
330+
)
245331

246332
# Combine signal and noise, then create binary outcome
247333
combined_signal = signal + noise
248-
# Use median as a threshold to get a balanced-ish outcome
249-
threshold = combined_signal.median()
250-
df[outcome_col_name] = (combined_signal > threshold).astype(int)
251-
252-
# Randomly flip some outcomes to make it harder
253-
flip_mask = np.random.rand(self.n_rows) < 0.1 # Flip 10%
254-
df.loc[flip_mask, outcome_col_name] = (
255-
1 - df.loc[flip_mask, outcome_col_name]
256-
)
334+
threshold = np.median(combined_signal)
335+
outcome = (combined_signal > threshold).astype(np.int8)
257336

258-
# Move the new outcome column to the dictionary for later concatenation
259-
new_cols_dict[outcome_col_name] = df.pop(outcome_col_name)
337+
# Randomly flip 10% of outcomes
338+
flip_mask = np.random.rand(self.n_rows) < 0.1
339+
outcome[flip_mask] = 1 - outcome[flip_mask]
260340

261-
# 4. Add metadata columns to match real data format
341+
new_cols_dict[outcome_col_name] = outcome
342+
343+
# 5. Add metadata columns
344+
self.logger.info("Adding metadata columns...")
262345
if "client_idcode" not in df.columns:
263346
new_cols_dict["client_idcode"] = [f"id_{j}" for j in range(self.n_rows)]
264347

265-
# Add 'Unnamed: 0' to mimic CSV read-in
266-
new_cols_dict["Unnamed: 0"] = range(self.n_rows)
348+
new_cols_dict["Unnamed: 0"] = np.arange(self.n_rows, dtype=np.int32)
267349

268-
# Concatenate all new columns at once to avoid fragmentation
350+
# 6. Concatenate all at once
351+
self.logger.info("Concatenating final DataFrame...")
269352
new_cols_df = pd.DataFrame(new_cols_dict, index=df.index)
270353
df = pd.concat(
271-
[new_cols_df[["Unnamed: 0"]], df, new_cols_df.drop(columns=["Unnamed: 0"])],
354+
[
355+
new_cols_df[["Unnamed: 0"]],
356+
df,
357+
new_cols_df.drop(columns=["Unnamed: 0"]),
358+
],
272359
axis=1,
273360
)
274361

275-
# Introduce some missing values
276-
for col in df.columns:
277-
if (
362+
# 7. Introduce missing values (vectorized per column)
363+
self.logger.info("Introducing missing values...")
364+
feature_cols = [
365+
col
366+
for col in df.columns
367+
if not (
278368
col.startswith("outcome_var")
279369
or col == "client_idcode"
280370
or col == "Unnamed: 0"
281-
):
282-
continue
283-
if random.random() < 0.15: # 15% chance for a column to have NaNs
284-
nan_mask = df.sample(frac=random.uniform(0.01, 0.2)).index
285-
df.loc[nan_mask, col] = np.nan
371+
)
372+
]
373+
374+
cols_with_nans = random.sample(feature_cols, int(len(feature_cols) * 0.15))
375+
376+
for col in tqdm(
377+
cols_with_nans,
378+
desc="Adding NaNs",
379+
disable=not self.logger.isEnabledFor(logging.INFO),
380+
):
381+
frac = random.uniform(0.01, 0.2)
382+
n_nans = int(self.n_rows * frac)
383+
nan_indices = np.random.choice(self.n_rows, size=n_nans, replace=False)
384+
df.loc[nan_indices, col] = np.nan
286385

386+
self.logger.info(f"Generation complete! Shape: {df.shape}")
287387
return df, outcome_to_features_map
288388

289389

0 commit comments

Comments (0)