Skip to content

Commit 3272f67

Browse files
authored
Add files via upload
added mermaid diagrams for key functionality
1 parent c7a415a commit 3272f67

18 files changed

Lines changed: 1539 additions & 0 deletions
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
flowchart TD
2+
A[ml_grid_object] --> B[feature_importance_methods]
3+
4+
B --> C{Check feature_selection_method}
5+
6+
C -->|"anova or None"| D[ANOVA Method]
7+
C -->|"markov_blanket"| E[Markov Blanket Method]
8+
9+
D --> F[feature_methods.getNfeaturesANOVAF]
10+
E --> G[feature_methods.getNFeaturesMarkovBlanket]
11+
12+
F --> H[Calculate F-values using f_classif]
13+
F --> I[Sort features by F-value]
14+
F --> J[Return top n feature names]
15+
16+
G --> K[Initialize PPIMBC with SVC]
17+
G --> L[Fit and transform training data]
18+
G --> M[Extract top n features from MB]
19+
20+
J --> N[Apply feature selection]
21+
M --> N
22+
23+
N --> O[Filter X_train with selected features]
24+
N --> P[Filter X_test with selected features]
25+
N --> Q[Filter X_test_orig with selected features]
26+
27+
O --> R[Return filtered datasets]
28+
P --> R
29+
Q --> R
30+
31+
subgraph "Input Data"
32+
S[target_n_features]
33+
T[X_train]
34+
U[X_test]
35+
V[y_train]
36+
W[X_test_orig]
37+
end
38+
39+
subgraph "Feature Methods Class"
40+
F
41+
G
42+
H
43+
I
44+
J
45+
K
46+
L
47+
M
48+
end
49+
50+
subgraph "Output"
51+
X[Filtered X_train]
52+
Y[Filtered X_test]
53+
Z[Filtered X_test_orig]
54+
end
55+
56+
S --> B
57+
T --> B
58+
U --> B
59+
V --> B
60+
W --> B
61+
62+
R --> X
63+
R --> Y
64+
R --> Z
65+
66+
style B fill:#e1f5fe
67+
style C fill:#fff3e0
68+
style D fill:#f3e5f5
69+
style E fill:#e8f5e8
70+
style N fill:#fff8e1

assets/data_feature_importance_methods.svg

Lines changed: 102 additions & 0 deletions
Loading

assets/data_pipeline.mmd

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
flowchart TD
2+
A[Start Pipeline] --> B[Initialize Parameters]
3+
B --> C{Read Data}
4+
C -->|Sample Mode| D[Read Sample Data]
5+
C -->|Full Mode| E[Read Full Data]
6+
7+
D --> F[Apply Test/Column Sampling]
8+
E --> F
9+
10+
F --> G[Get Perturbation Columns]
11+
G --> H[Set Outcome Variable]
12+
H --> I[Handle Correlation Matrix]
13+
14+
I --> J[Handle Percent Missing]
15+
J --> K[Handle Outcome List]
16+
K --> L[Remove Constant Columns]
17+
18+
L --> M{Final Column List Empty?}
19+
M -->|Yes| N[Safety Retention Mechanism]
20+
M -->|No| O[Proceed with Final Columns]
21+
22+
N --> P[Retain Protected Columns]
23+
P --> Q{Still Empty?}
24+
Q -->|Yes| R[Select Random Features]
25+
Q -->|No| O
26+
R --> O
27+
28+
O --> S[Clean Up Data]
29+
S --> T[Screen Non-Float Types]
30+
T --> U[Handle Column Names]
31+
32+
U --> V{Scale Data?}
33+
V -->|Yes| W[Apply Standard Scaling]
34+
V -->|No| X[Skip Scaling]
35+
W --> Y
36+
37+
X --> Y{Time Series Mode?}
38+
Y -->|Yes| Z[Convert to Time Series]
39+
Y -->|No| AA[Regular Processing]
40+
41+
Z --> BB[Get Max Sequence Length]
42+
BB --> CC[Convert X,y to Time Series]
43+
CC --> DD[Train/Test Split]
44+
45+
AA --> DD
46+
DD --> EE[Remove Constant Columns After Split]
47+
48+
EE --> FF{Feature Reduction Needed?}
49+
FF -->|Yes| GG[Apply Feature Importance Methods]
50+
FF -->|No| HH[Skip Feature Reduction]
51+
52+
GG --> II{Features Remain?}
53+
II -->|No| JJ[Error: All Features Removed]
54+
II -->|Yes| KK
55+
56+
HH --> KK{Time Series Mode?}
57+
KK -->|Yes| LL[Get Time Series Model List]
58+
KK -->|No| MM[Get Regular Model List]
59+
60+
LL --> NN[Pipeline Complete]
61+
MM --> NN
62+
63+
JJ --> OO[Pipeline Failed]
64+
65+
style A fill:#e1f5fe
66+
style NN fill:#c8e6c9
67+
style OO fill:#ffcdd2
68+
style N fill:#fff3e0
69+
style P fill:#fff3e0
70+
style R fill:#fff3e0

assets/data_pipeline.svg

Lines changed: 102 additions & 0 deletions
Loading

assets/grid_param_space.mmd

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
graph TD
2+
A[Grid Class Initialization] --> B[Set Global Parameters]
3+
B --> C[Set sample_n default=1000]
4+
C --> D[Initialize Grid Dictionary]
5+
6+
D --> E[Grid Parameters]
7+
E --> E1[resample: undersample/oversample/None]
8+
E --> E2[scale: True/False]
9+
E --> E3[feature_n: 100,95,75,50,25,5]
10+
E --> E4[param_space_size: medium/xsmall]
11+
E --> E5[n_unique_out: 10]
12+
E --> E6[outcome_var_n: 1]
13+
E --> E7[percent_missing: 99,95,80]
14+
E --> E8[corr: 0.98,0.85,0.5,0.25]
15+
E --> E9[feature_selection_method: anova/markov_blanket]
16+
E --> E10[data: nested dictionary]
17+
18+
E10 --> F[Data Features]
19+
F --> F1[age: True/False]
20+
F --> F2[sex: True/False]
21+
F --> F3[bmi: True]
22+
F --> F4[ethnicity: True/False]
23+
F --> F5[bloods: True/False]
24+
F --> F6[diagnostic_order: True/False]
25+
F --> F7[drug_order: True/False]
26+
F --> F8[annotation_n: True/False]
27+
F --> F9[meta_sp_annotation_n: True/False]
28+
F --> F10[annotation_mrc_n: True/False]
29+
F --> F11[meta_sp_annotation_mrc_n: True/False]
30+
F --> F12[vte_status: True]
31+
F --> F13[hosp_site: True]
32+
F --> F14[Other features: False]
33+
34+
D --> G[c_prod Function Definition]
35+
G --> H[Generate Cartesian Product]
36+
H --> I[Create settings_list]
37+
I --> J[Print Full Size]
38+
J --> K[Shuffle List]
39+
K --> L[Sample n Items]
40+
L --> M[Create Iterator]
41+
42+
subgraph "c_prod Function Logic"
43+
N[Input: Dictionary or List]
44+
N --> O{Is List?}
45+
O -->|Yes| P[Yield Items Recursively]
46+
O -->|No| Q[Generate Product of Values]
47+
Q --> R[Yield Dictionary with Keys]
48+
end
49+
50+
subgraph "Key Dependencies"
51+
S[itertools as it]
52+
T[random]
53+
U[ml_grid.util.global_params]
54+
end
55+
56+
subgraph "Class Attributes"
57+
V[self.global_params]
58+
W[self.verbose]
59+
X[self.sample_n]
60+
Y[self.grid]
61+
Z[self.settings_list]
62+
AA[self.settings_list_iterator]
63+
end
64+
65+
style A fill:#e1f5fe
66+
style D fill:#f3e5f5
67+
style G fill:#fff3e0
68+
style M fill:#e8f5e8

assets/grid_param_space.svg

Lines changed: 102 additions & 0 deletions
Loading

assets/hyperparameter_search.mmd

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
graph TD
2+
A[HyperparameterSearch.__init__] --> B{Validate ml_grid_object}
3+
B -->|None| C[Raise ValueError]
4+
B -->|Valid| D{Check Algorithm Type}
5+
6+
D -->|Module| E[Raise ValueError: Expected classifier instance]
7+
D -->|Valid Classifier| F[Algorithm Validation]
8+
9+
F --> G{Check Custom Classifiers}
10+
G -->|KNNWrapper<br/>h2o_classifier_class<br/>kerasClassifier_class| H[Valid Custom Classifier]
11+
G -->|Other| I{Check sklearn classifier}
12+
I -->|is_classifier = True| J[Valid sklearn Classifier]
13+
I -->|False| K{Has fit & predict methods?}
14+
K -->|Yes| L[Valid Generic Classifier]
15+
K -->|No| M[Raise ValueError: Invalid classifier]
16+
17+
H --> N[Configure Warnings]
18+
J --> N
19+
L --> N
20+
21+
N --> O{Method contains keras/xgb/catboost?}
22+
O -->|Yes| P[Configure GPU]
23+
O -->|No| Q[Skip GPU Config]
24+
25+
P --> R[run_search Method]
26+
Q --> R
27+
28+
R --> S{Check bayessearch flag}
29+
S -->|True| T{Algorithm is KNNWrapper or kerasClassifier?}
30+
S -->|False| U[Validate Parameters]
31+
32+
T -->|Yes| V[Set grid_n_jobs = 1]
33+
T -->|No| W[Use default grid_n_jobs]
34+
35+
V --> X[Use parameter_space as-is]
36+
W --> X
37+
U --> Y[Get validated parameters]
38+
39+
X --> Z{bayessearch = True?}
40+
Y --> AA{random_search = True?}
41+
42+
Z -->|True| BB[BayesSearchCV]
43+
AA -->|True| CC["Calculate n_iter<br/>min(max_iter, parameter_grid_size * sub_sample_pct / 100)"]
44+
AA -->|False| DD[GridSearchCV]
45+
46+
CC --> EE[RandomizedSearchCV]
47+
48+
BB --> FF["grid.fit(X_train, y_train)"]
49+
EE --> FF
50+
DD --> FF
51+
52+
FF --> GG[Return grid.best_estimator_]
53+
54+
style A fill:#e1f5fe
55+
style R fill:#e8f5e8
56+
style GG fill:#fff3e0
57+
style C fill:#ffebee
58+
style E fill:#ffebee
59+
style M fill:#ffebee

assets/hyperparameter_search.svg

Lines changed: 102 additions & 0 deletions
Loading

assets/impute_data_for_pipe.mmd

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
flowchart TD
2+
A[Input DataFrame] --> B[Drop Completely Empty Columns]
3+
B --> C[Separate Features X and Target y]
4+
C --> D[Train-Test Split]
5+
D --> E[Train-Validation Split]
6+
7+
E --> F[Identify Column Types]
8+
F --> G{Are there Numeric Columns?}
9+
10+
G -->|Yes| H[Process Each Numeric Column]
11+
G -->|No| M[Skip Numeric Imputation]
12+
13+
H --> I{Is Column Completely Empty?}
14+
I -->|Yes| J[Fill with 0]
15+
I -->|No| K[Apply Mean Imputation]
16+
17+
J --> L[Continue to Next Column]
18+
K --> L
19+
L --> N{More Numeric Columns?}
20+
N -->|Yes| H
21+
N -->|No| O
22+
23+
M --> O[Combine Train/Val/Test Sets]
24+
O --> P[Reset Indices]
25+
P --> Q[Combine Features and Target]
26+
Q --> R[Verify No NaN Values]
27+
R --> S[Return Final DataFrame]
28+
29+
T[save_missing_percentage Function] --> U[Calculate Missing Percentages]
30+
U --> V[Convert to Dictionary]
31+
V --> W[Save to Pickle File]
32+
W --> X[Return Dictionary]
33+
34+
style A fill:#e1f5fe
35+
style S fill:#c8e6c9
36+
style T fill:#fff3e0
37+
style X fill:#ffecb3
38+
style R fill:#ffcdd2

0 commit comments

Comments
 (0)