# Source: ml_binary_classification_gridsearch_hyperOpt/config_hyperopt.yml (main branch, SamoraHunter/ml_binary_classification_gridsearch_hyperOpt on GitHub)
# config_hyperopt.yml
# Configuration for a full hyperparameter optimization search using Hyperopt.

# Global settings for the hyperopt run
global_params:
  # Less verbose output, suited to multiple runs.
  verbose: 1
  # NOTE(review): the intent recorded for this key is that a single trial
  # failure should not stop the search, yet the value is `true`. Presumably
  # `true` re-raises trial errors and `false` lets the search continue;
  # confirm against the consumer and adjust the value if needed.
  error_raise: true
  # Set to true to see H2O's progress bars for parsing and model fitting.
  h2o_show_progress: false
  # Number of iterations for RandomizedSearchCV and BayesSearchCV.
  n_iter: 2
  max_param_space_iter_value: 10
  sub_sample_param_space_pct: 0.05
  # If true, forces a second cross-validation run even if cached results are
  # available. Defaults to false.
  force_second_cv: false
  # NOTE(review): presumably a time limit in seconds; confirm the unit.
  model_eval_time_limit: 3600
  # If true, uses minimal parameter spaces and reduced cross-validation for
  # fast testing.
  test_mode: true

# Experiment settings for the hyperopt run
experiment:
  # Root directory under which all experiment results are stored.
  experiments_base_dir: 'HFE_ML_experiments'
  # Descriptive name for this experiment batch (left as an empty string here).
  additional_naming: ''

# Data settings for the hyperopt run
data:
  # Location of the dataset file.
  file_path: 'synthetic_data_generated.csv'
  # Substrings used to identify the columns that should be dropped.
  drop_term_list:
    - 'chrom'
    - 'hfe'
    - 'phlebo'
  # When true, iterate over every 'outcome_var_' column found in the dataset.
  multiple_outcomes: true

# Models to be used during the hyperopt search.
# Each key names a model and maps to a boolean toggle; entries set to false
# are switched off in this configuration.
models:
  LogisticRegressionClass: true
  KNeighborsClassifierClass: true
  QuadraticDiscriminantAnalysisClass: true
  SVCClass: true
  XGBClassifierClass: true
  MLPClassifierClass: true
  RandomForestClassifierClass: true
  GradientBoostingClassifierClass: true
  CatBoostClassifierClass: true
  GaussianNBClassifierClass: true
  H2O_class: false # H2O AutoML
  H2O_GBM_class: true
  H2O_DRF_class: true
  H2O_DeepLearning_class: true
  H2O_GLM_class: true
  H2O_NaiveBayes_class: true
  H2O_RuleFit_class: true
  H2O_XGBoost_class: true
  H2O_StackedEnsemble_class: false
  H2O_GAM_class: true
  LightGBMClassifierWrapper: true # noqa
  AdaBoostClassifierClass: true
  NeuralNetworkClassifier_class: true # noqa
  TabTransformerClass: false
  # Set the following to true if a GPU is available and configured
  kerasClassifier_class: false
  knn__gpu_wrapper_class: false
  # Additional models, all switched off in this configuration
  TabPFNClassifierClass: false # requires hf token and agreement
  AutoGluonClassifierClass: false # disabled by default
  TPOTClassifierClass: false # disabled by default
  FLAMLClassifierClass: false # disabled by default
  AutoKerasClassifierClass: false # disabled by default


# Time-series models to be used during the hyperopt search.
# These are only loaded if time_series_mode is enabled in the pipeline.
# Each key maps a model name to a boolean toggle.
ts_models:
  # Enabled by default, except where an inline comment gives the reason for
  # disabling an entry.
  RocketClassifier: true
  TimeSeriesForestClassifier: true
  KNeighborsTimeSeriesClassifier: true
  Catch22Classifier: true
  HIVECOTEV2: false # Disabled due to internal numba typing error in aeon library
  TSFreshClassifier: true
  Arsenal: true
  CNNClassifier: true
  ElasticEnsemble: true
  EncoderClassifier: true
  FCNClassifier: true
  FreshPRINCEClassifier: true
  InceptionTimeClassifier: false # Disabled due to Keras input shape mismatch error
  IndividualInceptionClassifier: false # Disabled due to Keras input shape mismatch error
  IndividualTDE: true
  MLPClassifier: true
  MUSE: true
  OrdinalTDE: true
  ResNetClassifier: true
  SignatureClassifier: true
  SummaryClassifier: true
  TemporalDictionaryEnsemble: true
  # Univariate models, disabled by default
  ContractableBOSS: false
  HIVECOTEV1: false

# This section defines the parameter search space for Hyperopt.
# The structure uses lists of options, which will be parsed into hp.choice.
# A single-element list therefore leaves only one option for that parameter.
hyperopt_search_space:
  # NOTE(review): 'null' is a quoted string here, not a YAML null; confirm
  # that the consumer treats the string as "no resampling".
  resample: ['undersample', 'oversample', 'null']
  scale: [true, false]
  feature_n: [100, 95, 75, 50, 25, 5]
  param_space_size: ['medium', 'xsmall']
  percent_missing: [99, 95, 80]
  correlation_threshold: [0.98, 0.85, 0.5, 0.25]
  feature_selection_method: ['anova', 'markov_blanket']
  # Default, will be overridden if multiple_outcomes is true.
  # The value is quoted, so it is the string '1' rather than an integer.
  outcome_var_n: ['1']

  # Embedding hyperparameters
  use_embedding: [true, false]
  embedding_method: ['pca', 'svd']
  embedding_dim: [32, 64, 128]
  scale_features_before_embedding: [true, false]

  # Feature group toggles. Booleans are written in lowercase throughout so
  # that every entry uses the same canonical spelling.
  data:
    age: [true, false]
    sex: [true, false]
    bmi: [true]
    ethnicity: [true, false]
    bloods: [true, false]
    diagnostic_order: [true, false]
    drug_order: [true, false]
    annotation_n: [true, false]
    meta_sp_annotation_n: [true, false]
    annotation_mrc_n: [true, false]
    meta_sp_annotation_mrc_n: [true, false]
    core_02: [false]
    bed: [false]
    vte_status: [true]
    hosp_site: [true]
    core_resus: [false]
    news: [false]
    date_time_stamp: [false]
    appointments: [false]

# Hyperopt-specific settings
hyperopt_settings:
  # Number of iterations per outcome variable.
  max_evals: 2
  # Timeout in seconds for a full trial (data prep + all models).
  trial_timeout: 1120