# ml_binary_classification_gridsearch_hyperOpt/config_single_run.yml at main · SamoraHunter/ml_binary_classification_gridsearch_hyperOpt · GitHub
# config_single_run.yml
# Configuration for a single, deterministic pipeline run for debugging and testing.

# Global settings for logging and error handling
global_params:
  # General verbosity level.
  # NOTE(review): the meaning of each level is defined by the consuming code,
  # which is not visible in this file — confirm before changing.
  verbose: 2
  # Verbosity for the scikit-learn search object (GridSearchCV, etc.). 0 is silent.
  search_verbose: 0
  # NOTE(review): presumably makes errors propagate instead of being swallowed —
  # confirm against the pipeline's error handling.
  error_raise: true
  # Set to true to see H2O's progress bars for parsing and model fitting.
  h2o_show_progress: false

# Experiment output settings
experiment:
  # Root directory under which all experiment results are stored
  # (given relative to the project root).
  experiments_base_dir: 'experiments'
  # Optional suffix used in the experiment directory name.
  additional_naming: 'SingleRunTest'

# Data pipeline settings
data:
  # Dataset location, given relative to the notebook's directory.
  file_path: 'synthetic_data_generated.csv'
  # Substrings used to identify the columns that should be dropped.
  drop_term_list:
    - 'chrom'
    - 'hfe'
    - 'phlebo'
  # Overrides the outcome variable. Set this to null, or remove the key,
  # to fall back to 'outcome_var_n' from run_params.
  outcome_var_override: 'outcome_var_1'

# Model settings
models:
  # This dictionary toggles which model classes will be loaded for the single run.
  # Each key is a model class name: true loads it, false leaves it out.
  LogisticRegression: true
  RandomForestClassifier: true
  XGB_class: false
  # H2O-prefixed model classes.
  H2O_class: false # H2O AutoML
  H2O_GBM_class: true
  H2O_DRF_class: true
  H2O_DeepLearning_class: true
  H2O_GLM_class: true
  H2O_NaiveBayes_class: true
  H2O_RuleFit_class: true
  H2O_XGBoost_class: true
  H2O_StackedEnsemble_class: false
  H2O_GAM_class: true
  # Remaining model classes; all are switched off in this configuration.
  knn__gpu_wrapper_class: false
  TabPFNClassifierClass: false # requires hf token and agreement
  AutoGluonClassifierClass: false
  TPOTClassifierClass: false
  FLAMLClassifierClass: false
  AutoKerasClassifierClass: false

# Time-series models for a single run.
# These are only loaded if time_series_mode is enabled in the pipeline.
# Uses the same true/false toggle-per-class layout as the 'models' section.
ts_models:
  RocketClassifier: true
  TimeSeriesForestClassifier: false
  KNeighborsTimeSeriesClassifier: false

# This section defines a single set of parameters for a standalone run.
# NOTE(review): the exact semantics of each value are defined by the consuming
# pipeline, which is not visible in this file; hedged notes below should be
# confirmed against it.
run_params:
  # Used as the outcome variable selector when data.outcome_var_override is
  # null or absent (per the comment on that key). In this file the override
  # is set, so this value is presumably ignored — confirm.
  outcome_var_n: 1
  # Named size of the hyperparameter search space.
  # NOTE(review): the set of accepted names is not visible here.
  param_space_size: 'xsmall'
  # NOTE(review): presumably enables feature scaling — confirm.
  scale: true
  # NOTE(review): presumably a feature count or percentage to retain — confirm units.
  feature_n: 70
  # Embedding options.
  # NOTE(review): the three settings after use_embedding presumably only take
  # effect when use_embedding is true — confirm.
  use_embedding: false
  embedding_method: 'pca'
  embedding_dim: 10
  scale_features_before_embedding: true
  # NOTE(review): presumably a missing-data threshold expressed as a percentage — confirm.
  percent_missing: 80
  # NOTE(review): presumably a correlation threshold — confirm.
  corr: 0.9
  feature_selection_method: 'anova'
  # NOTE(review): presumably the fraction of data held out for testing — confirm.
  test_size: 0.2
  # NOTE(review): presumably the random seed for this run — confirm.
  random_state: 42
  # The 'data' sub-dictionary toggles feature groups on and off.
  data:
    age: true
    sex: true
    bmi: true
    ethnicity: true
    bloods: true
    diagnostic_order: true
    drug_order: true
    annotation_n: true
    meta_sp_annotation_n: true
    annotation_mrc_n: true
    meta_sp_annotation_mrc_n: true
    core_02: true
    bed: true
    vte_status: true
    hosp_site: true
    core_resus: true
    news: true
    date_time_stamp: false
    appointments: false