APT-Detection-System/main.py at main · Ap6pack/APT-Detection-System · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import os
import yaml
import logging
import argparse
from threading import Thread
from data_preprocessing import preprocess
from feature_selection import hhosssa_feature_selection
from data_balancing import hhosssa_smote
from models import train_models
from evaluation import evaluation_metrics
from real_time_detection import data_ingestion, prediction_engine
from dashboard import app

# Optional dependency: the simulation package may be absent from a deployment.
# `simulation_available` records whether the import worked so later code can
# check the flag instead of touching a name that was never bound.
try:
    from simulation import SecurityEventSimulator
except ImportError:
    simulation_available = False
else:
    simulation_available = True

# Configure logging: INFO and above go to both the apt_detection.log file
# (opened relative to the current working directory) and a stream handler.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler("apt_detection.log"),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_log_handlers)

def load_config():
    """Load configuration from config.yaml file.

    The file is looked up at ``config/config.yaml`` relative to the directory
    containing this module.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (``None`` for an
        empty file).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        yaml.YAMLError: If the content is not valid YAML.
    """
    config_path = os.path.join(os.path.dirname(__file__), 'config', 'config.yaml')
    # Decode explicitly as UTF-8 rather than the platform default encoding, so
    # non-ASCII values load the same way on every OS/locale.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

def run_data_ingestion(config):
    """Drive the real-time data ingestion step and log its start and end.

    Args:
        config: Accepted so the signature matches the other ``run_*`` helpers;
            this function does not read it.
    """
    logging.info("Starting real-time data ingestion...")
    # Delegates entirely to the ingestion module; the completion message below
    # is only reached once that call returns.
    data_ingestion.run()
    logging.info("Real-time data ingestion and prediction setup completed.")

def initialize_baselines():
    """Initialize baseline models if they don't exist.

    Creates a ``PredictionEngine`` and inspects
    ``engine.behavioral_analytics.baseline_models``. When that value is falsy,
    the engine is asked to establish baselines over ``days=7``.

    Errors raised while establishing baselines are logged (with traceback) and
    swallowed, so the caller can continue starting other components.
    """
    logging.info("Checking for baseline models...")
    engine = prediction_engine.PredictionEngine()

    # Nothing to do when baselines are already present.
    if engine.behavioral_analytics.baseline_models:
        logging.info("Baseline models already exist.")
        return

    logging.info("No baseline models found. Establishing baselines...")
    try:
        engine.establish_baseline(days=7)
    except Exception as e:
        # Best-effort: keep going, but record the traceback so the failure can
        # be diagnosed from the log (matches the top-level error handler).
        logging.error(f"Failed to establish baseline models: {e}", exc_info=True)
    else:
        logging.info("Baseline models established successfully.")

def run_dashboard(config):
    """Start the dashboard app with the settings under ``config['dashboard']``.

    Args:
        config: Mapping whose ``'dashboard'`` entry provides the ``'host'``,
            ``'port'`` and ``'debug'`` values forwarded to ``app.run``.
    """
    logging.info("Starting dashboard...")
    launch = app.run
    settings = config['dashboard']
    # Forward exactly the three settings the dashboard is started with.
    launch(**{option: settings[option] for option in ('host', 'port', 'debug')})

def run_simulation(config):
    """Run the security event simulation system (if enabled in config).

    Args:
        config: Configuration mapping. The simulation runs only when
            ``config['simulation']['enabled']`` is truthy; a missing or empty
            (null) ``simulation`` section counts as disabled.

    Returns:
        The started ``SecurityEventSimulator``, or ``None`` when the simulation
        module is unavailable, the simulation is disabled, or startup failed.
    """
    if not simulation_available:
        logging.warning("Simulation module not available. Skipping.")
        return None

    # `or {}` also covers a `simulation:` key with no value (parsed as None),
    # which would otherwise raise AttributeError on `.get`.
    sim_settings = config.get('simulation') or {}
    if not sim_settings.get('enabled', False):
        logging.info("Simulation is disabled in configuration. Skipping.")
        return None

    logging.info("Starting security event simulation...")
    try:
        simulator = SecurityEventSimulator()
        simulator.start()
    except Exception as e:
        # Best-effort start: report the failure with its traceback and signal
        # it to the caller through the None return value.
        logging.error(f"Failed to start security event simulation: {e}", exc_info=True)
        return None

    logging.info("Security event simulation started successfully.")
    return simulator

def parse_arguments():
    """Build the command-line parser and parse the process arguments.

    Every option is a boolean switch that defaults to ``False``.

    Returns:
        argparse.Namespace with the attributes ``train``, ``predict``,
        ``dashboard``, ``simulation``, ``production`` and ``all``.
    """
    switches = (
        ('--train', 'Train models'),
        ('--predict', 'Run prediction engine'),
        ('--dashboard', 'Run dashboard'),
        ('--simulation', 'Run security event simulation'),
        ('--production', 'Run in production mode (no simulation)'),
        ('--all', 'Run all components'),
    )

    cli = argparse.ArgumentParser(description='APT Detection System')
    for flag, description in switches:
        cli.add_argument(flag, action='store_true', help=description)
    return cli.parse_args()

def _train_and_evaluate(config):
    """Run preprocessing, feature selection, balancing, training and evaluation.

    Args:
        config: Configuration mapping; ``config['data_paths']['dataset']`` is
            resolved relative to the current working directory.

    Returns:
        dict with the freshly trained models under the keys ``'lgbm_model'``
        and ``'bilstm_model'``.
    """
    # Load and preprocess data
    logging.info("Starting data preprocessing...")
    dataset_path = os.path.join(os.getcwd(), config['data_paths']['dataset'])
    df = preprocess.run(dataset_path)
    logging.info("Data preprocessing completed.")

    # Feature selection
    logging.info("Starting feature selection...")
    selected_features = hhosssa_feature_selection.run(df)
    logging.info("Feature selection completed.")

    # Data balancing
    logging.info("Starting data balancing...")
    balanced_data = hhosssa_smote.run(selected_features)
    logging.info("Data balancing completed.")

    # Train models (also persisted, via save=True)
    logging.info("Starting model training...")
    lgbm_model, bilstm_model, hybrid_model = train_models.run(balanced_data, save=True)
    logging.info("Model training completed.")

    # Evaluate the hybrid model on the balanced data
    logging.info("Starting model evaluation...")
    accuracy, roc_auc = evaluation_metrics.evaluate(hybrid_model, balanced_data)
    logging.info(f"Model evaluation completed with Accuracy: {accuracy}, ROC-AUC: {roc_auc}")

    return {'lgbm_model': lgbm_model, 'bilstm_model': bilstm_model}


def main():
    """Start the components selected on the command line.

    With no component flag, or with ``--production``, the prediction engine and
    dashboard are started and simulation is forced off. ``--train`` / ``--all``
    run the training pipeline first and hand the fresh models to the
    prediction engine. All background work runs in daemon threads; the main
    thread blocks on the dashboard thread (when there is one), so the process
    ends when the dashboard stops or the user interrupts it.
    """
    try:
        # Parse the command line before touching the config file, so --help
        # and usage errors work even when config.yaml is missing or invalid.
        args = parse_arguments()
        config = load_config()

        # If no arguments provided, run in production mode
        if not (args.train or args.predict or args.dashboard or args.simulation or args.all):
            args.production = True

        # Production mode overrides
        if args.production:
            args.predict = True
            args.dashboard = True
            args.simulation = False
            # Ensure simulation is disabled in config (a null section has
            # nothing to disable).
            if config.get('simulation') is not None:
                config['simulation']['enabled'] = False
            logging.info("Running in production mode (real data sources, no simulation)")

        # Train models if requested; otherwise the engine loads saved models.
        models = None
        if args.train or args.all:
            models = _train_and_evaluate(config)

        # Production mode has already switched predict and dashboard on above.
        want_prediction = args.predict or args.all
        want_dashboard = args.dashboard or args.all

        # Initialize baseline models if needed
        if want_prediction or want_dashboard:
            initialize_baselines()

        # Run prediction engine if requested
        if want_prediction:
            # Real-time detection setup
            ingestion_thread = Thread(target=run_data_ingestion, args=(config,))
            ingestion_thread.daemon = True
            ingestion_thread.start()

            # Start prediction engine
            logging.info("Starting prediction engine...")
            try:
                if models:
                    # Use freshly trained models
                    prediction_engine.run(models, use_saved_models=False)
                else:
                    # Load models from disk
                    prediction_engine.run(use_saved_models=True)
                logging.info("Prediction engine started successfully.")
            except Exception as e:
                # Continue with other components even if prediction engine
                # fails, but keep the traceback in the log.
                logging.error(f"Failed to start prediction engine: {e}", exc_info=True)

        # Run dashboard if requested
        dashboard_thread = None
        if want_dashboard:
            dashboard_thread = Thread(target=run_dashboard, args=(config,))
            # Daemon like the other workers: otherwise, after Ctrl+C the
            # interpreter would wait for this thread and the process would
            # keep running despite the "terminated by user" log message.
            dashboard_thread.daemon = True
            dashboard_thread.start()

        # Run simulation if requested and not in production mode
        if (args.simulation or args.all) and not args.production:
            # Check if simulation is enabled in config (missing/null => disabled)
            sim_settings = config.get('simulation') or {}
            if sim_settings.get('enabled', False):
                # Start simulation in a separate daemon thread so it exits
                # when the main thread exits.
                simulation_thread = Thread(target=run_simulation, args=(config,))
                simulation_thread.daemon = True
                simulation_thread.start()
                logging.info("Simulation thread started")
            else:
                logging.warning("Simulation requested but disabled in config. Enable it in config.yaml to use simulation.")

        # Wait for dashboard thread to complete (main thread).
        # The ingestion and simulation threads are deliberately not joined: the
        # program should exit when the dashboard thread completes.
        if dashboard_thread:
            dashboard_thread.join()

    except KeyboardInterrupt:
        logging.info("Application terminated by user")
    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)


if __name__ == "__main__":
    main()